# Pandas

Pandas provides two primary data structures: the Series (one-dimensional) and the DataFrame (two-dimensional).

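Pandas can handle any type of data (int, string, bool, etc.), and a DataFrame can mix types across columns. A NumPy ndarray, by contrast, is homogeneous: every element shares a single dtype (not only int; float, bool, string and object dtypes are supported too).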


## Series

A Series represents a one-dimensional labeled array built on a NumPy ndarray. A NumPy array can only be indexed by integer position, whereas a Series additionally carries an explicit index of labels.


Like an array, a series can hold zero or more values of any single data type.


We can create a series by passing a scalar value, a NumPy ndarray, a Python list, or a Python dictionary.

In [127]:
import numpy as np 
import pandas as pd

In [128]:
#Create a one-item series from a scalar value
s1 = pd.Series(2)
s1

0    2
dtype: int64

In [129]:
#Create a multi-item series from a list
s2 = pd.Series([1,2,3,4,5])
s2

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [130]:
#Get the values in the series
s2.values

array([1, 2, 3, 4, 5], dtype=int64)

In [131]:
#Get the index of the series
s2.index

RangeIndex(start=0, stop=5, step=1)

In [132]:
#Explicitly create an index
#index of alphabetic labels, not integers
s3 = pd.Series([1,2,3], index=['a','b','c'])
s3

a    1
b    2
c    3
dtype: int64

In [133]:
#Print value by its integer position as well as by its label index
print(f"Value by label index 's3['c']' is {s3['c']} and value by integer position 's3.iloc[2]' is {s3.iloc[2]}")

Value by label index 's3['c']' is 3 and value by integer position 's3.iloc[2]' is 3


In [134]:
#Create series from an existing index
#NOTE: the number of values must equal the length of the existing index
s4 = pd.Series(['A','B','C','D','E'], index=s2.index)
s4 

0    A
1    B
2    C
3    D
4    E
dtype: object

In [135]:
#Create a series from a dictionary
s4 = pd.Series({
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4})
s4

a    1
b    2
c    3
d    4
dtype: int64

In [136]:
s5 = pd.Series(np.array([22,33,44,55,66]))
s5

0    22
1    33
2    44
3    55
4    66
dtype: int32

# Size, Shape, Uniqueness, and Counts of values


In [137]:
#Series with NaN values
s = pd.Series([0,1,1,2,3,4,4,5,6,7,np.nan])
s

0     0.0
1     1.0
2     1.0
3     2.0
4     3.0
5     4.0
6     4.0
7     5.0
8     6.0
9     7.0
10    NaN
dtype: float64

In [138]:
print(len(s))       #Return the number of elements (NaN included)
print(s.size)       #Same as len(s) for a Series
print(s.shape)      #Return the shape as a tuple, e.g. (11,)
print(s.count())    #Return the number of non-NaN values
print(s.unique())   #Return the distinct values (duplicates removed, NaN kept)
print(s.value_counts()) #Return how often each value occurs (NaN excluded by default)

11
11
(11,)
10
[ 0.  1.  2.  3.  4.  5.  6.  7. nan]
4.0    2
1.0    2
7.0    1
6.0    1
5.0    1
3.0    1
2.0    1
0.0    1
dtype: int64


# Peeking at data with heads, tails and take

In [139]:
#Get first five
s.head()

0    0.0
1    1.0
2    1.0
3    2.0
4    3.0
dtype: float64

In [140]:
#Get last five
s.tail()

6     4.0
7     5.0
8     6.0
9     7.0
10    NaN
dtype: float64

In [141]:
#Get last three
s.tail(n=3)         #equivalent to s.tail(3)

8     6.0
9     7.0
10    NaN
dtype: float64

In [142]:
#Get the values at the given positions (any order, repeats allowed)
s.take([9,3,9])

9    7.0
3    2.0
9    7.0
dtype: float64

# Looking up values in series

In [143]:
#Get single item lookup
print(s3)
s3['a']

a    1
b    2
c    3
dtype: int64


1

In [144]:
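#Get single item by position: the index is not an integer index, so s3[1] falls back to positional lookup
#(this fallback is deprecated in recent pandas; s3.iloc[1] is the explicit form)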
s3[1]

2

In [145]:
#Get multiple items
s3[['c','a']]

c    3
a    1
dtype: int64

In [146]:
#Series with an integer index, but not starting with 0
s5 = pd.Series([1,2,3], index=[2,3,4])
s5

2    1
3    2
4    3
dtype: int64

# Label-based lookup vs Position based lookup

In [147]:
#2 is treated as a label here (label-based lookup), not as a position
s5[2]

1

In [153]:
#loc always performs label-based lookup
s5.loc[2]

1

In [161]:
#iloc always performs position-based lookup
s5.iloc[0]
#s5.iloc[2]

1

In [155]:
#multiple items by label 
s5.loc[[4,3]]

4    3
3    2
dtype: int64

In [170]:
s5.iloc[[0,2]]

2    1
4    3
dtype: int64

# Alignment via index label 

In [173]:
s6 = pd.Series([1,2,3,4], index=['a', 'b', 'c', 'd'])
s6

a    1
b    2
c    3
d    4
dtype: int64

In [174]:
s7 = pd.Series([4,3,2,1], index=['d', 'c', 'b', 'a'])
s7

d    4
c    3
b    2
a    1
dtype: int64

In [178]:
s6 + s7

a    2
b    4
c    6
d    8
dtype: int64

In [179]:
#NAN + number = NAN
#number + NAN = NAN

s8 = pd.Series({'a':1, 'b':2, 'c':3, 'd':5})
s8

a    1
b    2
c    3
d    5
dtype: int64

In [180]:
s9 = pd.Series({'b':6, 'c':7, 'd':9, 'e':10})
s9

b     6
c     7
d     9
e    10
dtype: int64

In [181]:
#NAN + number = NAN
#number + NAN = NAN

s8 + s9

a     NaN
b     8.0
c    10.0
d    14.0
e     NaN
dtype: float64

In [191]:
s10 = pd.Series([1.0, 2.0, 3.0], index=['a', 'a', 'b'])
s10

a    1.0
a    2.0
b    3.0
dtype: float64

In [192]:
s11 = pd.Series([4.0, 5.0, 6.0], index=['a', 'a', 'c'])
s11

a    4.0
a    5.0
c    6.0
dtype: float64

In [193]:
s10 + s11

a    5.0
a    6.0
a    6.0
a    7.0
b    NaN
c    NaN
dtype: float64

# Special case of NAN

In [199]:
nda = np.array([1,2,3,4,5])
nda.mean()

3.0

In [200]:
#NumPy does not skip NaN: any arithmetic involving NaN yields NaN, so the mean is NaN
nda = np.array([1,2,3,4,np.nan])
nda.mean()

nan

In [202]:
#Pandas ignores NaN values by default (skipna=True)
s = pd.Series(nda)
s.mean()

2.5

In [213]:
#skipna=True (the default) skips NaN values; pass skipna=False to propagate NaN like NumPy
s.mean(skipna=True)

2.5

In [212]:
s.mean(skipna=False)

nan

# Boolean Selection

In [222]:
#which rows have values that are > 5
s = pd.Series(np.arange(0,10))
s > 5

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool

In [223]:
s[s > 5]

6    6
7    7
8    8
9    9
dtype: int32

In [226]:
#select rows where values are > 5
logicalResults = s > 5
s[logicalResults]

6    6
7    7
8    8
9    9
dtype: int32

In [227]:
#select rows where values are > 5 (shorter version)
s[s > 5]

6    6
7    7
8    8
9    9
dtype: int32

In [228]:
s[(s > 5) & (s < 8)]

6    6
7    7
dtype: int32

In [230]:
pd.Series([True, False, False, True, True]).all(), pd.Series([True, False, False, True, True]).any()

(False, True)

In [231]:
np.array([1,0,1,1]).sum()

3

In [233]:
np.array([True, True, False, True, True]).sum()

4

In [240]:
#are all items >= 0
(s >= 0).all()

True

In [239]:
s < 2

0     True
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [241]:
#any items < 2 (test the boolean mask itself, not the truthiness of the selected values)
(s < 2).any()

True

In [242]:
#how many values < 2
(s < 2).sum()

2

# Reindexing in Series

- Reordering existing data to match a set of labels
- Inserting NAN marker where no data exists for a label
- Possibly, Filling missing data for a label using some type of logic (defaulting to adding NAN values)

In [243]:
#Sample series of five items
s = pd.Series(np.random.randn(5))
s

0   -0.603751
1    0.088512
2   -1.973457
3   -2.225498
4    0.993273
dtype: float64

In [244]:
#Replace the index labels in place (relabeling; the values keep their order)
s.index = ['a','b','c','d','e']
s

a   -0.603751
b    0.088512
c   -1.973457
d   -2.225498
e    0.993273
dtype: float64

In [246]:
#Concatenation keeps each series' own index labels, so labels can repeat
np.random.seed(123456)

s1 = pd.Series(np.random.randn(3))
s2 = pd.Series(np.random.randn(3))

combined = pd.concat([s1, s2])
combined

0    0.469112
1   -0.282863
2   -1.509059
0   -1.135632
1    1.212112
2   -0.173215
dtype: float64

In [247]:
#Reset the index so that duplication of index may be removed (dynamic way)
combined.index = np.arange(0, len(combined))
combined

0    0.469112
1   -0.282863
2   -1.509059
3   -1.135632
4    1.212112
5   -0.173215
dtype: float64

In [248]:
#Reset the index so that duplication of index may be removed (static way)
combined.index = [11,22,33,44,55,66]
combined

11    0.469112
22   -0.282863
33   -1.509059
44   -1.135632
55    1.212112
66   -0.173215
dtype: float64

In [250]:
np.random.seed(123456)
s1 = pd.Series(np.random.randn(4), index=['a','b','c','d'])
s1

s2 = s1.reindex(['a', 'c', 'g'])
s2

#1 - reindex() does not re-index in place; it returns a new series and the original is not modified.
#2 - any requested label that does not match the previous index is assigned NaN.
#3 - any label of the previous index that is not listed in the re-index is left out of the new series.

a    0.469112
c   -1.509059
g         NaN
dtype: float64

In [253]:
combined.reindex([9,5,3,4,0,1,6,11])

9          NaN
5          NaN
3          NaN
4          NaN
0          NaN
1          NaN
6          NaN
11    0.469112
dtype: float64

In [252]:
combined            #The original series is unchanged (reindex returned a new object)

11    0.469112
22   -0.282863
33   -1.509059
44   -1.135632
55    1.212112
66   -0.173215
dtype: float64

In [254]:
#Different types for the same values of labels
s1 = pd.Series([0,1,2], index=[0,1,2])
s2 = pd.Series([3,4,5], index=['0', '1', '2'])

s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [255]:
#reindex by casting the label types
s2.index = s2.index.astype(int)
s1 + s2 

0    3
1    5
2    7
dtype: int64

In [258]:
#Fill with 0 instead of NAN
s2 = s.copy()
s2

a   -0.603751
b    0.088512
c   -1.973457
d   -2.225498
e    0.993273
dtype: float64

In [259]:
s2_reindex = s2.reindex(['a','f'], fill_value=0)
s2_reindex

a   -0.603751
f    0.000000
dtype: float64

In [261]:
s2_reindex = s2.reindex(['a','f'], fill_value=np.mean([1,2,3,4,5,6,7]))
s2_reindex

a   -0.603751
f    4.000000
dtype: float64

# ffill, bfill and nearest

In [262]:
s3 = pd.Series(['red','green','blue'], index=[0,8,10])
s3

0       red
8     green
10     blue
dtype: object

In [264]:
#ffill = forward fill
s3.reindex(np.arange(0,15), method='ffill')

0       red
1       red
2       red
3       red
4       red
5       red
6       red
7       red
8     green
9     green
10     blue
11     blue
12     blue
13     blue
14     blue
dtype: object

In [267]:
#bfill = backward fill
s3.reindex(np.arange(0,15), method='bfill')

0       red
1     green
2     green
3     green
4     green
5     green
6     green
7     green
8     green
9      blue
10     blue
11      NaN
12      NaN
13      NaN
14      NaN
dtype: object

In [269]:
#nearest = nearest fill
s3.reindex(np.arange(0,10), method='nearest')

0      red
1      red
2      red
3      red
4    green
5    green
6    green
7    green
8    green
9     blue
dtype: object

# Slicing = [startOfRow:endOfRow:StepValue]

In [271]:
print(s[0:6:2])

a   -0.603751
c   -1.973457
e    0.993273
dtype: float64


In [274]:
s.iloc[[0,2,4]]  #equivalent

a   -0.603751
c   -1.973457
e    0.993273
dtype: float64

In [275]:
#First five elements by slicing, same as .head(5)
s[:5]

a   -0.603751
b    0.088512
c   -1.973457
d   -2.225498
e    0.993273
dtype: float64

# Missing Data in the Series

In [276]:
sdata = {'Ohio': 3500, 'Texas': 71000, 'Oregon': 16000, 'Utah':5000}
obj3 = pd.Series(sdata)
obj3

Ohio       3500
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [279]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio           3500.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [280]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [281]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [284]:
#Give the series and its index a name
obj4.name = "Population"
obj4.index.name = "State"
obj4

State
California        NaN
Ohio           3500.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64