In [1]:
# Keep all imports together in the first cell, even though Python allows importing anywhere
import numpy as np
import pandas as pd

In [3]:
print(dir(pd))

['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_versio

## Series

In [78]:
# A Series is a labelled 1-D array backed by a numpy ndarray.
# s1 carries an explicit 0..2 integer index; s2 carries string labels and one NaN.
s1 = pd.Series([1, 2, 3], index=range(3))
s2 = pd.Series([1, 2, 3, np.nan], index=list("abcd"))
print(s1, s2, sep="\n\n")

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [9]:
print(type(s1))
print(s1.index)
print(s1.values, type(s1.values))

<class 'pandas.core.series.Series'>
RangeIndex(start=0, stop=3, step=1)
[1 2 3] <class 'numpy.ndarray'>


In [21]:
# changing dtype: astype
print(s1.dtype)
r = s1.astype("unicode")
print(r, r.dtype)
print(r.values.dtype)

r = s1.astype("float64")
print(r)

print()
print(s1)

print()
r = pd.Series(s1, dtype="<U2")
print(r)

int64
0    1
1    2
2    3
dtype: object object
object
0    1.0
1    2.0
2    3.0
dtype: float64

0    1
1    2
2    3
dtype: int64

0    1
1    2
2    3
dtype: object


In [22]:
# shape, size, dtype, index, values
print(s1.shape)
print(s1.size)
print(s1.dtype)

(3,)
3
int64


In [26]:
# aggregate operations: min, max, unique, sum etc.
print(s1.unique())
print(s2.min())
print(s2.max())

[1 2 3]
1.0
3.0


In [29]:
# arithmetic operations(new copy) and broadcast like numpy
print(s1 * 3) # with scalar
print(s1)


0    3
1    6
2    9
dtype: int64
0    1
1    2
2    3
dtype: int64


In [30]:
print(s1 + s2)

0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64


In [31]:
s3 = pd.Series([10,20,30], index = (1,2,4))
print(s1)
print(s3)
print()
print(s1 + s3)

0    1
1    2
2    3
dtype: int64
1    10
2    20
4    30
dtype: int64

0     NaN
1    12.0
2    23.0
4     NaN
dtype: float64


In [42]:
# indexing and slicing
print(s1)
print()
print(s1[1:])

print()
print(s1[:2])

print()
print(s1[2])

0     1
1    20
2    20
dtype: int64

1    20
2    20
dtype: int64

0     1
1    20
dtype: int64

20


In [39]:
s1[2] = 10
print(s1)

s1[1:] = 20
print(s1)

0     1
1     2
2    10
dtype: int64
0     1
1    20
2    20
dtype: int64


In [43]:
print(s1[:-1])

0     1
1    20
dtype: int64


In [48]:
# Positional lookup goes through .iloc: on a label-indexed Series, s2[1]
# depends on the deprecated integer-position fallback of [] and would be
# treated as a (missing) label once that fallback is removed.
print(s2.iloc[1])  # position 1 -> 2.0
print(s2['a'])  # label 'a' -> 1.0

2.0
1.0


In [54]:
# use the user assigned index
print(s2.loc['a'])
print(s2.loc['a':'c'])
#print(s2.loc[1])

1.0
a    1.0
b    2.0
c    3.0
dtype: float64


In [60]:
# use the positional (0-based) index with .iloc, regardless of the labels
print(s2.iloc[1:3])
# s2.iloc['a'] is not valid: .iloc does not accept labels -- use s2.loc['a'] for that

print()
print(s2.iloc[::-1])  # every element, in reverse order

print()
print(s2.iloc[-1])  # last element (NaN here)

print()
print(s2.iloc[-1:-3:-1])  # last two elements, last one first

b    2.0
c    3.0
dtype: float64

d    NaN
c    3.0
b    2.0
a    1.0
dtype: float64

nan

d    NaN
c    3.0
dtype: float64


In [73]:
# result of operations as index
print(s1)

print()
print(s1[ [2,1] ])
idx = [2,-1] # list of indexes to pick
print(s1.iloc[ idx ])

print()
r = s1 %2 == 0
print(s1[r])

0     1
1    20
2    20
dtype: int64

2    20
1    20
dtype: int64
2    20
2    20
dtype: int64

1    20
2    20
dtype: int64


In [81]:
# frequency : value_counts
r = s1.value_counts()
print(r)

print()
s4 = pd.Series([10,20,10,10,30,20])
s4.value_counts()

3    1
2    1
1    1
dtype: int64



10    3
20    2
30    1
dtype: int64

## DataFrame

In [83]:
# Two ways to build a DataFrame.
# df1: dict of column name -> list of values; np.nan marks a missing entry.
df1 = pd.DataFrame(dict(
    Name=["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1", "na-2", np.nan],
    Age=[0, 1, 2, 2, 1, 2, 3, 1, np.nan, 2],
    Gender=[0, 0, 0, 0, 1, 1, 1, np.nan, 0, 0],
))

# df2: list of row tuples, with explicit (non-sequential) row labels and column names.
df2 = pd.DataFrame(
    data=[("Gaurav", 1), ("Abhiskek", 2), ("Krishna", 3), ("Abhishek2", 4)],
    columns=["Name", "Number"],
    index=[0, 11, 2, 3],
)
print(df1, df2, sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [89]:
# shape, size, dtype, index, columns, unique, T
print(df1.shape)
print(df1.size)

print()
print(df1.dtypes, type(df1.dtypes))

print()
print(df1.index)

print()
print(df1.columns)

print()
print(df1.T)

(10, 3)
30

Name       object
Age       float64
Gender    float64
dtype: object <class 'pandas.core.series.Series'>

RangeIndex(start=0, stop=10, step=1)

Index(['Name', 'Age', 'Gender'], dtype='object')

             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


In [93]:
print(df1.describe(include='all'))

             Name       Age    Gender
count           9  9.000000  9.000000
unique          9       NaN       NaN
top     Abhishek2       NaN       NaN
freq            1       NaN       NaN
mean          NaN  1.555556  0.333333
std           NaN  0.881917  0.500000
min           NaN  0.000000  0.000000
25%           NaN  1.000000  0.000000
50%           NaN  2.000000  0.000000
75%           NaN  2.000000  1.000000
max           NaN  3.000000  1.000000


In [None]:
['a','b','b', 'a', 'c', 'a']

In [100]:
print(df1.mode(axis=0))

        Name  Age  Gender
0  Abhishek1  2.0     0.0
1  Abhishek2  NaN     NaN
2     Gaurav  NaN     NaN
3   Harshita  NaN     NaN
4       Joey  NaN     NaN
5    Krishna  NaN     NaN
6     Shweta  NaN     NaN
7       na-1  NaN     NaN
8       na-2  NaN     NaN


In [97]:
print(s4)
print(s4.mode())

print()
print(s1)
print(s1.mode())

0    10
1    20
2    10
3    10
4    30
5    20
dtype: int64
0    10
dtype: int64

0    1
1    2
2    3
dtype: int64
0    1
1    2
2    3
dtype: int64


In [105]:
print(df1.head())  #give me the first five rows

print()
print(df1.head(2)) 

print()
print(df1.head(1)) 

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0

     Name  Age  Gender
0  Gaurav  0.0     0.0


In [106]:
print(df1.tail())  # give me the last five rows (the default when no count is passed)

print()
print(df1.tail(2)) 

print()
print(df1.tail(1)) 

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0

   Name  Age  Gender
8  na-2  NaN     0.0
9   NaN  2.0     0.0

  Name  Age  Gender
9  NaN  2.0     0.0


In [110]:
# Rename a column of df1 through a mapper dict applied along axis=1.
# Inplace vs normal operation: with inplace=True, rename mutates df1 itself
# and returns None, which is why printing r shows None.

r = df1.rename(mapper={"Name": "N"}, axis=1, inplace=True) # column names
print(r)
print()
print(df1.head())

None

           N  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0


In [111]:
r = df1.rename(columns={"N": "Name"}, inplace=True) # column names
print(r)
print()
print(df1.head())

None

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0


In [113]:
# Rename the row index with a callable mapper (axis=0).
# An in-place rename rewrites the *current* labels, so a bare multiply would
# compound on every execution of this cell. Resetting to the default 0..n-1
# index first makes the cell idempotent: the labels always end up as
# 0, 100, 200, ... which is what the later cells of the notebook display.
df1.reset_index(drop=True, inplace=True)
r = df1.rename(mapper=lambda x: x * 100, axis=0, inplace=True) # row  index
print(r)  # None: inplace=True returns nothing
print()
print(df1.head())

None

          Name  Age  Gender
0       Gaurav  0.0     0.0
100  Abhishek1  1.0     0.0
200    Krishna  2.0     0.0
300  Abhishek2  2.0     0.0
400   Harshita  1.0     1.0


In [120]:
# Indexing columns directly and via index. 
print(df1["Name"].head(2))

print()
print(df1.Name.head())

0         Gaurav
100    Abhishek1
Name: Name, dtype: object

0         Gaurav
100    Abhishek1
200      Krishna
300    Abhishek2
400     Harshita
Name: Name, dtype: object


In [123]:
df1.replace("Gaurav", "Gaurav-1", inplace=True)
print(df1)

          Name  Age  Gender
0     Gaurav-1  0.0     0.0
100  Abhishek1  1.0     0.0
200    Krishna  2.0     0.0
300  Abhishek2  2.0     0.0
400   Harshita  1.0     1.0
500       Joey  2.0     1.0
600     Shweta  3.0     1.0
700       na-1  1.0     NaN
800       na-2  NaN     0.0
900        NaN  2.0     0.0


In [126]:
# Map specific Age values to new ones and assign the result back to the column.
# Calling replace(..., inplace=True) on the df1.Age accessor is a chained
# in-place update: it is not guaranteed to write through to df1 and does not
# do so under pandas Copy-on-Write.
df1["Age"] = df1["Age"].replace({1.0: 10, 2.0: 200})
print(df1)

          Name    Age  Gender
0     Gaurav-1    0.0     0.0
100  Abhishek1   10.0     0.0
200    Krishna  200.0     0.0
300  Abhishek2  200.0     0.0
400   Harshita   10.0     1.0
500       Joey  200.0     1.0
600     Shweta    3.0     1.0
700       na-1   10.0     NaN
800       na-2    NaN     0.0
900        NaN  200.0     0.0


In [127]:
# Series in a DataFrame
print(type(df1.Name))

<class 'pandas.core.series.Series'>


In [None]:
# describe, top/bottom rows of data
# head and tail are methods: without the call parentheses the cell only
# evaluates the bound-method objects and never returns any rows.
print(df1.head())
print()
print(df1.tail())

In [None]:
# indexing and slicing
# loc, iloc [rows, cols]

In [129]:
print(df2)

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [148]:
print(df2.iloc[0])

print()
print(df2.iloc[ 0, :]) # row-0 col-all

print()
print(df2.iloc[0,1]) # row-0 col-1

print()
print(df2.iloc[ [1,2] ]) # row-selection

print()
print(df2.iloc[ [1,2], [0,1] ]) #  row and  columnn selection

Name      Gaurav
Number         1
Name: 0, dtype: object

Name      Gaurav
Number         1
Name: 0, dtype: object

1

        Name  Number
11  Abhiskek       2
2    Krishna       3

        Name  Number
11  Abhiskek       2
2    Krishna       3


In [157]:
r1 = df2  ==  2
r2 = df2  == "Krishna"
print(r1)
print(r2)

print()
r3  = r1 | r2
print(r3)

df2[r3]

     Name  Number
0   False   False
11  False    True
2   False   False
3   False   False
     Name  Number
0   False   False
11  False   False
2    True   False
3   False   False

     Name  Number
0   False   False
11  False    True
2    True   False
3   False   False


  res_values = method(rvalues)


Unnamed: 0,Name,Number
0,,
11,,2.0
2,Krishna,
3,,


In [168]:

print()
print(df2.loc[ [11,2] ]) # row-selection

print()
print(df2.loc[11])


print()
print(df2.loc[:, "Name"])

print()
print(df2.loc[11:, "Number"])

print()
print(df2.loc[11, "Name"])


        Name  Number
11  Abhiskek       2
2    Krishna       3

Name      Abhiskek
Number           2
Name: 11, dtype: object

0        Gaurav
11     Abhiskek
2       Krishna
3     Abhishek2
Name: Name, dtype: object

11    2
2     3
3     4
Name: Number, dtype: int64

Abhiskek


In [167]:
print(s4)
s4.name = 'S4'
print(s4)

0    10
1    20
2    10
3    10
4    30
5    20
dtype: int64
0    10
1    20
2    10
3    10
4    30
5    20
Name: S4, dtype: int64


In [175]:
# aggregate operations: min, max, count, ...
# axis=0 (the default) aggregates down each column -> one result per column;
# axis=1 aggregates across each row -> one result per row.
print(df1.shape)
print()
print(df1.count()) # non-NA values per column

print()
print(df1.count(axis=1)) # non-NA values per row

(10, 3)

Name      9
Age       9
Gender    9
dtype: int64

0      3
100    3
200    3
300    3
400    3
500    3
600    3
700    2
800    2
900    2
dtype: int64


In [177]:
# arithmetic operations
print(df1.head())
# Bracket assignment is the unambiguous way to write a column: attribute-style
# assignment creates a plain Python attribute instead of a column whenever the
# name is not already a column.
# Note: this adds 1 on every execution, so re-running the cell shifts Age again.
df1["Age"] = df1["Age"] + 1

print(df1.head())

          Name    Age  Gender
0     Gaurav-1    0.0     0.0
100  Abhishek1   10.0     0.0
200    Krishna  200.0     0.0
300  Abhishek2  200.0     0.0
400   Harshita   10.0     1.0
          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0


In [None]:
# Na Values: isna, fillna, dropna

In [179]:
# any, all
r = df1.isna()
print(r)

      Name    Age  Gender
0    False  False   False
100  False  False   False
200  False  False   False
300  False  False   False
400  False  False   False
500  False  False   False
600  False  False   False
700  False  False    True
800  False   True   False
900   True  False   False


In [183]:
print(r.any())
print()
print(r.any(axis=1))

Name      True
Age       True
Gender    True
dtype: bool

0      False
100    False
200    False
300    False
400    False
500    False
600    False
700     True
800     True
900     True
dtype: bool


In [191]:
print(df1.all(skipna=False))
print(df1)

Name       True
Age        True
Gender    False
dtype: bool
          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0
500       Joey  201.0     1.0
600     Shweta    4.0     1.0
700       na-1   11.0     NaN
800       na-2    NaN     0.0
900        NaN  201.0     0.0


In [192]:
print(df1.all(skipna=True))
print(df1)

Name       True
Age        True
Gender    False
dtype: bool
          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0
500       Joey  201.0     1.0
600     Shweta    4.0     1.0
700       na-1   11.0     NaN
800       na-2    NaN     0.0
900        NaN  201.0     0.0


In [189]:
print(1 and 0)
print(1 and 10)

print()
print(1 or 0)
print('' or 10 or 0)
print('' or 0.0)

0
10

1
10
0.0


In [193]:
# ordering data: sort_index orders the rows by their index labels
# (sort_values, used in the following cells, orders by column values instead)
df2.sort_index()

Unnamed: 0,Name,Number
0,Gaurav,1
2,Krishna,3
3,Abhishek2,4
11,Abhiskek,2


In [195]:
df2.sort_values(by='Name')

Unnamed: 0,Name,Number
3,Abhishek2,4
11,Abhiskek,2
0,Gaurav,1
2,Krishna,3


In [196]:
df1.sort_values(by='Name')

Unnamed: 0,Name,Age,Gender
100,Abhishek1,11.0,0.0
300,Abhishek2,201.0,0.0
0,Gaurav-1,1.0,0.0
400,Harshita,11.0,1.0
500,Joey,201.0,1.0
200,Krishna,201.0,0.0
600,Shweta,4.0,1.0
700,na-1,11.0,
800,na-2,,0.0
900,,201.0,0.0


In [197]:
df1.sort_values(by=["Age"])

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
600,Shweta,4.0,1.0
100,Abhishek1,11.0,0.0
400,Harshita,11.0,1.0
700,na-1,11.0,
200,Krishna,201.0,0.0
300,Abhishek2,201.0,0.0
500,Joey,201.0,1.0
900,,201.0,0.0
800,na-2,,0.0


In [201]:
df1.sort_values(by=["Age", "Name"])

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
600,Shweta,4.0,1.0
100,Abhishek1,11.0,0.0
400,Harshita,11.0,1.0
700,na-1,11.0,
300,Abhishek2,201.0,0.0
500,Joey,201.0,1.0
200,Krishna,201.0,0.0
900,,201.0,0.0
800,na-2,,0.0


In [202]:
df1.sort_values(by=["Age", "Name"], ignore_index=True)

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
1,Shweta,4.0,1.0
2,Abhishek1,11.0,0.0
3,Harshita,11.0,1.0
4,na-1,11.0,
5,Abhishek2,201.0,0.0
6,Joey,201.0,1.0
7,Krishna,201.0,0.0
8,,201.0,0.0
9,na-2,,0.0


In [199]:
print(df1)

          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0
500       Joey  201.0     1.0
600     Shweta    4.0     1.0
700       na-1   11.0     NaN
800       na-2    NaN     0.0
900        NaN  201.0     0.0


In [205]:
# row / column wise operation: apply
# Demo frame of 5x4 integers drawn from [10, 20). A fixed seed keeps the
# values identical on every Restart & Run All, so the outputs of the cells
# below stay reproducible.
tmp = pd.DataFrame(np.random.RandomState(42).randint(10, 20, size=(5, 4)))

In [206]:
print(tmp)

    0   1   2   3
0  18  16  11  18
1  18  18  10  19
2  19  14  10  10
3  17  19  11  11
4  13  17  16  18


In [207]:
print(tmp.apply(max)) # column

0    19
1    19
2    16
3    19
dtype: int64


In [214]:
print(tmp.apply(lambda x: np.sqrt(x)))

          0         1         2         3
0  4.242641  4.000000  3.316625  4.242641
1  4.242641  4.242641  3.162278  4.358899
2  4.358899  3.741657  3.162278  3.162278
3  4.123106  4.358899  3.316625  3.316625
4  3.605551  4.123106  4.000000  4.242641


In [208]:
print(tmp.apply(np.sqrt)) # column

          0         1         2         3
0  4.242641  4.000000  3.316625  4.242641
1  4.242641  4.242641  3.162278  4.358899
2  4.358899  3.741657  3.162278  3.162278
3  4.123106  4.358899  3.316625  3.316625
4  3.605551  4.123106  4.000000  4.242641


In [209]:
tmp.iloc[0]

0    18
1    16
2    11
3    18
Name: 0, dtype: int64

In [210]:
# element-wise operation: the lambda is applied to every individual value of the frame
# NOTE(review): newer pandas releases deprecate applymap in favour of DataFrame.map -- confirm against the installed version
tmp.applymap(lambda x: x**2)

Unnamed: 0,0,1,2,3
0,324,256,121,324
1,324,324,100,361
2,361,196,100,100
3,289,361,121,121
4,169,289,256,324


In [211]:
# str submodule (replace etc)
df1.Name.str

<pandas.core.strings.StringMethods at 0x7fe08de09880>

In [223]:
print(df1.head())

# select Name, Age from Table where Name like  "A%" and Age < 100;

# na=False: the row whose Name is missing counts as "no match" instead of
# propagating NaN into the boolean mask.
idx = df1["Name"].str.startswith("A", na=False) & (df1["Age"] < 100)
print(idx)
# A single .loc call selects rows and columns together (no chained indexing).
df1.loc[idx, ["Name", "Age"]]

          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0
0      False
100     True
200    False
300    False
400    False
500    False
600    False
700    False
800    False
900    False
dtype: bool


Unnamed: 0,Name,Age
100,Abhishek1,11.0


In [None]:
# load and save data

In [226]:
print(df1)

          Name    Age  Gender
0     Gaurav-1    1.0     0.0
100  Abhishek1   11.0     0.0
200    Krishna  201.0     0.0
300  Abhishek2  201.0     0.0
400   Harshita   11.0     1.0
500       Joey  201.0     1.0
600     Shweta    4.0     1.0
700       na-1   11.0     NaN
800       na-2    NaN     0.0
900        NaN  201.0     0.0


In [228]:
# groups: groupby
# Split df1 into groups of rows sharing the same Age value. Nothing is
# computed yet: grp is a lazy DataFrameGroupBy object.
grp = df1.groupby(by='Age')
print(grp)

# .groups maps each distinct Age to the index labels of its rows; the row
# whose Age is NaN does not appear in any group.
print(grp.groups)


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe08ddf6af0>
{1.0: Int64Index([0], dtype='int64'), 4.0: Int64Index([600], dtype='int64'), 11.0: Int64Index([100, 400, 700], dtype='int64'), 201.0: Int64Index([200, 300, 500, 900], dtype='int64')}


<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fe08e8c4220>

In [229]:
# count non-na values in Gender column, group wise
grp.Gender.count()

Age
1.0      1
4.0      1
11.0     2
201.0    4
Name: Gender, dtype: int64

In [232]:
# count non-na values for each col, group wise
grp.count()

Unnamed: 0_level_0,Name,Gender
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1,1
4.0,1,1
11.0,3,2
201.0,3,4


In [233]:
# count no of elements/size in each group
grp.size()

Age
1.0      1
4.0      1
11.0     3
201.0    4
dtype: int64

In [None]:
# groups and aggregates