### Python Pandas Tutorial

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from IPython.display import display

# One Series built from a plain list (the NaN entry makes it float64) and
# one named Series built from a NumPy array.
values_with_gap = [1, 3, 5, np.nan, 6, 8]
series_list = pd.Series(values_with_gap)

squares = np.array([0, 1, 4, 9, 16, 25])
series_np = pd.Series(squares, name='Squares')

print(series_list)
print(series_np.head(3))

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
0    0
1    1
2    4
Name: Squares, dtype: int32


In [2]:
# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    apples=[3, 2, 0, 1],
    oranges=[0, 3, 7, 2],
)

purchases = pd.DataFrame.from_dict(data)
display(purchases)

Unnamed: 0,apples,oranges
0,3,0
1,2,3
2,0,7
3,1,2


In [3]:
# Rebuild the frame with customer names as row labels instead of 0..3.
customers = ['June', 'Robert', 'Lily', 'David']
purchases = pd.DataFrame(data, index=customers)
display(purchases)

Unnamed: 0,apples,oranges
June,3,0
Robert,2,3
Lily,0,7
David,1,2


In [4]:
# Label-based lookup of one row; the result is a Series keyed by column name.
purchases.loc['June', :]

apples     3
oranges    0
Name: June, dtype: int64

In [5]:
# An empty frame: no rows, no columns.
df = pd.DataFrame()
display(df)

# Row-oriented input (a list of [name, age] records). Only 'Age' is numeric,
# so cast that column alone: dtype=float on the constructor applies to every
# column and cannot convert the 'Name' strings.
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
display(df)

# Column-oriented input (a dict of equal-length lists), same targeted cast.
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
df = pd.DataFrame(data).astype({'Age': float})
display(df)

Unnamed: 0,Name,Age
0,Alex,10.0
1,Bob,12.0
2,Clarke,13.0


Unnamed: 0,Name,Age
0,Tom,28.0
1,Jack,34.0
2,Steve,29.0
3,Ricky,42.0


In [6]:
# Extract one column as a one-column DataFrame rather than a Series.
col = df['Name'].to_frame()
display(col)

Unnamed: 0,Name
0,Tom
1,Jack
2,Steve
3,Ricky


In [7]:
# Order rows by age; the original index labels travel with their rows.
df = df.sort_values('Age')
display(df)

# Positional lookup: the third row of the sorted frame.
row = df.iloc[2].to_frame()
display(row)

# Label lookup: the row whose index label is 3.
row = df.loc[3].to_frame()
display(row)

Unnamed: 0,Name,Age
0,Tom,28.0
2,Steve,29.0
1,Jack,34.0
3,Ricky,42.0


Unnamed: 0,1
Name,Jack
Age,34


Unnamed: 0,3
Name,Ricky
Age,42


In [8]:
# Demo frame for the selection examples:
#   .loc[]  selects by label
#   .iloc[] selects by integer position
row_labels = list('abcdefgh')
col_labels = list('ABCD')
df = pd.DataFrame(np.random.randn(len(row_labels), len(col_labels)),
                  index=row_labels, columns=col_labels)
display(df)

Unnamed: 0,A,B,C,D
a,2.255886,3.070715,0.196333,-0.881413
b,-1.837882,-0.861416,-1.008404,1.371812
c,0.549241,-0.118438,0.521802,-0.610682
d,0.404846,-0.450172,-1.157401,0.40045
e,-0.517218,0.289338,-1.560531,-0.900231
f,0.264137,1.950847,0.084668,-0.083794
g,-1.247032,0.585988,-0.102587,1.431847
h,0.415488,0.02181,0.579152,-0.628987


In [9]:
# All rows of a single column, selected by label and kept as a DataFrame.
col = df.loc[:, 'A'].to_frame()
display(col)

Unnamed: 0,A
a,2.255886
b,-1.837882
c,0.549241
d,0.404846
e,-0.517218
f,0.264137
g,-1.247032
h,0.415488


In [10]:
# All rows for a list of column labels.
wanted_columns = ['A', 'C']
cols = pd.DataFrame(df.loc[:, wanted_columns])
display(cols)

Unnamed: 0,A,C
a,2.255886,0.196333
b,-1.837882,-1.008404
c,0.549241,0.521802
d,0.404846,-1.157401
e,-0.517218,-1.560531
f,0.264137,0.084668
g,-1.247032,-0.102587
h,0.415488,0.579152


In [11]:
# A hand-picked list of row labels crossed with a list of column labels.
wanted_rows = ['a', 'b', 'f', 'h']
wanted_columns = ['A', 'C']
cols_rows = pd.DataFrame(df.loc[wanted_rows, wanted_columns])
display(cols_rows)

Unnamed: 0,A,C
a,2.255886,0.196333
b,-1.837882,-1.008404
f,0.264137,0.084668
h,0.415488,0.579152


In [12]:
# A label slice of rows, every column. With .loc the end label ('h') is
# part of the result.
rows = pd.DataFrame(df.loc['a':'h', :])
display(rows)

Unnamed: 0,A,B,C,D
a,2.255886,3.070715,0.196333,-0.881413
b,-1.837882,-0.861416,-1.008404,1.371812
c,0.549241,-0.118438,0.521802,-0.610682
d,0.404846,-0.450172,-1.157401,0.40045
e,-0.517218,0.289338,-1.560531,-0.900231
f,0.264137,1.950847,0.084668,-0.083794
g,-1.247032,0.585988,-0.102587,1.431847
h,0.415488,0.02181,0.579152,-0.628987


In [13]:
# First four rows, every column, selected by integer position.
rows = pd.DataFrame(df.iloc[0:4, :])
display(rows)

Unnamed: 0,A,B,C,D
a,2.255886,3.070715,0.196333,-0.881413
b,-1.837882,-0.861416,-1.008404,1.371812
c,0.549241,-0.118438,0.521802,-0.610682
d,0.404846,-0.450172,-1.157401,0.40045


In [14]:
# .iloc[] with integer slices; the end position is excluded.
rows1 = pd.DataFrame(df.iloc[0:1])        # first row only
rows4 = pd.DataFrame(df.iloc[0:4])        # first four rows
rows5 = pd.DataFrame(df.iloc[1:5, 2:4])   # rows 1-4, columns 2-3
display(rows1, rows4, rows5)

Unnamed: 0,A,B,C,D
a,2.255886,3.070715,0.196333,-0.881413


Unnamed: 0,A,B,C,D
a,2.255886,3.070715,0.196333,-0.881413
b,-1.837882,-0.861416,-1.008404,1.371812
c,0.549241,-0.118438,0.521802,-0.610682
d,0.404846,-0.450172,-1.157401,0.40045


Unnamed: 0,C,D
b,-1.008404,1.371812
c,0.521802,-0.610682
d,-1.157401,0.40045
e,-1.560531,-0.900231


In [15]:
# .iloc[] with explicit position lists and with slices.
cols2 = pd.DataFrame(df.iloc[[1, 3, 5], [1, 3]])  # rows 1,3,5 x columns 1,3
cols4 = pd.DataFrame(df.iloc[1:3])                # rows 1-2, every column
cols3 = pd.DataFrame(df.iloc[:, 1:3])             # every row, columns 1-2
display(cols2, cols4, cols3)

Unnamed: 0,B,D
b,-0.861416,1.371812
d,-0.450172,0.40045
f,1.950847,-0.083794


Unnamed: 0,A,B,C,D
b,-1.837882,-0.861416,-1.008404,1.371812
c,0.549241,-0.118438,0.521802,-0.610682


Unnamed: 0,B,C
a,3.070715,0.196333
b,-0.861416,-1.008404
c,-0.118438,0.521802
d,-0.450172,-1.157401
e,0.289338,-1.560531
f,1.950847,0.084668
g,0.585988,-0.102587
h,0.02181,0.579152


In [16]:
# Three spellings of plain column selection.
col = df['A'].to_frame()             # single label
cols = pd.DataFrame(df[['A', 'B']])  # list of labels
colA = df.A.to_frame()               # attribute access
display(col, cols, colA)

Unnamed: 0,A
a,2.255886
b,-1.837882
c,0.549241
d,0.404846
e,-0.517218
f,0.264137
g,-1.247032
h,0.415488


Unnamed: 0,A,B
a,2.255886,3.070715
b,-1.837882,-0.861416
c,0.549241,-0.118438
d,0.404846,-0.450172
e,-0.517218,0.289338
f,0.264137,1.950847
g,-1.247032,0.585988
h,0.415488,0.02181


Unnamed: 0,A
a,2.255886
b,-1.837882
c,0.549241
d,0.404846
e,-0.517218
f,0.264137
g,-1.247032
h,0.415488


In [17]:
# Team standings by season. 'kings' (lowercase) is a different value from
# 'Kings', so it ends up in a group of its own when grouping by team.
ipl_data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings',
             'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015,
             2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812,
               756, 788, 694, 701, 804, 690],
}
df = pd.DataFrame(ipl_data)
display(df)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


In [18]:
# Show which row labels fall into each group, first for a single key and
# then for a compound key.
for keys in ('Team', ['Team', 'Year']):
    print(df.groupby(keys).groups)

{'Devils': [2, 3], 'Kings': [4, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10], 'kings': [5]}
{('Devils', 2014): [2], ('Devils', 2015): [3], ('Kings', 2014): [4], ('Kings', 2016): [6], ('Kings', 2017): [7], ('Riders', 2014): [0], ('Riders', 2015): [1], ('Riders', 2016): [8], ('Riders', 2017): [11], ('Royals', 2014): [9], ('Royals', 2015): [10], ('kings', 2015): [5]}


In [19]:
# Split by season, then pull out the rows that belong to 2014.
grouped = df.groupby(by='Year')
season_2014 = grouped.get_group(2014)
display(season_2014)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
2,Devils,2,2014,863
4,Kings,3,2014,741
9,Royals,4,2014,701


In [20]:
# Average points per season. The aggregation is named by string: handing a
# NumPy callable such as np.mean to .agg() is deprecated in recent pandas.
grouped = df.groupby('Year')
mean = grouped['Points'].agg('mean').to_frame()
display(mean)

Unnamed: 0_level_0,Points
Year,Unnamed: 1_level_1
2014,795.25
2015,769.5
2016,725.0
2017,739.0


In [21]:
# Average points per (team, season). as_index=False keeps the group keys as
# ordinary columns. The aggregation is named by string because passing
# np.mean to .agg() is deprecated in recent pandas.
grouped = df.groupby(['Team', 'Year'], as_index=False)
mean = pd.DataFrame(grouped['Points'].agg('mean'))
display(mean)

Unnamed: 0,Team,Year,Points
0,Devils,2014,863
1,Devils,2015,673
2,Kings,2014,741
3,Kings,2016,756
4,Kings,2017,788
5,Riders,2014,876
6,Riders,2015,789
7,Riders,2016,694
8,Riders,2017,690
9,Royals,2014,701


In [22]:
# Several aggregates of 'Points' per team in one call. String names replace
# np.sum / np.mean / np.std, which are deprecated as .agg() arguments in
# recent pandas. A one-row group (here 'kings') has no defined std.
grouped = df.groupby('Team')
display(grouped['Points'].agg(['sum', 'mean', 'std']))

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,1536,768.0,134.350288
Kings,2285,761.666667,24.006943
Riders,3049,762.25,88.567771
Royals,1505,752.5,72.831998
kings,812,812.0,


In [23]:
grouped = df.groupby('Team')


def score(x):
    """Centre ``x`` on its own mean, divide by its own std, scale by 10."""
    centred = x - x.mean()
    return centred / x.std() * 10


# Applied per team and per column; the result keeps the original row index.
display(grouped.transform(score))

Unnamed: 0,Rank,Year,Points
0,-15.0,-11.61895,12.843272
1,5.0,-3.872983,3.020286
2,-7.071068,-7.071068,7.071068
3,7.071068,7.071068,-7.071068
4,11.547005,-10.910895,-8.608621
5,,,
6,-5.773503,2.182179,-2.360428
7,-5.773503,8.728716,10.969049
8,5.0,3.872983,-7.705963
9,7.071068,-7.071068,-7.071068


In [24]:
# Small categorical/numeric frame used by the pivot_table examples.
pivot_source = dict(
    A=["foo"] * 5 + ["bar"] * 4,
    B=["one", "one", "one", "two", "two", "one", "one", "two", "two"],
    C=["small", "large", "large", "small", "small",
       "large", "small", "small", "large"],
    D=[1, 2, 2, 3, 3, 4, 5, 6, 7],
    E=[2, 4, 5, 5, 6, 6, 8, 9, 9],
)
df = pd.DataFrame(pivot_source)

display(df)

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [25]:
# Sum of 'D' for every (A, B) row pair, spread across the values of 'C'.
# Combinations absent from the data show up as NaN. aggfunc is given by name
# because NumPy callables are deprecated there in recent pandas.
table = pd.pivot_table(df, values='D', index=['A', 'B'],
                       columns=['C'], aggfunc='sum')

display(table)

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [26]:
# Same pivot, with missing combinations filled with 0 instead of NaN.
# aggfunc is given by name because NumPy callables are deprecated there in
# recent pandas.
table = pd.pivot_table(df, values='D', index=['A', 'B'],
                       columns=['C'], aggfunc='sum', fill_value=0)

display(table)

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4,5
bar,two,7,6
foo,one,4,1
foo,two,0,6


In [27]:
# A per-column aggregation mapping: mean of 'D' and mean of 'E' for each
# (A, C) pair. Aggregations are given by name because NumPy callables are
# deprecated as aggfunc values in recent pandas.
table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
                       aggfunc={'D': 'mean',
                                'E': 'mean'})

display(table)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E
A,C,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,large,5.5,7.5
bar,small,5.5,8.5
foo,large,2.0,4.5
foo,small,2.333333,4.333333


In [28]:
# Load the 'iris' sheet of the workbook that sits next to this notebook.
iris = pd.read_excel('iris.xlsx', sheet_name='iris')
# Print the column labels as a NumPy array. to_numpy() is used because
# Index.ravel() no longer returns an ndarray in pandas 2.x.
print(iris.columns.to_numpy())
display(iris)

['sepal_length' 'sepal_width' 'petal_length' 'petal_width' 'species']


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [29]:
# Serialise the frame as a JSON array with one object per row.
iris.to_json(path_or_buf='iris.json', orient='records')

In [30]:
# Read the JSON file written by the previous cell back into a DataFrame.
json_path = 'iris.json'
iris_json = pd.read_json(json_path)
display(iris_json)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [31]:
# Versicolor rows whose sepal_length exceeds 6.5.
versicolor_long_sepal = "species == 'versicolor' & sepal_length > 6.5"
query = iris_json.query(versicolor_long_sepal)
display(query)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
52,6.9,3.1,4.9,1.5,versicolor
58,6.6,2.9,4.6,1.3,versicolor
65,6.7,3.1,4.4,1.4,versicolor
75,6.6,3.0,4.4,1.4,versicolor
76,6.8,2.8,4.8,1.4,versicolor
77,6.7,3.0,5.0,1.7,versicolor
86,6.7,3.1,4.7,1.5,versicolor


In [32]:
# Setosa or versicolor rows with petal_width in the half-open range (0.5, 1].
petal_width_filter = (
    "species in ['setosa', 'versicolor']"
    " & petal_width > 0.5 & petal_width <= 1"
)
query = iris_json.query(petal_width_filter)
display(query)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
43,5.0,3.5,1.6,0.6,setosa
57,4.9,2.4,3.3,1.0,versicolor
60,5.0,2.0,3.5,1.0,versicolor
62,6.0,2.2,4.0,1.0,versicolor
67,5.8,2.7,4.1,1.0,versicolor
79,5.7,2.6,3.5,1.0,versicolor
81,5.5,2.4,3.7,1.0,versicolor
93,5.0,2.3,3.3,1.0,versicolor


In [33]:
# Two frames sharing a 'key' column; 'foo' appears twice in each of them.
df1 = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                    'lvalue': [1, 2, 3, 5]})
df2 = pd.DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                    'rvalue': [5, 6, 7, 8]})

display(df1)
display(df2)

# Stack the frames vertically. Columns missing on one side become NaN and
# the original row labels (0..3) are repeated.
result = pd.concat([df1, df2])
display(result)

# DataFrame.append() was removed in pandas 2.0; pd.concat() is its
# replacement and yields the same stacked frame as above.
result = pd.concat([df1, df2])
display(result)

# Same stacking, but with a fresh 0..n-1 index and column order left as found.
result = pd.concat([df1, df2], ignore_index=True, sort=False)
display(result)

# Join on 'key' (inner join is the default). Each 'foo' on the left pairs
# with each 'foo' on the right, giving four 'foo' rows.
result = pd.merge(df1, df2, on='key')
display(result)

# The same inner join with `how` spelled out. The key is named once; listing
# it twice in `on` added nothing.
result = pd.merge(df1, df2, how='inner', on='key')
display(result)

# Right join: every row of df2 is kept.
result = pd.merge(df1, df2, how='right', on='key')
display(result)

Unnamed: 0,key,lvalue
0,foo,1
1,bar,2
2,baz,3
3,foo,5


Unnamed: 0,key,rvalue
0,foo,5
1,bar,6
2,baz,7
3,foo,8


Unnamed: 0,key,lvalue,rvalue
0,foo,1.0,
1,bar,2.0,
2,baz,3.0,
3,foo,5.0,
0,foo,,5.0
1,bar,,6.0
2,baz,,7.0
3,foo,,8.0


Unnamed: 0,key,lvalue,rvalue
0,foo,1.0,
1,bar,2.0,
2,baz,3.0,
3,foo,5.0,
0,foo,,5.0
1,bar,,6.0
2,baz,,7.0
3,foo,,8.0


Unnamed: 0,key,lvalue,rvalue
0,foo,1.0,
1,bar,2.0,
2,baz,3.0,
3,foo,5.0,
4,foo,,5.0
5,bar,,6.0
6,baz,,7.0
7,foo,,8.0


Unnamed: 0,key,lvalue,rvalue
0,foo,1,5
1,foo,1,8
2,foo,5,5
3,foo,5,8
4,bar,2,6
5,baz,3,7


Unnamed: 0,key,lvalue,rvalue
0,foo,1,5
1,foo,1,8
2,foo,5,5
3,foo,5,8
4,bar,2,6
5,baz,3,7


Unnamed: 0,key,lvalue,rvalue
0,foo,1,5
1,foo,5,5
2,foo,1,8
3,foo,5,8
4,bar,2,6
5,baz,3,7


In [34]:
s = pd.Series(np.random.randn(4))

# Basic introspection attributes of a Series, printed one per line.
for attribute in ('axes', 'empty', 'ndim', 'size', 'values', 'dtypes'):
    print(getattr(s, attribute))

# Two Series of different lengths: operations align on the index, so the
# position present only in `b` has no partner in `a`.
a = pd.Series(np.random.randn(4))
b = pd.Series(np.random.randn(5))

frame = dict(a=a, b=b)
result = pd.DataFrame(frame)
result['c'] = a.div(b)
display(result)

[RangeIndex(start=0, stop=4, step=1)]
False
1
4
[-1.88208986  0.52621279 -1.09531859 -0.62154971]
float64


Unnamed: 0,a,b,c
0,0.054316,0.154039,0.352609
1,-1.28391,-1.531322,0.838432
2,0.65098,-0.169529,-3.839934
3,-0.240341,-1.142895,0.210291
4,,0.817178,


In [35]:
# Add column 'd', computed from the existing columns by an eval expression.
# The name is rebound to the returned frame instead of mutating in place.
result = result.eval('d = (a - b) / c')
display(result)

Unnamed: 0,a,b,c,d
0,0.054316,0.154039,0.352609,-0.282817
1,-1.28391,-1.531322,0.838432,0.295089
2,0.65098,-0.169529,-3.839934,-0.213678
3,-0.240341,-1.142895,0.210291,4.29193
4,,0.817178,,


In [36]:
# Flag (1/0) the row holding the largest 'd'. Series.max() skips NaN, whereas
# the built-in max() compares element by element and can return NaN
# depending on where the missing value sits in the column.
result['d_max'] = np.where(result['d'] == result['d'].max(), 1, 0)
display(result)

Unnamed: 0,a,b,c,d,d_max
0,0.054316,0.154039,0.352609,-0.282817,0
1,-1.28391,-1.531322,0.838432,0.295089,0
2,0.65098,-0.169529,-3.839934,-0.213678,0
3,-0.240341,-1.142895,0.210291,4.29193,1
4,,0.817178,,,0


In [37]:
d = {
    'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
                       'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
    'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
    'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6,
                         3.8, 3.78, 2.98, 4.80, 4.10, 3.65]),
}

df = pd.DataFrame(d)
display(df)

# Iterating over a DataFrame yields its column labels.
for col in df:
    print(col)

# sum() concatenates the strings in 'Name' and adds up the numeric columns.
print(df.sum())
# 'Name' holds strings and has no mean, so restrict the mean to the numeric
# columns explicitly; pandas 2.x raises otherwise.
print(df.mean(numeric_only=True))
# describe() covers numeric columns by default; include='all' adds 'Name'.
print(df.describe())
print(df.describe(include='all'))

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8
7,Lee,34,3.78
8,David,40,2.98
9,Gasper,30,4.8


Name
Age
Rating
Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object
Age       31.833333
Rating     3.743333
dtype: float64
             Age     Rating
count  12.000000  12.000000
mean   31.833333   3.743333
std     9.232682   0.661628
min    23.000000   2.560000
25%    25.000000   3.230000
50%    29.500000   3.790000
75%    35.500000   4.132500
max    51.000000   4.800000
          Name        Age     Rating
count       12  12.000000  12.000000
unique      12        NaN        NaN
top     Betina        NaN        NaN
freq         1        NaN        NaN
mean       NaN  31.833333   3.743333
std        NaN   9.232682   0.661628
min        NaN  23.000000   2.560000
25%        NaN  25.000000   3.230000
50%        NaN  29.500000   3.790000
75%        NaN  35.500000   4.132500
max        NaN  51.000000   4.800000


In [38]:
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
# List the example datasets seaborn can load, then load 'iris' and preview it.
dataset_names = sns.get_dataset_names()
print(dataset_names)

iris = sns.load_dataset('iris')
iris.head()

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'tips', 'titanic']


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of iris.
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [40]:
# pd.datetime (an alias of the stdlib datetime class) was removed from
# pandas; pd.Timestamp is the native equivalent.
start = pd.Timestamp(2011, 1, 1)
end = pd.Timestamp(2011, 1, 5)

# One row per calendar day from start to end, both ends included.
date = pd.DataFrame({'Date': pd.date_range(start, end)})
display(date)

Unnamed: 0,Date
0,2011-01-01
1,2011-01-02
2,2011-01-03
3,2011-01-04
4,2011-01-05


In [41]:
# A categorical Series, turned into a one-column frame named 'Category'.
s = pd.Series(["a", "b", "c", "a"], dtype="category")
df = s.to_frame(name='Category')
display(df)

Unnamed: 0,Category
0,a
1,b
2,c
3,a


In [42]:
# For the categorical column, describe() reports count, unique, top and freq
# rather than numeric statistics.
df.describe()

Unnamed: 0,Category
count,4
unique,3
top,a
freq,2
