In [1]:
import pandas as pd
import numpy as np

**creation**

In [2]:
# Column-oriented input: each keyword becomes a column label and each list
# holds that column's values, one entry per row.
data = dict(
    state=['Ohio', 'Ohio', 'Nevada', 'Nevada'],
    Year=[2000, 2001, 2000, 2001],
    pop=[1.5, 1.7, 2.4, 2.9],
)

In [3]:
# Build a DataFrame from the dict of lists `data`; each dict key becomes a column.
# Ending the cell on the bare name displays the frame.
df1 = pd.DataFrame(data)
df1

Unnamed: 0,state,Year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Nevada,2000,2.4
3,Nevada,2001,2.9


In [4]:
# `columns=` sets the column order explicitly (here Year, state, pop).
df2 = pd.DataFrame(data, columns=['Year', 'state', 'pop'])
df2

Unnamed: 0,Year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2000,Nevada,2.4
3,2001,Nevada,2.9


**creating a new column**

`df[column]` works for any column name, but `df.column` only works when the column name is a valid Python variable name.

In [5]:
# Assigning a list to a new label adds that column after the existing ones.
df1['new1'] = ['n1', 'n2', 'n3', 'n4']
df1

Unnamed: 0,state,Year,pop,new1
0,Ohio,2000,1.5,n1
1,Ohio,2001,1.7,n2
2,Nevada,2000,2.4,n3
3,Nevada,2001,2.9,n4


In [6]:
# insert() places the new column at position `loc` (0-based) and modifies df1
# in place; the scalar value is repeated for every row.
df1.insert(loc=1, column='new2', value=2)
df1

Unnamed: 0,state,new2,Year,pop,new1
0,Ohio,2,2000,1.5,n1
1,Ohio,2,2001,1.7,n2
2,Nevada,2,2000,2.4,n3
3,Nevada,2,2001,2.9,n4


**`del`**

In [7]:
# `del` removes the column from df1 in place.
del df1['new2']
df1

Unnamed: 0,state,Year,pop,new1
0,Ohio,2000,1.5,n1
1,Ohio,2001,1.7,n2
2,Nevada,2000,2.4,n3
3,Nevada,2001,2.9,n4


**`.columns`**

In [8]:
# Column labels, returned as an Index object.
df1.columns

Index(['state', 'Year', 'pop', 'new1'], dtype='object')

**rename column**

In [9]:
df2

Unnamed: 0,Year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2000,Nevada,2.4
3,2001,Nevada,2.9


In [10]:
# Assigning a list to `.columns` relabels the columns by position, in place.
df2.columns = ['year', 'state', 'pop']

In [11]:
df2

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2000,Nevada,2.4
3,2001,Nevada,2.9


In [12]:
# rename() maps old labels to new ones and returns a new frame; the result is
# not assigned here, so df2 itself keeps the 'year' label.
df2.rename(columns={'year':'Year'})

Unnamed: 0,Year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2000,Nevada,2.4
3,2001,Nevada,2.9


**index**

In [13]:
# Row labels; df1 was built without an explicit index, so this is a RangeIndex.
df1.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
# Assigning to `.index` replaces the row labels by position, in place.
df1.index = ['a', 'b', 'c', 'd']
df1

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
b,Ohio,2001,1.7,n2
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4


**hierarchical indexing**

In [15]:
# Passing a list of arrays as `index` / `columns` builds a two-level
# (hierarchical) index on that axis: outer level first, inner level second.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


**`reindex`**

In [16]:
# reindex() returns the rows in the requested label order; df1 is not modified.
df1.reindex(['c', 'd', 'b', 'a'])

Unnamed: 0,state,Year,pop,new1
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4
b,Ohio,2001,1.7,n2
a,Ohio,2000,1.5,n1


**`drop`**

In [17]:
# drop() with a bare label removes a row and returns a new frame; df1 is unchanged.
df1.drop('b') # drops the row labelled 'b'

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4


In [18]:
# `columns=` drops by column label instead of row label.
df1.drop(columns='new1')

Unnamed: 0,state,Year,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
c,Nevada,2000,2.4
d,Nevada,2001,2.9


In [19]:
# Same result as drop(columns='new1'), spelled with `labels=` plus `axis=`.
df1.drop(labels='new1', axis='columns')

Unnamed: 0,state,Year,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
c,Nevada,2000,2.4
d,Nevada,2001,2.9


**selection, filtering**

In [20]:
df1

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
b,Ohio,2001,1.7,n2
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4


In [21]:
# A single label in [] selects one column as a Series.
df1['pop']

a    1.5
b    1.7
c    2.4
d    2.9
Name: pop, dtype: float64

In [22]:
# Confirms that single-label selection yields a Series.
type(df1['pop'])

pandas.core.series.Series

In [23]:
# A list of labels in [] selects columns and keeps the result a DataFrame.
df1[['pop']]

Unnamed: 0,pop
a,1.5
b,1.7
c,2.4
d,2.9


In [24]:
# Confirms that list-of-labels selection yields a DataFrame, even for one column.
type(df1[['pop']])

pandas.core.frame.DataFrame

In [25]:
# .loc selects by label: the row labelled 'a', returned as a Series.
df1.loc['a']

state    Ohio
Year     2000
pop       1.5
new1       n1
Name: a, dtype: object

In [26]:
# .iloc selects by integer position: the first row.
df1.iloc[0]

state    Ohio
Year     2000
pop       1.5
new1       n1
Name: a, dtype: object

In [27]:
# Positional slice: the first two rows.
df1.iloc[:2]

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
b,Ohio,2001,1.7,n2


In [28]:
# A slice inside [] selects rows; here it gives the same two rows as .iloc[:2].
df1[:2]

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
b,Ohio,2001,1.7,n2


In [29]:
# All rows, column 'pop', selected by label.
df1.loc[:, 'pop']

a    1.5
b    1.7
c    2.4
d    2.9
Name: pop, dtype: float64

In [30]:
# All rows, column at position 2 (the third column, 'pop').
df1.iloc[:, 2]

a    1.5
b    1.7
c    2.4
d    2.9
Name: pop, dtype: float64

In [31]:
# All rows, first two columns by position.
df1.iloc[:, :2]

Unnamed: 0,state,Year
a,Ohio,2000
b,Ohio,2001
c,Nevada,2000
d,Nevada,2001


In [32]:
df1

Unnamed: 0,state,Year,pop,new1
a,Ohio,2000,1.5,n1
b,Ohio,2001,1.7,n2
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4


In [33]:
# Boolean mask: keep only the rows whose 'pop' exceeds 2.
df1[df1['pop'] > 2]

Unnamed: 0,state,Year,pop,new1
c,Nevada,2000,2.4,n3
d,Nevada,2001,2.9,n4


**transpose the dataframe**

In [34]:
df2

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2000,Nevada,2.4
3,2001,Nevada,2.9


In [35]:
# .T swaps rows and columns.
df2.T

Unnamed: 0,0,1,2,3
year,2000,2001,2000,2001
state,Ohio,Ohio,Nevada,Nevada
pop,1.5,1.7,2.4,2.9


**arithmetic method**

In [36]:
# Two small integer frames whose row labels and column labels only partly overlap.
df1 = pd.DataFrame(
    data=np.arange(4).reshape(2, 2),
    index=list('ab'),
    columns=['b', 'c'],
)
df2 = pd.DataFrame(
    data=np.arange(6).reshape(3, 2),
    index=list('bcd'),
    columns=['a', 'b'],
)

In [37]:
df1

Unnamed: 0,b,c
a,0,1
b,2,3


In [38]:
df2

Unnamed: 0,a,b
b,0,1
c,2,3
d,4,5


In [39]:
# Addition aligns on both row and column labels; any cell missing from either
# frame becomes NaN. Only row 'b' / column 'b' exists in both (2 + 1 = 3).
df1 + df2

Unnamed: 0,a,b,c
a,,,
b,,3.0,
c,,,
d,,,


In [40]:
# Element-wise reciprocal; the 0 entry gives inf.
1 / df1

Unnamed: 0,b,c
a,inf,1.0
b,0.5,0.333333


**`apply`, `applymap`, `map`**

In [41]:
# 4x4 integer frame (values 0..15) with state names as row labels.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
# apply() hands each column to the function as a Series; this computes the
# range (max minus min) of every column.
df.apply(lambda col: col.max() - col.min())

one      12
two      12
three    12
four     12
dtype: int64

In [43]:
# applymap() applies the function to every element of the frame; here each
# value is formatted as a string with two decimals.
# NOTE(review): recent pandas releases deprecate applymap in favour of
# DataFrame.map — confirm against the installed version.
df.applymap(lambda x: '%.2f' % x)

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,5.0,6.0,7.0
Utah,8.0,9.0,10.0,11.0
New York,12.0,13.0,14.0,15.0


In [44]:
# Series.map applies the function to each element of a single column.
df['two'].map(lambda x: '%.2f' % x)

Ohio         1.00
Colorado     5.00
Utah         9.00
New York    13.00
Name: two, dtype: object

**sorting**

In [45]:
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [46]:
# Sort the rows by index label.
df.sort_index()

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15
Ohio,0,1,2,3
Utah,8,9,10,11


In [47]:
# Sort the columns by their labels.
df.sort_index(axis='columns')

Unnamed: 0,four,one,three,two
Ohio,3,0,2,1
Colorado,7,4,6,5
Utah,11,8,10,9
New York,15,12,14,13


In [48]:
# Sort the rows by the values in column 'one'; they are already ascending,
# so the row order is unchanged.
df.sort_values('one')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


**`reset_index`**

In [49]:
# Move the index into a regular column named 'index' and number the rows from 0.
df.reset_index()

Unnamed: 0,index,one,two,three,four
0,Ohio,0,1,2,3
1,Colorado,4,5,6,7
2,Utah,8,9,10,11
3,New York,12,13,14,15


In [50]:
# drop=True discards the old index instead of keeping it as a column.
df.reset_index(drop=True)

Unnamed: 0,one,two,three,four
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


**ranking**

In [51]:
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
# Rank the values within each column (1 = smallest).
df.rank()

Unnamed: 0,one,two,three,four
Ohio,1.0,1.0,1.0,1.0
Colorado,2.0,2.0,2.0,2.0
Utah,3.0,3.0,3.0,3.0
New York,4.0,4.0,4.0,4.0


In [53]:
# Rank the values within each row.
df.rank(axis='columns')

Unnamed: 0,one,two,three,four
Ohio,1.0,2.0,3.0,4.0
Colorado,1.0,2.0,3.0,4.0
Utah,1.0,2.0,3.0,4.0
New York,1.0,2.0,3.0,4.0


**`.isin()`**

In [54]:
# Keep the rows whose 'two' value appears in the list; only 5 and 9 occur in the column.
df[df['two'].isin([2, 5, 6, 9])]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


**`is_unique`**

In [55]:
# True because no index label is repeated.
df.index.is_unique

True

**computing descriptive statistics**

In [56]:
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
# Column totals.
df.sum()

one      24
two      28
three    32
four     36
dtype: int64

In [58]:
# Row totals.
df.sum(axis='columns')

Ohio         6
Colorado    22
Utah        38
New York    54
dtype: int64

In [59]:
# Running total down each column.
df.cumsum()

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,6,8,10
Utah,12,15,18,21
New York,24,28,32,36


In [60]:
# Fractional change from the previous row. The first row has no predecessor
# (NaN), and column 'one' goes from 0 to 4, which gives inf.
df.pct_change()

Unnamed: 0,one,two,three,four
Ohio,,,,
Colorado,inf,4.0,2.0,1.333333
Utah,1.0,0.8,0.666667,0.571429
New York,0.5,0.444444,0.4,0.363636


**import a dataframe**

In [61]:
# Load a CSV from a path relative to the notebook's working directory.
# NOTE(review): no visible cell creates this file — it must already exist there.
csv_df = pd.read_csv('04-25-2020.csv')

In [62]:
# Preview the first five rows.
csv_df.head()

Unnamed: 0,continentName,countryName,cityName,confirmedCount,curedCount,deadCount,updateTime
0,North America,US,US,907096,99121,52063,2020-04-25 23:30:33
1,Europe,Spain,Spain,223759,95708,22902,2020-04-25 23:30:31
2,Europe,Italy,Italy,195351,63120,26384,2020-04-25 23:30:31
3,Europe,France,France,160292,44594,22614,2020-04-25 23:30:31
4,Europe,Germany,Germany,156513,109800,5877,2020-04-25 23:30:31


**export a dataframe**

In [63]:
# Write the frame to 'df.csv' in the working directory.
df.to_csv('df.csv')

**`dropna` & `fillna`**

In [64]:
# np.nan is the missing-value marker. It is aliased through the numpy import at
# the top of the notebook rather than through a second import mid-notebook.
NA = np.nan

# Rows: complete, partly missing, entirely missing, partly missing.
df = pd.DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [65]:
# With the default how='any', a row is dropped if it contains at least one NaN,
# so only the fully populated row 0 remains.
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [66]:
# how='all' drops only rows in which every value is NaN (row 2 here).
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [67]:
# Replace every NaN with 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [68]:
# Forward-fill: each NaN takes the last non-missing value above it in its column.
# ffill() is the dedicated method for this; fillna(method=...) is deprecated in
# recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


**removing duplicates**

In [69]:
# Seven rows in which the last one repeats the row before it.
df = pd.DataFrame(dict(
    k1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    k2=[1, 1, 2, 3, 3, 4, 4],
))
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [70]:
# Flags rows that repeat an earlier row in full; only row 6 duplicates row 5.
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [71]:
# Keep the first occurrence of each distinct row.
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


**dummy variables**

In [72]:
# A categorical 'key' column alongside a running integer 'data' column.
df = pd.DataFrame(dict(key=list('bbaca'), data=range(5)))
df

Unnamed: 0,key,data
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4


In [73]:
# One indicator column per distinct value of 'key'.
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0


**`merge`**

In [74]:
# Left frame: 'key' values repeat. Right frame: each 'key' appears once.
df1 = pd.DataFrame(dict(key=list('bbacaab'), data1=range(7)))
df2 = pd.DataFrame(dict(key=list('abd'), data2=range(3)))
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [75]:
df2 

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [76]:
# Inner join on 'key' (the default `how`): keys present in only one frame
# ('c' and 'd') are dropped.
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [77]:
# how='right' keeps every key of df2; 'd' has no match in df1, so its data1 is NaN.
pd.merge(df1, df2, on='key', how='right')

Unnamed: 0,key,data1,data2
0,b,0.0,1
1,b,1.0,1
2,b,6.0,1
3,a,2.0,0
4,a,4.0,0
5,a,5.0,0
6,d,,2


In [78]:
# df3 carries the join key as a column; df4 carries it as its index.
df3 = pd.DataFrame(dict(key=list('ababc'), value=range(5)))
df4 = pd.DataFrame(dict(group_val=[3.5, 7]), index=list('ab'))
df3

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,b,3
4,c,4


In [79]:
df4

Unnamed: 0,group_val
a,3.5
b,7.0


In [80]:
# Join df3's 'key' column against df4's index; 'c' has no match and is dropped.
pd.merge(df3, df4, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
1,b,1,7.0
3,b,3,7.0


**`join`**

In [81]:
# left_df carries the join key as a column; right_df carries it as its index.
left_df = pd.DataFrame(dict(key=list('ababc'), value=range(5)))
right_df = pd.DataFrame(dict(group_val=[3.5, 7]), index=list('ab'))
left_df

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,b,3
4,c,4


In [82]:
right_df

Unnamed: 0,group_val
a,3.5
b,7.0


In [83]:
# join() matches left_df['key'] against right_df's index and keeps all left
# rows; 'c' has no match, so its group_val is NaN.
left_df.join(right_df, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,b,3,7.0
4,c,4,


**`concat`**

In [84]:
# Three Series with non-overlapping index labels.
s1 = pd.Series(data=[0, 1], index=list('ab'))
s2 = pd.Series(data=[2, 3, 4], index=list('cde'))
s3 = pd.Series(data=[5, 6], index=list('fg'))

In [85]:
# Default axis: stack the Series end to end into one longer Series.
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [86]:
# axis=1 places each Series in its own column, aligned on the union of the
# index labels; positions a Series does not cover are NaN.
# sort=False keeps the labels in order of appearance and avoids the
# FutureWarning older pandas emits when the other axis is not already aligned.
pd.concat([s1, s2, s3], axis=1, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


**`pivot`**

In [87]:
# Long-format data: one row per (key, variable) pair with its value.
df = pd.DataFrame(dict(
    key=['foo', 'bar', 'baz'] * 3,
    variable=['A'] * 3 + ['B'] * 3 + ['C'] * 3,
    value=list(range(1, 10)),
))
df

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [88]:
# Long -> wide: one row per 'key', one column per 'variable', cells filled from 'value'.
# Keyword arguments are used because newer pandas no longer accepts these positionally.
df.pivot(index='key', columns='variable', values='value')

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


**`melt`**

In [89]:
# Wide-format data: one row per key, one column per measured variable.
df = pd.DataFrame(dict(
    key=['foo', 'bar', 'baz'],
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
))
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [90]:
# Wide -> long: 'key' is kept as the identifier column; every other column is
# unpivoted into (variable, value) pairs.
pd.melt(frame=df, id_vars=['key'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


**`groupby`**

In [91]:
# Two grouping keys and two integer data columns.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=range(5),
    data2=range(5, 10),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0,5
1,a,two,1,6
2,b,one,2,7
3,b,two,3,8
4,a,one,4,9


In [92]:
# Sum data1 within each key1 group.
df.groupby(['key1'])['data1'].sum()

key1
a    5
b    5
Name: data1, dtype: int64

In [93]:
# Group by the (key1, key2) pair; the result is indexed by both keys.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one     2
      two     1
b     one     2
      two     3
Name: data1, dtype: int64

In [94]:
# 5x5 integer frame (values 0..24): one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    data=np.arange(25).reshape(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,0,1,2,3,4
Steve,5,6,7,8,9
Wes,10,11,12,13,14
Jim,15,16,17,18,19
Travis,20,21,22,23,24


In [95]:
# Column label -> group name. 'f' is not a column of `people`; keys that match
# no column are simply ignored (no 'orange' group appears in the result).
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

# Sum the columns that share a group name. The grouping is done on the
# transposed frame (where the column labels form the row index) and transposed
# back, which avoids groupby(axis='columns'), deprecated in recent pandas.
people.T.groupby(mapping).sum().T

Unnamed: 0,blue,red
Joe,5,5
Steve,15,20
Wes,25,35
Jim,35,50
Travis,45,65


**aggregation**

In [96]:
# Small restaurant-tips sample: five dinner bills with tip, party size and
# tip percentage. `columns=` pins the column order explicitly.
df = pd.DataFrame(
    dict(
        total_bill=[10, 40, 20, 21, 19],
        tip=[1, 3, 2, 1.4, 1.9],
        smoker=['No', 'No', 'Yes', 'No', 'Yes'],
        day=['Sun', 'Mon', 'Fri', 'Sun', 'Tue'],
        time=['Dinner'] * 5,
        size=[2, 3, 4, 3, 2],
        tip_pct=[0.1, 0.075, 0.1, 0.07, 0.1],
    ),
    columns=['total_bill', 'tip', 'smoker', 'day', 'time', 'size', 'tip_pct'],
)
df

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,10,1.0,No,Sun,Dinner,2,0.1
1,40,3.0,No,Mon,Dinner,3,0.075
2,20,2.0,Yes,Fri,Dinner,4,0.1
3,21,1.4,No,Sun,Dinner,3,0.07
4,19,1.9,Yes,Tue,Dinner,2,0.1


In [97]:
# Summary statistics; only the numeric columns appear in the result.
df.describe()

Unnamed: 0,total_bill,tip,size,tip_pct
count,5.0,5.0,5.0,5.0
mean,22.0,1.86,2.8,0.089
std,10.977249,0.753658,0.83666,0.015166
min,10.0,1.0,2.0,0.07
25%,19.0,1.4,2.0,0.075
50%,20.0,1.9,3.0,0.1
75%,21.0,2.0,3.0,0.1
max,40.0,3.0,4.0,0.1


In [98]:
# Group by the (day, smoker) pair, then compute several aggregations of tip_pct
# at once; each name in the list becomes a result column.
# std is NaN for the groups that contain a single row.
grp_df = df.groupby(['day', 'smoker'])
grp_df['tip_pct'].agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,Yes,0.1,
Mon,No,0.075,
Sun,No,0.085,0.021213
Tue,Yes,0.1,


In [99]:
# (name, function) tuples set the names of the result columns.
# The string 'std' selects pandas' own group standard deviation, consistent
# with the 'mean' spelling; passing np.std relies on pandas silently swapping
# in its own std and triggers a warning in newer releases.
grp_df['tip_pct'].agg([('foo', 'mean'), ('bar', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,Yes,0.1,
Mon,No,0.075,
Sun,No,0.085,0.021213
Tue,Yes,0.1,


**`pivot_table` & `crosstab`**

In [100]:
# Ten numbered samples, each tagged with a nationality and a handedness.
# `columns=` pins the column order explicitly.
df = pd.DataFrame(
    dict(
        Sample=range(1, 11),
        Nationality=[
            'USA', 'Japan', 'USA', 'Japan', 'Japan',
            'USA', 'Japan', 'USA', 'USA', 'USA',
        ],
        Handedness=[
            'Right-handed', 'Left-handed', 'Right-handed', 'Left-handed',
            'Left-handed', 'Right-handed', 'Left-handed', 'Left-handed',
            'Right-handed', 'Right-handed',
        ],
    ),
    columns=['Sample', 'Nationality', 'Handedness'],
)
df

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Left-handed
4,5,Japan,Left-handed
5,6,USA,Right-handed
6,7,Japan,Left-handed
7,8,USA,Left-handed
8,9,USA,Right-handed
9,10,USA,Right-handed


In [101]:
# Frequency table: number of rows for each Nationality / Handedness combination.
pd.crosstab(df['Nationality'], df['Handedness'])

Handedness,Left-handed,Right-handed
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
Japan,4,0
USA,1,5


In [102]:
# Same counts as the crosstab, built as a pivot table: count the 'Sample'
# entries for each Nationality (rows) / Handedness (columns) pair, showing 0
# where a combination does not occur.
df.pivot_table(
    values='Sample',
    index='Nationality',
    columns='Handedness',
    aggfunc='count',
    fill_value=0,
)

Handedness,Left-handed,Right-handed
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
Japan,4,0
USA,1,5
