## Working with Pandas

### Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = [10,20,30]
labels = ['a','b','c']
arr = np.array(data)
adict = {'a': 10, 'b':20, 'c': 40}

In [3]:
pd.Series(data = data, index = labels)

a    10
b    20
c    30
dtype: int64

In [4]:
pd.Series(data, labels)

a    10
b    20
c    30
dtype: int64

In [5]:
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [6]:
pd.Series(adict)

a    10
b    20
c    40
dtype: int64

Series can hold references to functions as well

In [7]:
pd.Series([print,sum,max])

0    <built-in function print>
1      <built-in function sum>
2      <built-in function max>
dtype: object

In [8]:
f = pd.Series([print,sum,max])

In [9]:
f[0]("Hello")

Hello


In [10]:
ser1 = pd.Series( [1,2,3,5],['USA','Japan','USSR','Germany'])

In [11]:
ser2 = pd.Series([6,7,4,2],['USA','Italy','Germany','USSR'])

In [12]:
ser1 + ser2

Germany    9.0
Italy      NaN
Japan      NaN
USA        7.0
USSR       5.0
dtype: float64

### DataFrames

In [13]:
df = pd.DataFrame(np.linspace(0,1,25).reshape(5,5),['A','B','C','D','E'],['V','W','X','Y','Z'])
df

Unnamed: 0,V,W,X,Y,Z
A,0.0,0.041667,0.083333,0.125,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375
C,0.416667,0.458333,0.5,0.541667,0.583333
D,0.625,0.666667,0.708333,0.75,0.791667
E,0.833333,0.875,0.916667,0.958333,1.0


In [14]:
df['W']  #or df.W  however not recommended

A    0.041667
B    0.250000
C    0.458333
D    0.666667
E    0.875000
Name: W, dtype: float64

In [15]:
print(type(df))
print(type(df['W']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [16]:
df[['W','Y']]

Unnamed: 0,W,Y
A,0.041667,0.125
B,0.25,0.333333
C,0.458333,0.541667
D,0.666667,0.75
E,0.875,0.958333


In [17]:
df['NEW'] = df['W']+df['Y']
df

Unnamed: 0,V,W,X,Y,Z,NEW
A,0.0,0.041667,0.083333,0.125,0.166667,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375,0.583333
C,0.416667,0.458333,0.5,0.541667,0.583333,1.0
D,0.625,0.666667,0.708333,0.75,0.791667,1.416667
E,0.833333,0.875,0.916667,0.958333,1.0,1.833333


In [18]:
df.drop('NEW', axis = 1, inplace = True)
#or
#df = df.drop('NEW', axis = 1)
df

Unnamed: 0,V,W,X,Y,Z
A,0.0,0.041667,0.083333,0.125,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375
C,0.416667,0.458333,0.5,0.541667,0.583333
D,0.625,0.666667,0.708333,0.75,0.791667
E,0.833333,0.875,0.916667,0.958333,1.0


Rows are axis = 0 while Columns are exis = 1

In [19]:
df.shape 

(5, 5)

As tuple shows the occurence of rows first at 0 index while columns at 1 index, hence the axis names.

In [20]:
df.loc[['A','B']]

Unnamed: 0,V,W,X,Y,Z
A,0.0,0.041667,0.083333,0.125,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375


To get row wise indexing and selection

In [21]:
df.loc[['A','B'],['V','W','X']]

Unnamed: 0,V,W,X
A,0.0,0.041667,0.083333
B,0.208333,0.25,0.291667


In [22]:
df.iloc[1]

V    0.208333
W    0.250000
X    0.291667
Y    0.333333
Z    0.375000
Name: B, dtype: float64

In [23]:
df.iloc[1,[0,1,2]]

V    0.208333
W    0.250000
X    0.291667
Name: B, dtype: float64

'iloc' takes only integers as indexing values.

In [24]:
df

Unnamed: 0,V,W,X,Y,Z
A,0.0,0.041667,0.083333,0.125,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375
C,0.416667,0.458333,0.5,0.541667,0.583333
D,0.625,0.666667,0.708333,0.75,0.791667
E,0.833333,0.875,0.916667,0.958333,1.0


In [25]:
df.ix['A',[0,1,2]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


V    0.000000
W    0.041667
X    0.083333
Name: A, dtype: float64

In [26]:
df.ix['D','Y'] = 6

In [27]:
df

Unnamed: 0,V,W,X,Y,Z
A,0.0,0.041667,0.083333,0.125,0.166667
B,0.208333,0.25,0.291667,0.333333,0.375
C,0.416667,0.458333,0.5,0.541667,0.583333
D,0.625,0.666667,0.708333,6.0,0.791667
E,0.833333,0.875,0.916667,0.958333,1.0


### Conditional Selection

In [28]:
df = pd.DataFrame(np.random.randn(25).reshape(5,5),['A','B','C','D','E'],['V','W','X','Y','Z'])
df

Unnamed: 0,V,W,X,Y,Z
A,0.271522,-1.072557,-0.204947,-0.934321,-1.301463
B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
C,-0.495905,-0.413705,-0.74714,-1.060777,1.747387
D,0.103344,-0.867173,0.478603,-1.720202,-0.234376
E,1.23373,-1.500402,-0.375976,0.008403,0.487071


In [29]:
df[ df < 0.5]

Unnamed: 0,V,W,X,Y,Z
A,0.271522,-1.072557,-0.204947,-0.934321,-1.301463
B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
C,-0.495905,-0.413705,-0.74714,-1.060777,
D,0.103344,-0.867173,0.478603,-1.720202,-0.234376
E,,-1.500402,-0.375976,0.008403,0.487071


In [30]:
df[df['V'] < 0]

Unnamed: 0,V,W,X,Y,Z
B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
C,-0.495905,-0.413705,-0.74714,-1.060777,1.747387


In [31]:
df[df['V'] < 0][['W','X']]

Unnamed: 0,W,X
B,-0.358661,-0.794397
C,-0.413705,-0.74714


In [32]:
df[(df['V'] < 0) & (df['W'] > 0)]

Unnamed: 0,V,W,X,Y,Z


'and' operator cannot be used because it can only handle sing instances of booleans. Same is the case with 'or'.

In [33]:
df[(df['W'] < 0) | (df['X'] > 1)]

Unnamed: 0,V,W,X,Y,Z
A,0.271522,-1.072557,-0.204947,-0.934321,-1.301463
B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
C,-0.495905,-0.413705,-0.74714,-1.060777,1.747387
D,0.103344,-0.867173,0.478603,-1.720202,-0.234376
E,1.23373,-1.500402,-0.375976,0.008403,0.487071


In [34]:
df.reset_index()

Unnamed: 0,index,V,W,X,Y,Z
0,A,0.271522,-1.072557,-0.204947,-0.934321,-1.301463
1,B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
2,C,-0.495905,-0.413705,-0.74714,-1.060777,1.747387
3,D,0.103344,-0.867173,0.478603,-1.720202,-0.234376
4,E,1.23373,-1.500402,-0.375976,0.008403,0.487071


Notice that the old index has been moved to a column.

In [35]:
states = 'CO NY WY OK CH'.split()
df['states'] = states
df

Unnamed: 0,V,W,X,Y,Z,states
A,0.271522,-1.072557,-0.204947,-0.934321,-1.301463,CO
B,-0.361891,-0.358661,-0.794397,0.345406,-0.331921,NY
C,-0.495905,-0.413705,-0.74714,-1.060777,1.747387,WY
D,0.103344,-0.867173,0.478603,-1.720202,-0.234376,OK
E,1.23373,-1.500402,-0.375976,0.008403,0.487071,CH


In [36]:
df.set_index(df['states']).drop('states', axis = 1)

Unnamed: 0_level_0,V,W,X,Y,Z
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CO,0.271522,-1.072557,-0.204947,-0.934321,-1.301463
NY,-0.361891,-0.358661,-0.794397,0.345406,-0.331921
WY,-0.495905,-0.413705,-0.74714,-1.060777,1.747387
OK,0.103344,-0.867173,0.478603,-1.720202,-0.234376
CH,1.23373,-1.500402,-0.375976,0.008403,0.487071


Unlike 'reset_index', 'set_index' does not makes a copy of previous index as column.

## -----------------------------------------------------------------------------------------------------------------------------------
### Multilevel Indexing and Heirarchy

In [76]:
outside = ['G1','G1','G1','G2','G2','G2']
inside  =  [1,2,3,1,2,3]
hier_index  = list(zip(outside, inside))
print(hier_index)
hier_index  = pd.MultiIndex.from_tuples(hier_index)
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]


MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [77]:
ddf = pd.DataFrame(np.random.randn(6,2),hier_index, ['A','B'])
ddf

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.075792,-0.49265
G1,2,-0.819587,0.026369
G1,3,0.110563,0.521474
G2,1,1.235503,1.648629
G2,2,2.281606,-0.636091
G2,3,1.100797,0.416771


In [78]:
ddf.index.names = ['Groups', 'Nums']
ddf

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.075792,-0.49265
G1,2,-0.819587,0.026369
G1,3,0.110563,0.521474
G2,1,1.235503,1.648629
G2,2,2.281606,-0.636091
G2,3,1.100797,0.416771


In [79]:
ddf.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,A,0.075792
G1,1,B,-0.49265
G1,2,A,-0.819587
G1,2,B,0.026369
G1,3,A,0.110563
G1,3,B,0.521474
G2,1,A,1.235503
G2,1,B,1.648629
G2,2,A,2.281606
G2,2,B,-0.636091


In [812]:
ddf.loc['G2'].loc[3]['B']

0.4586614642083646

In [95]:
df = pd.read_csv('bigmac.csv')

In [96]:
df.set_index(['Date','Country'], inplace= True)

In [97]:
df.sort_index(inplace= True)

In [98]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
1/2010,Argentina,1.84
1/2010,Australia,3.98
1/2010,Brazil,4.76
1/2010,Britain,3.67
1/2010,Canada,3.97
1/2010,Chile,3.18
1/2010,China,1.83
1/2010,Colombia,3.91
1/2010,Costa Rica,3.52
1/2010,Czech Republic,3.71


In [99]:
df.index

MultiIndex(levels=[['1/2010', '1/2012', '1/2013', '1/2014', '1/2015', '1/2016', '7/2010', '7/2011', '7/2012', '7/2013', '7/2014', '7/2015'], ['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Britain', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Czech Republic', 'Denmark', 'Egypt', 'Estonia', 'Euro area', 'Finland', 'France', 'Germany', 'Greece', 'Hong Kong', 'Hungary', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Latvia', 'Lithuania', 'Malaysia', 'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Russia', 'Saudi Arabia', 'Singapore', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine', 'United States', 'Uruguay', 'Venezuela', 'Vietnam']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

Above is a Pandas Multiindex Object

In [100]:
df.index.get_level_values(0)

Index(['1/2010', '1/2010', '1/2010', '1/2010', '1/2010', '1/2010', '1/2010',
       '1/2010', '1/2010', '1/2010',
       ...
       '7/2015', '7/2015', '7/2015', '7/2015', '7/2015', '7/2015', '7/2015',
       '7/2015', '7/2015', '7/2015'],
      dtype='object', name='Date', length=652)

In [101]:
df.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

In [102]:
df.index.set_names(['Dated','Location'],inplace= True)

In [103]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Dated,Location,Unnamed: 2_level_1
1/2010,Argentina,1.84


In [104]:
df.sort_index(ascending = [True,False]).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Dated,Location,Unnamed: 2_level_1
1/2010,Uruguay,3.32
1/2010,United States,3.58
1/2010,Ukraine,1.83
1/2010,UAE,2.99
1/2010,Turkey,3.83


In [105]:
df.loc[('1/2010','UAE'), 'Price in US Dollars']

2.99

In [106]:
df.swaplevel()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Location,Dated,Unnamed: 2_level_1
Argentina,1/2010,1.84
Australia,1/2010,3.98
Brazil,1/2010,4.76
Britain,1/2010,3.67
Canada,1/2010,3.97
Chile,1/2010,3.18
China,1/2010,1.83
Colombia,1/2010,3.91
Costa Rica,1/2010,3.52
Czech Republic,1/2010,3.71


In [107]:
df = df.stack().to_frame().head()

In [108]:
df = df.unstack()

In [109]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,Unnamed: 1_level_1,Price in US Dollars
Dated,Location,Unnamed: 2_level_2
1/2010,Argentina,1.84
1/2010,Australia,3.98
1/2010,Brazil,4.76
1/2010,Britain,3.67
1/2010,Canada,3.97


## -----------------------------------------------------------------------------------------------------------------------------------
### Handling Missing Values

In [813]:
daf  = {'A': [1, 2, np.nan], 'B': [4, np.nan, np.nan], 'C': [7,8,9]}
daf = pd.DataFrame(daf)
daf

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8
2,,,9


In [814]:
daf.dropna()

Unnamed: 0,A,B,C
0,1.0,4.0,7


In [815]:
daf.dropna(axis = 1)

Unnamed: 0,C
0,7
1,8
2,9


In [816]:
daf.dropna(thresh = 2)

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8


'thresh' sets the minimum occurence of nan to be dropped

In [817]:
daf.fillna(value = 'FILLED')

Unnamed: 0,A,B,C
0,1,4,7
1,2,FILLED,8
2,FILLED,FILLED,9


In [818]:
daf['A'].fillna(value = (daf['A'].mean()), inplace = True)
daf['B'].fillna(value = (daf['B'].mean()), inplace = True)
daf

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,4.0,8
2,1.5,4.0,9


## -----------------------------------------------------------------------------------------------------------------------------------
### Grouping By

In [819]:
sa = pd.DataFrame({'Sales': [450,120,345,334,232,100],'Person': ['Prashant','Shivam','Shiva','Ankit','Arpit','Abhi']
                  ,'Company':['Microsoft','Microsoft','Google','Google','Apple','Apple']
                   })
sa

Unnamed: 0,Company,Person,Sales
0,Microsoft,Prashant,450
1,Microsoft,Shivam,120
2,Google,Shiva,345
3,Google,Ankit,334
4,Apple,Arpit,232
5,Apple,Abhi,100


In [820]:
byComp = sa.groupby('Company')
byComp

<pandas.core.groupby.DataFrameGroupBy object at 0x00000248BFC5BB38>

In [821]:
byComp.mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Apple,166.0
Google,339.5
Microsoft,285.0


Non Numeric column 'Person' ignored due to non relevance to the mean function.

In [822]:
byComp.median()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Apple,166.0
Google,339.5
Microsoft,285.0


In [823]:
byComp.median().loc['Microsoft']

Sales    285.0
Name: Microsoft, dtype: float64

Summing up all above steps as one:

In [824]:
sa.groupby('Company').median().loc['Microsoft']

Sales    285.0
Name: Microsoft, dtype: float64

In [825]:
sa.groupby('Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,2,2
Google,2,2
Microsoft,2,2


In [826]:
sa.groupby('Company').max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,Arpit,232
Google,Shiva,345
Microsoft,Shivam,450


Gave the individual entry from each company with max values.

In [827]:
sa.groupby('Company').min()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,Abhi,100
Google,Ankit,334
Microsoft,Prashant,120


In [828]:
sa.groupby('Company').describe().transpose()

Unnamed: 0,Company,Apple,Google,Microsoft
Sales,count,2.0,2.0,2.0
Sales,mean,166.0,339.5,285.0
Sales,std,93.338095,7.778175,233.345238
Sales,min,100.0,334.0,120.0
Sales,25%,133.0,336.75,202.5
Sales,50%,166.0,339.5,285.0
Sales,75%,199.0,342.25,367.5
Sales,max,232.0,345.0,450.0


In [829]:
sa.groupby('Company').describe().transpose()['Microsoft']

Sales  count      2.000000
       mean     285.000000
       std      233.345238
       min      120.000000
       25%      202.500000
       50%      285.000000
       75%      367.500000
       max      450.000000
Name: Microsoft, dtype: float64

## -----------------------------------------------------------------------------------------------------------------------------------
### Merging, Joining and Concatenating

In [830]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [831]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 

In [832]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [833]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [834]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [835]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [836]:
pd.concat([df1, df2, df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [837]:
pd.concat([df1, df2, df3], axis = 1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [838]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})    

In [839]:
left

Unnamed: 0,A,B,key
0,A0,B0,K0
1,A1,B1,K1
2,A2,B2,K2
3,A3,B3,K3


In [840]:
right

Unnamed: 0,C,D,key
0,C0,D0,K0
1,C1,D1,K1
2,C2,D2,K2
3,C3,D3,K3


In [841]:
pd.merge(left, right, how = 'inner', on = 'key')

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3


In [842]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

In [843]:
left

Unnamed: 0,A,B,key1,key2
0,A0,B0,K0,K0
1,A1,B1,K0,K1
2,A2,B2,K1,K0
3,A3,B3,K2,K1


In [844]:
right

Unnamed: 0,C,D,key1,key2
0,C0,D0,K0,K0
1,C1,D1,K1,K0
2,C2,D2,K1,K0
3,C3,D3,K2,K0


In [845]:
pd.merge(left, right, on = ['key1', 'key2'])

Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K1,K0,C1,D1
2,A2,B2,K1,K0,C2,D2


In [846]:
pd.merge(left, right, how = 'outer', on = ['key1','key2'])

Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A1,B1,K0,K1,,
2,A2,B2,K1,K0,C1,D1
3,A2,B2,K1,K0,C2,D2
4,A3,B3,K2,K1,,
5,,,K2,K0,C3,D3


In [847]:
pd.merge(left, right, how = 'left', on = ['key1', 'key2'])

Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A1,B1,K0,K1,,
2,A2,B2,K1,K0,C1,D1
3,A2,B2,K1,K0,C2,D2
4,A3,B3,K2,K1,,


In [848]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [849]:
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [850]:
left.join(right, how = 'outer')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3


## ------------------------------------------------------------------------------------------------------------------------------------
### Operations

In [851]:
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [852]:
df['col2'].unique()

array([444, 555, 666], dtype=int64)

In [853]:
df['col2'].nunique() #equivalent to using len(df['col2'].unique())

3

In [854]:
df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

In [855]:
df[(df['col1'] > 1) & (df['col2'] == 444)]

Unnamed: 0,col1,col2,col3
3,4,444,xyz


In [856]:
def times2(n):
    return n*2

In [857]:
df['col2'].apply((lambda n: n*2)) #or df['col2'].apply(times2)

0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

In [858]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [859]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [860]:
df.sort_values('col2', ascending = False)

Unnamed: 0,col1,col2,col3
2,3,666,ghi
1,2,555,def
0,1,444,abc
3,4,444,xyz


In [861]:
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [862]:
data = {'A':['foo','foo','foo','bar','bar','bar'],
     'B':['one','one','two','two','one','one'],
       'C':['x','y','x','y','x','y'],
       'D':[1,3,2,5,4,1]}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [863]:
df.pivot_table(values = 'D', index = ['A','B'], columns = 'C')

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,


In [120]:
df = pd.read_csv('employees.csv')

In [136]:
df.sort_values('First Name').head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2/17/2012,10:20 AM,61602,11.849,True,Marketing
327,Aaron,Male,1/29/1994,6:48 PM,58755,5.097,True,Marketing
440,Aaron,Male,7/22/1990,2:53 PM,52119,11.343,True,Client Services
937,Aaron,,1/22/1986,7:39 PM,63126,18.424,False,Client Services
137,Adam,Male,5/21/2011,1:45 AM,95327,15.12,False,Distribution


In [139]:
df.pivot_table(values = ['Salary'],index = ['First Name'], columns = ['Gender']).head()

Unnamed: 0_level_0,Salary,Salary
Gender,Female,Male
First Name,Unnamed: 1_level_2,Unnamed: 2_level_2
Aaron,,57492.0
Adam,,80494.5
Alan,,76619.5
Albert,,95025.666667
Alice,89484.571429,


## ------------------------------------------------------------------------------------------------------------------------------------
### Data Input and Output
#### in CSV, HTML, SQL, Excel

In [864]:
dff = pd.read_csv('example')
dff

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [865]:
dff.to_csv('ToCSVoutput.csv', index = False)

Make sure you put index as False. Or index will be added as a column.

In [866]:
pd.read_csv('ToCSVoutput.csv')

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [867]:
ddff = pd.read_excel('Excel_Sample.xlsx', sheet_name= 'Sheet1')
ddff

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [868]:
ddff.to_excel('ToXLoutput.xlsx', sheet_name= 'Sheet1')

In [869]:
df = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
df[0].head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Washington Federal Bank for Savings,Chicago,IL,30570,Royal Savings Bank,"December 15, 2017","February 21, 2018"
1,The Farmers and Merchants State Bank of Argonia,Argonia,KS,17719,Conway Bank,"October 13, 2017","February 21, 2018"
2,Fayette County Bank,Saint Elmo,IL,1802,"United Fidelity Bank, fsb","May 26, 2017","July 26, 2017"
3,"Guaranty Bank, (d/b/a BestBank in Georgia & Mi...",Milwaukee,WI,30003,First-Citizens Bank & Trust Company,"May 5, 2017","March 22, 2018"
4,First NBC Bank,New Orleans,LA,58302,Whitney Bank,"April 28, 2017","December 5, 2017"


Note that all the data has been read in a list so we cycle through indexes until find our DataFrame.

In [870]:
from sqlalchemy import create_engine

In [871]:
engine = create_engine('sqlite:///:memory:')

In [872]:
dff.to_sql('mySQLtable', con = engine)

In [873]:
sqldf = pd.read_sql('mySQLtable', con = engine)

In [874]:
sqldf

Unnamed: 0,index,a,b,c,d
0,0,0,1,2,3
1,1,4,5,6,7
2,2,8,9,10,11
3,3,12,13,14,15


### Other Leftout methods

In [875]:
df = pd.DataFrame(np.random.randn(6,4), columns = 'One Two Three Four'.split())

In [876]:
df

Unnamed: 0,One,Two,Three,Four
0,0.243955,-0.942449,0.788263,-1.860005
1,0.494912,0.134304,0.969188,-0.610259
2,0.158402,-0.317292,0.394202,-1.537873
3,0.898798,0.643745,-0.475634,-0.735277
4,0.026187,-0.657128,3.146547,-2.023609
5,1.414945,1.390598,0.322901,0.519558


In [877]:
df.One.dtype

dtype('float64')

In [878]:
df['One'] = df.One.astype('int')

In [879]:
df

Unnamed: 0,One,Two,Three,Four
0,0,-0.942449,0.788263,-1.860005
1,0,0.134304,0.969188,-0.610259
2,0,-0.317292,0.394202,-1.537873
3,0,0.643745,-0.475634,-0.735277
4,0,-0.657128,3.146547,-2.023609
5,1,1.390598,0.322901,0.519558


In [880]:
df['Four'] = [1,2,3,4,5,6]

In [881]:
df

Unnamed: 0,One,Two,Three,Four
0,0,-0.942449,0.788263,1
1,0,0.134304,0.969188,2
2,0,-0.317292,0.394202,3
3,0,0.643745,-0.475634,4
4,0,-0.657128,3.146547,5
5,1,1.390598,0.322901,6


In [882]:
df[df['Four'].between(2,5)]

Unnamed: 0,One,Two,Three,Four
1,0,0.134304,0.969188,2
2,0,-0.317292,0.394202,3
3,0,0.643745,-0.475634,4
4,0,-0.657128,3.146547,5


In [883]:
df.insert(loc = 2, column = 'TwoAndHalf', value = df['Four'])

In [884]:
df

Unnamed: 0,One,Two,TwoAndHalf,Three,Four
0,0,-0.942449,1,0.788263,1
1,0,0.134304,2,0.969188,2
2,0,-0.317292,3,0.394202,3
3,0,0.643745,4,-0.475634,4
4,0,-0.657128,5,3.146547,5
5,1,1.390598,6,0.322901,6


In [885]:
df['TwoAndHalf'].isin(df['Four'])

0    True
1    True
2    True
3    True
4    True
5    True
Name: TwoAndHalf, dtype: bool

In [886]:
df[df['Four'].isin(df['TwoAndHalf'])]

Unnamed: 0,One,Two,TwoAndHalf,Three,Four
0,0,-0.942449,1,0.788263,1
1,0,0.134304,2,0.969188,2
2,0,-0.317292,3,0.394202,3
3,0,0.643745,4,-0.475634,4
4,0,-0.657128,5,3.146547,5
5,1,1.390598,6,0.322901,6


In [887]:
df['Four'] = [1,1,2,2,3,3]

In [888]:
df['Five'] = [1,2,2,3,3,4]

In [889]:
df['Four'].duplicated()

0    False
1     True
2    False
3     True
4    False
5     True
Name: Four, dtype: bool

In [890]:
df[df['Four'].duplicated()]

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
1,0,0.134304,2,0.969188,1,2
3,0,0.643745,4,-0.475634,2,3
5,1,1.390598,6,0.322901,3,4


In [891]:
df[df['Four'].duplicated(keep = 'first')]

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
1,0,0.134304,2,0.969188,1,2
3,0,0.643745,4,-0.475634,2,3
5,1,1.390598,6,0.322901,3,4


In [892]:
df[df['Four'].duplicated(keep = 'last')]

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
0,0,-0.942449,1,0.788263,1,1
2,0,-0.317292,3,0.394202,2,2
4,0,-0.657128,5,3.146547,3,3


In [893]:
df[~df['Four'].duplicated(keep = 'last')]

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
1,0,0.134304,2,0.969188,1,2
3,0,0.643745,4,-0.475634,2,3
5,1,1.390598,6,0.322901,3,4


In [894]:
df.drop_duplicates(subset = 'Four', keep = False)

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five


This does not keep any value that have duplicates

In [895]:
df.drop_duplicates(subset = ['Four','Five'])

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
0,0,-0.942449,1,0.788263,1,1
1,0,0.134304,2,0.969188,1,2
2,0,-0.317292,3,0.394202,2,2
3,0,0.643745,4,-0.475634,2,3
4,0,-0.657128,5,3.146547,3,3
5,1,1.390598,6,0.322901,3,4


This shows the duplicate values in both columns irrelevant of the distinct values at that instance in other column.

In [896]:
df['TwoAndHalf'].rank()

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
Name: TwoAndHalf, dtype: float64

In [897]:
df['Four'].rank(ascending = False)

0    5.5
1    5.5
2    3.5
3    3.5
4    1.5
5    1.5
Name: Four, dtype: float64

In [898]:
df.rename(columns = {'TwoAndHalf': 'TwoAnd1/2'})

Unnamed: 0,One,Two,TwoAnd1/2,Three,Four,Five
0,0,-0.942449,1,0.788263,1,1
1,0,0.134304,2,0.969188,1,2
2,0,-0.317292,3,0.394202,2,2
3,0,0.643745,4,-0.475634,2,3
4,0,-0.657128,5,3.146547,3,3
5,1,1.390598,6,0.322901,3,4


In [899]:
df.sample(n = 3)

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
3,0,0.643745,4,-0.475634,2,3
1,0,0.134304,2,0.969188,1,2
4,0,-0.657128,5,3.146547,3,3


In [900]:
df.sample(frac= 0.5)

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
2,0,-0.317292,3,0.394202,2,2
5,1,1.390598,6,0.322901,3,4
1,0,0.134304,2,0.969188,1,2


In [901]:
df.nlargest(n = 3, columns= 'TwoAndHalf')

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
5,1,1.390598,6,0.322901,3,4
4,0,-0.657128,5,3.146547,3,3
3,0,0.643745,4,-0.475634,2,3


In [902]:
df.nsmallest(n = 3, columns= 'TwoAndHalf')

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
0,0,-0.942449,1,0.788263,1,1
1,0,0.134304,2,0.969188,1,2
2,0,-0.317292,3,0.394202,2,2


In [903]:
df.where(df['TwoAndHalf'] < 4)

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
0,0.0,-0.942449,1.0,0.788263,1.0,1.0
1,0.0,0.134304,2.0,0.969188,1.0,2.0
2,0.0,-0.317292,3.0,0.394202,2.0,2.0
3,,,,,,
4,,,,,,
5,,,,,,


Unlike conditional access whithout **where**, **where** returns all existing columns however with *NaNs*

In [904]:
df.where((df['TwoAndHalf']< 4 )  & (df['Five'] < 3))

Unnamed: 0,One,Two,TwoAndHalf,Three,Four,Five
0,0.0,-0.942449,1.0,0.788263,1.0,1.0
1,0.0,0.134304,2.0,0.969188,1.0,2.0
2,0.0,-0.317292,3.0,0.394202,2.0,2.0
3,,,,,,
4,,,,,,
5,,,,,,
