Pandas is for manipulating table-like data structures

In [1]:
import pandas as pd 

The two fundamental data structures in pandas are the Series and the DataFrame

In [3]:
x = pd.Series([20,40,60,80])
x

0    20
1    40
2    60
3    80
dtype: int64

Creating a series with custom indexing

In [4]:
x = pd.Series([20,40,60,80], index = ['a','b','c','d'])

In [5]:
x

a    20
b    40
c    60
d    80
dtype: int64

In [6]:
x.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
x.values

array([20, 40, 60, 80])

In [8]:
x['b']

40

In [10]:
x[['b', 'c']]

b    40
c    60
dtype: int64

Creating a series from a python dictionary

In [11]:
x = pd.Series({'a':20, 'b': 40})

In [12]:
x

a    20
b    40
dtype: int64

Selecting a subset of a series

In [14]:
x2 = x[x > 20]
x2

b    40
dtype: int64

In [15]:
x.index.name = 'XX'

In [16]:
x

XX
a    20
b    40
dtype: int64

Create a dataframe from a dictionary 

In [17]:
data = {'x': [20,20,25], 'y': [60,43,65]}

In [19]:
x = pd.DataFrame(data)
x

Unnamed: 0,x,y
0,20,60
1,20,43
2,25,65


In [20]:
#Specify column order

x = pd.DataFrame(data, columns = [ 'y', 'x'])
x

Unnamed: 0,y,x
0,60,20
1,43,20
2,65,25


In [21]:
#Like a Series, a DataFrame can have custom indices

x = pd.DataFrame(data, columns = [ 'y', 'x'], index = ['a', 'b', 'c'])
x

Unnamed: 0,y,x
a,60,20
b,43,20
c,65,25


In [22]:
#Inspecting the data frame

#Display the columns
x.columns

Index(['y', 'x'], dtype='object')

In [24]:
#Display row b, displayed vertically
x.loc['b']

y    43
x    20
Name: b, dtype: int64

In [29]:
#Retrieving a column
x['y']

a    60
b    43
c    65
Name: y, dtype: int64

In [30]:
#Creating a dataframe from a list of lists (each inner list becomes a row)

data = [[10,20,20,40], [50,60,70,80]]
x = pd.DataFrame(data)
x

Unnamed: 0,0,1,2,3
0,10,20,20,40
1,50,60,70,80


In [31]:
#Transpose the data frame

x2 = x.T
x2

Unnamed: 0,0,1
0,10,50
1,20,60
2,20,70
3,40,80


In [32]:
import numpy as np

In [33]:
#Generate a series of 5 random numbers (no seed is set here, so the values differ on each run)
s1 = pd.Series(np.random.rand(5), index = ['a', 'b', 'c', 'd', 'e'])

In [34]:
s1

a    0.078087
b    0.337330
c    0.175195
d    0.030983
e    0.813578
dtype: float64

In [35]:
#Generate a dataframe with random numbers with certain columns
df = pd.DataFrame(np.random.rand(4,3), columns = ['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,0.272108,0.79492,0.171517
1,0.739006,0.621364,0.622009
2,0.479861,0.811355,0.834069
3,0.874392,0.09313,0.372435


In [36]:
s2 = s1.drop('c')
s2

a    0.078087
b    0.337330
d    0.030983
e    0.813578
dtype: float64

In [40]:
#Drop rows 0 and 1
df.drop([0,1])

Unnamed: 0,a,b,c
2,0.479861,0.811355,0.834069
3,0.874392,0.09313,0.372435


In [41]:
#Drop multiple columns
df.drop(['a', 'b'], axis = 1)

Unnamed: 0,c
0,0.171517
1,0.622009
2,0.834069
3,0.372435


In [43]:
#drop returns a new object and leaves df unchanged, so capture the result in a new variable
df1 = df.drop(['a', 'b'], axis = 1)
df1

Unnamed: 0,c
0,0.171517
1,0.622009
2,0.834069
3,0.372435


In [44]:
s1

a    0.078087
b    0.337330
c    0.175195
d    0.030983
e    0.813578
dtype: float64

In [46]:
#Select certain elements from the Series s1 above
s1[['b', 'c']]

b    0.337330
c    0.175195
dtype: float64

In [47]:
#Select a range by label (the end label 'c' is included)
s1['a' : 'c']

a    0.078087
b    0.337330
c    0.175195
dtype: float64

In [49]:
#Select rows labeled 0 through 2 (label slices with .loc include the end label) and keep all columns
df.loc[0:2]

Unnamed: 0,a,b,c
0,0.272108,0.79492,0.171517
1,0.739006,0.621364,0.622009
2,0.479861,0.811355,0.834069


In [53]:
df.describe()

Unnamed: 0,a,b,c
count,4.0,4.0,4.0
mean,0.591342,0.580192,0.500007
std,0.26849,0.335891,0.289057
min,0.272108,0.09313,0.171517
25%,0.427923,0.489305,0.322205
50%,0.609433,0.708142,0.497222
75%,0.772852,0.799029,0.675024
max,0.874392,0.811355,0.834069


In [54]:
s1.sum()

1.4351732149719607

In [55]:
#Column-wise sum (adds up the rows of each column)
df.sum()

a    2.365367
b    2.320769
c    2.000029
dtype: float64

In [56]:
#Row-wise sum (adds up the columns of each row)
df.sum(axis = 1)

0    1.238545
1    1.982378
2    2.125285
3    1.339957
dtype: float64

In [57]:
#Column-wise max
df.max()

a    0.874392
b    0.811355
c    0.834069
dtype: float64

#Handling Missing Data

In [5]:
import pandas as pd
import numpy as np
from numpy import nan as NA

In [6]:
data = pd.Series([23, NA, 23, NA, 87, 34, NA])
data

0    23.0
1     NaN
2    23.0
3     NaN
4    87.0
5    34.0
6     NaN
dtype: float64

In [7]:
data.dropna()

0    23.0
2    23.0
4    87.0
5    34.0
dtype: float64

In [8]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
5     True
6    False
dtype: bool

In [9]:
data = pd.DataFrame([[2,5,3,6], [4, NA, 2, -2], [NA, NA, NA, NA], [4,NA,1, 4]])
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,,2.0,-2.0
2,,,,
3,4.0,,1.0,4.0


In [10]:
#Default is that it drops all rows with even one NA
data.dropna()

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0


In [11]:
#Drop only the rows with all NA's
data.dropna( how = 'all')

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,,2.0,-2.0
3,4.0,,1.0,4.0


In [13]:
#Set every value in column 3 to NA
data[3] = NA
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,
1,4.0,,2.0,
2,,,,
3,4.0,,1.0,


In [14]:
#Drop the columns with all NA's
data.dropna(how = 'all', axis=1)

Unnamed: 0,0,1,2
0,2.0,5.0,3.0
1,4.0,,2.0
2,,,
3,4.0,,1.0


In [15]:
#Rows with at least 2 non-NA values
data.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,
1,4.0,,2.0,
3,4.0,,1.0,


In [17]:
#Fill a value for all NA's
data.fillna(-1)

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,-1.0
1,4.0,-1.0,2.0,-1.0
2,-1.0,-1.0,-1.0,-1.0
3,4.0,-1.0,1.0,-1.0


In [19]:
#Fill NA's with a different value per column; the dict keys are column labels.
#Only columns 0-3 exist in this frame, so only those labels are listed.
data.fillna({0: -10, 1: -20, 2: -30, 3: -40})

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,-40.0
1,4.0,-20.0,2.0,-40.0
2,-10.0,-20.0,-30.0,-40.0
3,4.0,-20.0,1.0,-40.0


#Transforming Data using a function

In [20]:
df = pd.DataFrame({'food': ['bacon', 'ham', 'beef'], 'weight': [2,4,3]})
df

Unnamed: 0,food,weight
0,bacon,2
1,ham,4
2,beef,3


In [21]:
def myfunc(x):
    """Describe a value relative to the threshold 2.

    Returns the string 'more than 2' when ``x > 2`` is true,
    and '<= 2' in every other case.
    """
    return 'more than 2' if x > 2 else '<= 2'
    
df['weight'].map(myfunc)

0           <= 2
1    more than 2
2    more than 2
Name: weight, dtype: object

In [22]:
df['new'] = df['weight'].map(myfunc)
df

Unnamed: 0,food,weight,new
0,bacon,2,<= 2
1,ham,4,more than 2
2,beef,3,more than 2
