# Pandas

* Data analysis library
* Often described as a programmable, more powerful alternative to Excel for working with tabular data

# Topics

* Series
* DataFrame
* Operations
* Missing Data
* Merging and Joining
* File Reading and Writing

In [1]:
import pandas as pd

# Series in Pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.Series([2, 6, 10]) # 1D labelled array: like a 1D numpy array plus an index (default labels 0, 1, 2)

0     2
1     6
2    10
dtype: int64

In [4]:
pd.Series([2, 6, 10], index = ['a', 'b', 'c']) # Custom index labels replace the default 0, 1, 2

a     2
b     6
c    10
dtype: int64

In [5]:
# Keep a Series with custom labels in a variable so it can be indexed by label below.
s1 = pd.Series(data=[2, 6, 10], index=list('abc'))

In [6]:
s1['a']

2

In [7]:
s1[['a','c']]

a     2
c    10
dtype: int64

In [8]:
pd.Series({'a1' : 5, 'b1' : 7, 'c1' : 4}) # Build a Series from a dict: keys become the index labels, values the data

a1    5
b1    7
c1    4
dtype: int64

In [9]:
pd.Series({'a1' : 'Hello', 'b1' : 7, 'c1' : True}) # Mixed value types (str, int, bool) are stored with dtype object

a1    Hello
b1        7
c1     True
dtype: object

In [10]:
# Two Series whose labels only partly overlap: 'b' and 'c' appear in both.
s1 = pd.Series({'a': 1, 'b': 3, 'c': 5})
s2 = pd.Series({'b': 2, 'c': 7, 'd': 9})

In [11]:
s1

a    1
b    3
c    5
dtype: int64

In [12]:
s2

b    2
c    7
d    9
dtype: int64

In [13]:
# Arithmetic aligns on index labels, not positions: shared labels (b, c) are multiplied,
# labels present in only one Series (a, d) become NaN, which makes the result float64.
s1 * s2

a     NaN
b     6.0
c    35.0
d     NaN
dtype: float64

In [14]:
# Subtraction follows the same label alignment: only b and c have a value in both Series.
s1 - s2

a    NaN
b    1.0
c   -2.0
d    NaN
dtype: float64

# DataFrame

#### Can be thought of as a labelled 2D array: both rows and columns have labels

In [15]:
# Build a 3x4 frame with row labels R1..R3 and column labels C1..C4.
df = pd.DataFrame(
    [
        [2, 4, 6, -1],
        [3, -4, 2, -1],
        [-10, 3, 5, -7],
    ],
    index=['R1', 'R2', 'R3'],
    columns=['C1', 'C2', 'C3', 'C4'],
)

In [16]:
df

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [17]:
df.head(2) #First few rows

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1


In [18]:
df['newC'] = 7 # Assigning a scalar to a new column label creates the column and fills every row with that value

In [19]:
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,7
R3,-10,3,5,-7,7


In [20]:
# Overwrite newC with values computed element-wise from C1.
df['newC'] = df['C1'] + 5

In [21]:
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [22]:
df.drop('newC', axis=1) # Drop a column; returns a new DataFrame and leaves df itself unchanged
# axis=0 --> rows, axis=1 --> columns

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [23]:
df.drop('R1', axis=0) # Drop a row by its index label; df itself is not modified

Unnamed: 0,C1,C2,C3,C4,newC
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [24]:
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [25]:
df.drop('newC', axis=1, inplace=True) # inplace=True modifies df itself instead of returning a new DataFrame

In [26]:
df

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [27]:
df.drop('R1', axis=0, inplace=True) # Removes row R1 from df itself; later cells only see R2 and R3

In [28]:
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


## Select

In [29]:
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


In [30]:
df['C2'] # A single column label returns that column as a Series

R2   -4
R3    3
Name: C2, dtype: int64

In [31]:
df[['C2', 'C4']] # A list of column labels returns a DataFrame with just those columns

Unnamed: 0,C2,C4
R2,-4,-1
R3,3,-7


`.loc[row, column]` -- select by label <br/>
`.iloc[row, column]` -- select by integer position (0-based)

In [32]:
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


In [33]:
df.loc['R2', :]

C1    3
C2   -4
C3    2
C4   -1
Name: R2, dtype: int64

In [34]:
df.loc['R2', ['C2', 'C3']]

C2   -4
C3    2
Name: R2, dtype: int64

In [35]:
df.iloc[1,:] # iloc selects by 0-based integer position: position 1 is the second row, R3

C1   -10
C2     3
C3     5
C4    -7
Name: R3, dtype: int64

In [36]:
df.iloc[1, 1:3] # Row at position 1, columns at positions 1 and 2 (the slice end, 3, is excluded)

C2    3
C3    5
Name: R3, dtype: int64

# Index, Multiindex

## Single index

In [38]:
# Fresh 3x4 frame for the indexing examples, written column by column.
df = pd.DataFrame(
    {
        'C1': [2, 3, -10],
        'C2': [4, -4, 2],
        'C3': [6, 2, 5],
        'C4': [-1, -1, -7],
    },
    index=['R1', 'R2', 'R3'],
)

In [39]:
df

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,2,5,-7


In [40]:
df.index #Displays indexes

Index(['R1', 'R2', 'R3'], dtype='object')

In [42]:
# Moves the row labels into a regular column named 'index' and numbers the rows 0, 1, 2.
# The result is a new DataFrame; df keeps its R1..R3 index.
df.reset_index()

Unnamed: 0,index,C1,C2,C3,C4
0,R1,2,4,6,-1
1,R2,3,-4,2,-1
2,R3,-10,2,5,-7


In [43]:
# Make column 'C3' the row index. set_index returns a new DataFrame, so the result is
# bound back to df instead of mutating the frame with inplace=True.
df = df.set_index('C3')

In [44]:
df

Unnamed: 0_level_0,C1,C2,C4
C3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2,4,-1
2,3,-4,-1
5,-10,2,-7


In [45]:
df['C2']

C3
6    4
2   -4
5    2
Name: C2, dtype: int64

In [46]:
df

Unnamed: 0_level_0,C1,C2,C4
C3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2,4,-1
2,3,-4,-1
5,-10,2,-7


In [47]:
# After set_index('C3') the row labels are the C3 values (6, 2, 5), so .loc[2] means
# "the row whose label is 2", not the row at position 2.
df.loc[2, :]

C1    3
C2   -4
C4   -1
Name: 2, dtype: int64

In [48]:
df.loc[5,:]

C1   -10
C2     2
C4    -7
Name: 5, dtype: int64

In [49]:
df.index

Int64Index([6, 2, 5], dtype='int64', name='C3')

## Multiindex

### Uses two or more columns together as a hierarchical (multi-level) row index

In [52]:
# Build the score table from a plain list of rows. Wrapping the rows in np.array would
# coerce every value to a string (a NumPy array holds a single dtype), turning MaxScore
# into text. A list of lists lets pandas infer a dtype per column, so MaxScore stays numeric.
df = pd.DataFrame(data=[['Math', 'C1', 's1', 93],
                        ['Science', 'C1', 's2', 78],
                        ['English', 'C1', 's3', 86],
                        ['Math', 'C2', 's4', 58],
                        ['English', 'C2', 's5', 71],
                        ['Science', 'C2', 's6', 69]],
                  columns=['Subject', 'Class', 'Student', 'MaxScore'])

In [53]:
df

Unnamed: 0,Subject,Class,Student,MaxScore
0,Math,C1,s1,93
1,Science,C1,s2,78
2,English,C1,s3,86
3,Math,C2,s4,58
4,English,C2,s5,71
5,Science,C2,s6,69


In [54]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [56]:
# Promote 'Class' and 'Subject' to a two-level (hierarchical) row index.
# set_index returns a new DataFrame, so it is bound back to df instead of using inplace=True.
df = df.set_index(['Class', 'Subject'])

In [57]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Student,MaxScore
Class,Subject,Unnamed: 2_level_1,Unnamed: 3_level_1
C1,Math,s1,93
C1,Science,s2,78
C1,English,s3,86
C2,Math,s4,58
C2,English,s5,71
C2,Science,s6,69


In [58]:
df.index

MultiIndex(levels=[['C1', 'C2'], ['English', 'Math', 'Science']],
           labels=[[0, 0, 0, 1, 1, 1], [1, 2, 0, 1, 0, 2]],
           names=['Class', 'Subject'])

In [59]:
# Selecting on the outer level only ('C1') returns the matching rows as a DataFrame
# indexed by the remaining level (Subject).
df.loc['C1']

Unnamed: 0_level_0,Student,MaxScore
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1
Math,s1,93
Science,s2,78
English,s3,86


In [60]:
# Chained .loc: the first call selects the 'C2' block (indexed by Subject), the second picks 'English'.
# pandas also accepts a single tuple key, df.loc[('C2', 'English')], to reach the same row in one lookup.
df.loc['C2'].loc['English']

Student     s5
MaxScore    71
Name: English, dtype: object