# Pandas

* Data analysis library
* Can be thought of as an advanced, programmable version of Excel with many more features

# Topics

* Series
* DataFrame
* Operations
* Missing Data
* Merging and Joining
* File Reading and Writing

In [2]:
import pandas as pd

# Series in Pandas

In [3]:
import numpy as np
import pandas as pd

In [5]:
# A Series is a 1D labeled array (like a 1D NumPy array plus an index);
# with no index given, the labels default to 0, 1, 2, ...
pd.Series([2, 6, 10])

0     2
1     6
2    10
dtype: int64

In [9]:
# Custom index: the labels 'a', 'b', 'c' replace the default 0, 1, 2
pd.Series([2, 6, 10], index = ['a', 'b', 'c'])

a     2
b     6
c    10
dtype: int64

In [10]:
# Keep the custom-indexed Series in a variable so it can be queried by label
s1 = pd.Series(data=[2, 6, 10], index=['a', 'b', 'c'])

In [11]:
# Look up a single value by its index label
s1['a']

2

In [13]:
# A list of labels selects several entries and returns a Series
s1[['a','c']]

a     2
c    10
dtype: int64

In [14]:
# Build a Series from a dict: keys become the index, values become the data
pd.Series({'a1' : 5, 'b1' : 7, 'c1' : 4})

a1    5
b1    7
c1    4
dtype: int64

In [16]:
# Values of mixed types (str, int, bool) are stored with dtype object
pd.Series({'a1' : 'Hello', 'b1' : 7, 'c1' : True})

a1    Hello
b1        7
c1     True
dtype: object

In [17]:
# Two Series whose labels only partly overlap: 'b' and 'c' are shared,
# 'a' exists only in s1 and 'd' only in s2
s1 = pd.Series(data=[1, 3, 5], index=['a', 'b', 'c'])
s2 = pd.Series(data=[2, 7, 9], index=['b', 'c', 'd'])

In [18]:
# Display s1 (labels a, b, c)
s1

a    1
b    3
c    5
dtype: int64

In [19]:
# Display s2 (labels b, c, d)
s2

b    2
c    7
d    9
dtype: int64

In [22]:
# Element-wise multiplication aligned on index labels: labels present in only
# one Series ('a', 'd') produce NaN, which is why the result is float64
s1 * s2

a     NaN
b     6.0
c    35.0
d     NaN
dtype: float64

In [23]:
# Subtraction aligns on labels the same way; unmatched labels yield NaN
s1 - s2

a    NaN
b    1.0
c   -2.0
d    NaN
dtype: float64

# DataFrame

A DataFrame can be thought of as a 2D array with labeled rows and columns.

In [24]:
# Build a 3x4 DataFrame: one inner list per row, with explicit row labels
# (index) and column labels (columns)
df = pd.DataFrame(
    data=[
        [2, 4, 6, -1],
        [3, -4, 2, -1],
        [-10, 3, 5, -7],
    ],
    index=['R1', 'R2', 'R3'],
    columns=['C1', 'C2', 'C3', 'C4'],
)

In [25]:
# Display the full DataFrame
df

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [28]:
# head(n) returns the first n rows; here the first 2
df.head(2)

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1


In [31]:
# Assigning a scalar to a new column name creates the column and fills
# every row with that value
df['newC'] = 7

In [32]:
# newC now appears as the last column
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,7
R3,-10,3,5,-7,7


In [35]:
# Overwrite newC with a value computed from another column: C1 + 5, row by row
df['newC'] = df['C1'] + 5

In [36]:
# newC now holds C1 + 5 for each row
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [38]:
# Delete a column. axis=0 --> rows, axis=1 --> columns.
# drop returns a new DataFrame; df itself is not modified here.
df.drop('newC', axis=1)

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [39]:
# axis=0 drops a row by its label; the result is again a new DataFrame
df.drop('R1', axis=0)

Unnamed: 0,C1,C2,C3,C4,newC
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [40]:
# df is unchanged: the drop calls above returned new DataFrames
df

Unnamed: 0,C1,C2,C3,C4,newC
R1,2,4,6,-1,7
R2,3,-4,2,-1,8
R3,-10,3,5,-7,-5


In [41]:
# inplace=True modifies the existing DataFrame instead of returning a new one.
# NOTE: re-running this cell fails once newC is gone; reassigning
# (df = df.drop('newC', axis=1)) is the commonly preferred alternative.
df.drop('newC', axis=1, inplace=True)

In [42]:
# newC has been removed from df itself
df

Unnamed: 0,C1,C2,C3,C4
R1,2,4,6,-1
R2,3,-4,2,-1
R3,-10,3,5,-7


In [43]:
# Remove row R1 from df itself (axis=0 --> rows, inplace=True --> modify df)
df.drop('R1', axis=0, inplace=True)

In [44]:
# Only rows R2 and R3 remain
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


## Select

In [45]:
# Current state of df, used by the selection examples below
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


In [46]:
# A single column label returns that column as a Series
df['C2']

R2   -4
R3    3
Name: C2, dtype: int64

In [47]:
# A list of column labels returns a DataFrame with just those columns
df[['C2', 'C4']]

Unnamed: 0,C2,C4
R2,-4,-1
R3,3,-7


* `.loc[row, column]` -- select by row/column label
* `.iloc[row, column]` -- select by integer position

In [48]:
# Current state of df, for reference in the .loc / .iloc examples
df

Unnamed: 0,C1,C2,C3,C4
R2,3,-4,2,-1
R3,-10,3,5,-7


In [50]:
# Row labeled 'R2', all columns (':' selects everything); the result is a Series
df.loc['R2', :]

C1    3
C2   -4
C3    2
C4   -1
Name: R2, dtype: int64

In [51]:
# Row labeled 'R2', restricted to columns C2 and C3
df.loc['R2', ['C2', 'C3']]

C2   -4
C3    2
Name: R2, dtype: int64

In [53]:
# iloc selects by integer position: row 1 is the second row, which is R3 here
df.iloc[1,:]

C1   -10
C2     3
C3     5
C4    -7
Name: R3, dtype: int64

In [54]:
# Positional slice 1:3 excludes the end, so this picks the columns at
# positions 1 and 2 (C2 and C3) from the row at position 1 (R3)
df.iloc[1, 1:3]

C2    3
C3    5
Name: R3, dtype: int64