In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from numpy.random import randn

In [4]:
# Seed NumPy's legacy global RNG so the randn() draws below are repeatable.
np.random.seed(seed=101)

# DataFrames

In [5]:
# pd.DataFrame(data, index, columns, ....)

In [6]:
# 5x4 frame of standard-normal samples; with no index/columns supplied,
# both axes get default integer labels.
df1 = pd.DataFrame(data=randn(5, 4))
df1

Unnamed: 0,0,1,2,3
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [7]:
# Same shape of random data, but with explicit row labels A..E;
# columns keep their default integer labels.
row_labels = ['A', 'B', 'C', 'D', 'E']
df2 = pd.DataFrame(randn(5, 4), index=row_labels)
df2

Unnamed: 0,0,1,2,3
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [8]:
# Random 5x4 frame with both row labels (A..E) and column labels (W..Z).
df = pd.DataFrame(
    randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


# A DataFrame is just a bunch of Series sharing a common index.

In [9]:
# Here is the proof: selecting one column with [] returns it as a Series named after that column
df['W']

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: W, dtype: float64

In [10]:
# A single selected column is a pandas Series
type(df['W'])

pandas.core.series.Series

In [11]:
# The full table itself is a pandas DataFrame
type(df)

pandas.core.frame.DataFrame

# Selecting Columns

In [12]:
# Show the full frame as a reference for the column selections below
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [13]:
# Accessing a single column: passing one column label returns a Series.
df['W']

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: W, dtype: float64

In [14]:
# Multiple columns: for more than one column, pass a list of column labels; the result is a DataFrame.
df[['X', 'Z']]

Unnamed: 0,X,Z
A,0.1968,0.000366
B,-0.156598,0.649826
C,-0.610259,-0.346419
D,-0.479448,1.02481
E,1.862864,0.610478


## Creating New Columns

In [15]:
# Assigning to a column label that does not exist yet creates that column (one random value per row)
df['new'] = randn(5)

In [16]:
# Display df, which now includes the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,0.38603
B,1.025984,-0.156598,-0.031579,0.649826,2.084019
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519
D,0.147027,-0.479448,0.558769,1.02481,0.230336
E,-0.925874,1.862864,-1.133817,0.610478,0.681209


In [17]:
# Build column 'pp' as the element-wise sum of columns 'W' and 'X'
df['pp'] = df['W'] + df['X']

In [18]:
# Display df, which now includes the 'pp' column
df

Unnamed: 0,W,X,Y,Z,new,pp
A,-0.993263,0.1968,-1.136645,0.000366,0.38603,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,2.084019,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519,1.544588
D,0.147027,-0.479448,0.558769,1.02481,0.230336,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.681209,0.93699


## drop() any row or column

In [19]:
# Drop a row: with no axis given, drop() treats 'E' as a row label (default axis=0)
df.drop('E')

Unnamed: 0,W,X,Y,Z,new,pp
A,-0.993263,0.1968,-1.136645,0.000366,0.38603,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,2.084019,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519,1.544588
D,0.147027,-0.479448,0.558769,1.02481,0.230336,-0.332421


In [20]:
df.drop('C', axis=0)
# axis=0 refers to rows.
# Row 'E' is still present because the earlier drop('E') was not permanent: it did not modify df.

Unnamed: 0,W,X,Y,Z,new,pp
A,-0.993263,0.1968,-1.136645,0.000366,0.38603,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,2.084019,0.869386
D,0.147027,-0.479448,0.558769,1.02481,0.230336,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.681209,0.93699


In [21]:
# Drop any column
# df.drop('new')
# KeyError: "['new'] not found in axis" -- pandas looks for a row labelled 'new', because the default is axis=0

In [22]:
# Drop the 'new' column (axis=1 means columns). The result is a new frame; df is not modified.
# axis is passed by keyword: the positional form df.drop('new', 1) was deprecated
# in pandas 1.3 and raises TypeError in pandas 2.0+.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z,pp
A,-0.993263,0.1968,-1.136645,0.000366,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,1.544588
D,0.147027,-0.479448,0.558769,1.02481,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.93699


#### But the changes are not permanent, so nothing is dropped from the DataFrame

In [23]:
# df is unchanged: the 'new' column and row 'E' are still there
df

Unnamed: 0,W,X,Y,Z,new,pp
A,-0.993263,0.1968,-1.136645,0.000366,0.38603,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,2.084019,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519,1.544588
D,0.147027,-0.479448,0.558769,1.02481,0.230336,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.681209,0.93699


#### For permanent changes

In [24]:
# For a permanent deletion, set inplace=True (the default is inplace=False).
# axis is passed by keyword: pandas 2.0+ rejects the positional form
# df.drop('pp', 1, inplace=True) with a TypeError.
df.drop('pp', axis=1, inplace=True)

In [25]:
# The 'pp' column has now been removed from df itself
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,0.38603
B,1.025984,-0.156598,-0.031579,0.649826,2.084019
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519
D,0.147027,-0.479448,0.558769,1.02481,0.230336
E,-0.925874,1.862864,-1.133817,0.610478,0.681209


# Selecting Rows

In [26]:
# df['A']
# KeyError: 'A', because it will try to access a column named 'A'

## loc[ ]

In [27]:
# Access a row by its label
df.loc['A']

W     -0.993263
X      0.196800
Y     -1.136645
Z      0.000366
new    0.386030
Name: A, dtype: float64

In [34]:
# Label-based lookup of row 'C'
df.loc['C']

W      2.154846
X     -0.610259
Y     -0.755325
Z     -0.346419
new   -0.376519
Name: C, dtype: float64

In [36]:
# A list of row labels returns those rows as a DataFrame
df.loc[['A', 'C']]

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,0.38603
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519


## iloc[ ]

In [37]:
# Access a row by its integer position, whether or not it has a label
df.iloc[2]   # Same as df.loc['C']

W      2.154846
X     -0.610259
Y     -0.755325
Z     -0.346419
new   -0.376519
Name: C, dtype: float64

In [39]:
# A list of integer positions returns those rows as a DataFrame
df.iloc[[2,3]]

Unnamed: 0,W,X,Y,Z,new
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519
D,0.147027,-0.479448,0.558769,1.02481,0.230336


# Selecting Subsets of DataFrame

In [30]:
# Show the full frame as a reference for the cell and subset selections below
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,0.38603
B,1.025984,-0.156598,-0.031579,0.649826,2.084019
C,2.154846,-0.610259,-0.755325,-0.346419,-0.376519
D,0.147027,-0.479448,0.558769,1.02481,0.230336
E,-0.925874,1.862864,-1.133817,0.610478,0.681209


In [31]:
# Single cell
df['Z']['D']          # Not recommended: chained indexing (column lookup, then row lookup)

1.0248102783372157

In [32]:
df.loc['D', 'Z']      # Recommended: one .loc call with (row label, column label)

1.0248102783372157

### Subset

In [33]:
# Subset: rows 'A' and 'C' restricted to columns 'X' and 'Y'
df.loc[['A', 'C'],['X', 'Y']]

Unnamed: 0,X,Y
A,0.1968,-1.136645
C,-0.610259,-0.755325
