# Pandas DataFrames
DataFrames are the main tool you will use when working with pandas.


In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn
# Seed NumPy's global random number generator with a fixed value (101) so that
# randn() produces the same sequence of numbers on every run. This lets
# everyone in the class reproduce exactly the same results.
np.random.seed(101)

In [3]:
# Build a 5x4 DataFrame from randn() using positional arguments:
# the data first, then the row labels (index), then the column labels.
df = pd.DataFrame(randn(5,4), ['A','B','C','D','E'],['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [4]:
# Same construction written with explicit keyword arguments, which is easier
# to read. randn() is called again without re-seeding, so the values differ
# from the frame displayed by the previous cell.
df = pd.DataFrame(data=randn(5,4),index=['A','B','C','D','E'],columns=['W','X','Y','Z'])
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


### Here W, X, Y and Z are the columns, and A, B, C, D and E are the rows
### Each of the columns W, X, Y and Z is a pandas Series, and they all share the same index: A, B, C, D and E

## Pandas Indexing

In [5]:
df['W'] # Selecting a single column with bracket notation returns a pandas Series

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [6]:
type(df['W']) # The built-in type() function returns the object's type

pandas.core.series.Series

In [7]:
type(df) # The frame itself is a pandas DataFrame

pandas.core.frame.DataFrame

In [8]:
df.X # Attribute-style (dot) access selects column X as a Series

A    1.693723
B    0.390528
C    0.072960
D   -0.754070
E    1.901755
Name: X, dtype: float64

In [9]:
df.W # Not the preferred way: attribute access is easily confused with the DataFrame's own method names; prefer df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [10]:
df[['W','Z']] # Passing a list of column names returns a DataFrame

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [11]:
df['new'] = df['W'] + df['Y'] # Add a new column holding the element-wise sum of columns W and Y

In [12]:
df # The frame now includes the 'new' column

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


In [15]:
df.drop('new',axis=1) # axis=1 targets columns; drop() returns a new DataFrame and leaves df itself unchanged

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [16]:
df # 'new' is still present: the previous drop() was not applied in place

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


In [17]:
# inplace=True modifies df itself instead of returning a new DataFrame.
# NOTE(review): with drop()'s default errors='raise', re-running this cell
# should raise KeyError because 'new' is already gone; `df = df.drop(...)`
# is the generally recommended alternative.
df.drop('new',axis=1,inplace=True)

In [18]:
df # The 'new' column has now been removed from df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [19]:
df.drop('E',axis=0) # axis=0 targets rows; row E is absent from the returned frame, while df itself is unchanged

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [20]:
df.shape # A (rows, columns) tuple: here 5 rows and 4 columns

(5, 4)

## Selecting the rows

In [21]:
df # Show the full frame for reference before selecting rows

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [22]:
df.loc['A'] # Label-based row selection; the result is a pandas Series

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [23]:
df.iloc[2] # Integer position-based selection: position 2 is the third row (label C)

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [24]:
df.loc['C'] # Same row as df.iloc[2], this time selected by its label

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [25]:
df.loc['B','Y'] # [row, column] notation returns the single value at row B, column Y

0.16690463609281317

In [26]:
df.iloc[0,0] # Position-based [row, column]: first row, first column (row A, column W)

0.3026654485851825

In [27]:
df.loc[['A','B']] # A list of row labels returns a DataFrame containing those rows

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502


In [28]:
df.loc[['A','B'],['W','Y']] # Lists of row and column labels return a sub-DataFrame

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [29]:
df # Full frame again for reference

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [30]:
df.loc[['C','D','E'],['Z']] # A one-element column list keeps the result a DataFrame (rows C-E, column Z)

Unnamed: 0,Z
C,0.329646
D,0.484752
E,1.996652
