## Data Frame

In [2]:
import numpy as np
import pandas as pd

from numpy.random import randn

In [3]:
# set seed
np.random.seed(101)

In [7]:
randn(5,4)

array([[ 0.38603031,  2.08401853, -0.37651868,  0.23033634],
       [ 0.68120929,  1.03512507, -0.03116048,  1.93993231],
       [-1.00518692, -0.7417897 ,  0.18712452, -0.73284515],
       [-1.3829201 ,  1.4824955 ,  0.96145816, -2.14121229],
       [ 0.99257345,  1.19224064, -1.04677954,  1.29276458]])

In [12]:
df = pd.DataFrame(data = randn(5,4), 
                  index = ['A', 'B', 'C', 'D', 'E'], 
                  columns = ['W', 'X', 'Y', 'Z'])
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


* Each column is a pandas series
* A bunch of series that share index

In [15]:
type(df)

pandas.core.frame.DataFrame

### Selection/Indexing

**Selecting Columns**

In [13]:
# column, using bracket notation (better way)
df['W']

A    0.093628
B   -0.380104
C    0.178009
D    1.130018
E   -0.125381
Name: W, dtype: float64

In [17]:
type(df['W'])

pandas.core.series.Series

In [20]:
# observe
df[['W']]

Unnamed: 0,W
A,0.093628
B,-0.380104
C,0.178009
D,1.130018
E,-0.125381


In [21]:
type(df[['W']]) # double brackets - return data frame

pandas.core.frame.DataFrame

In [16]:
# column, dot notation, it might be confusing with other dot notations
df.W

A    0.093628
B   -0.380104
C    0.178009
D    1.130018
E   -0.125381
Name: W, dtype: float64

In [22]:
# multiple columns - list of columns
df[['W', 'Z']]

Unnamed: 0,W,Z
A,0.093628,-1.908009
B,-0.380104,1.522562
C,0.178009,1.743477
D,1.130018,-1.063049
E,-0.125381,-1.046358


**Creating a new column**

In [23]:
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,0.093628,1.240813,-1.097693,-1.908009,-1.004065
B,-0.380104,-1.666059,-2.736995,1.522562,-3.117098
C,0.178009,-0.626805,-0.391089,1.743477,-0.21308
D,1.130018,0.897796,0.330866,-1.063049,1.460884
E,-0.125381,-0.945588,2.029544,-1.046358,1.904163


**Removing column(s)**

In [25]:
df.drop('new', axis=1) # specify axis = 1 for columns
# it does not modify the original df, it returns a new one

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


In [26]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.093628,1.240813,-1.097693,-1.908009,-1.004065
B,-0.380104,-1.666059,-2.736995,1.522562,-3.117098
C,0.178009,-0.626805,-0.391089,1.743477,-0.21308
D,1.130018,0.897796,0.330866,-1.063049,1.460884
E,-0.125381,-0.945588,2.029544,-1.046358,1.904163


In [28]:
# to drop from the original data frame, (set inplace = True)
df.drop("new", axis = 1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


*inplace = False* is integrated to prevent information loss

**Removing rows**

In [29]:
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [30]:
df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049
E,-0.125381,-0.945588,2.029544,-1.046358


**Selecting Rows**

* df.loc - takes label 
* df.iloc - takes index

In [32]:
df.loc['A'] # notice rows are series as well

W    0.093628
X    1.240813
Y   -1.097693
Z   -1.908009
Name: A, dtype: float64

In [34]:
df.iloc[0]

W    0.093628
X    1.240813
Y   -1.097693
Z   -1.908009
Name: A, dtype: float64

In [35]:
df.loc[['A','B']] # use list to subset multiple rows, gives a df

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562


In [38]:
df.iloc[0:2] # with iloc, it is similar to indexing list

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562


In [39]:
df.iloc[[2,4]]

Unnamed: 0,W,X,Y,Z
C,0.178009,-0.626805,-0.391089,1.743477
E,-0.125381,-0.945588,2.029544,-1.046358


**Subsetting rows AND columns**
Very similar to selecting rows:
* df.loc
* df.iloc

In [40]:
df.loc['B', 'Y']

-2.7369945956467303

In [42]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,0.093628,-1.097693
B,-0.380104,-2.736995
