# DataFrames


In [30]:
import pandas as pd
import numpy as np

In [31]:
from numpy.random import randn

# Fix the seed of NumPy's global random generator so that the random numbers
# drawn in the following cells are the same on every fresh run.
np.random.seed(100)

In [None]:
# Draw a 4x3 array of samples from the standard normal distribution.
# NOTE: this call consumes 12 values from the seeded global generator, so the
# DataFrame built further down is filled with the *next* 12 draws.
randn(4, 3)

array([[-1.74976547,  0.3426804 ,  1.1530358 ],
       [-0.25243604,  0.98132079,  0.51421884],
       [ 0.22117967, -1.07004333, -0.18949583],
       [ 0.25500144, -0.45802699,  0.43516349]])

In [33]:
# 4x3 frame of standard-normal draws with row labels A-D and column labels X-Z
df = pd.DataFrame(
    data=randn(4, 3),
    index=list("ABCD"),
    columns=list("XYZ"),
)

In [34]:
# display the frame: rows A-D, columns X-Z
df

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


## Selection and Indexing


In [35]:
df["X"]  # a single column label in single brackets returns a Series

A   -0.583595
B   -0.104411
C   -0.438136
D    1.541605
Name: X, dtype: float64

In [36]:
df[["X"]]  # a list of labels (even a one-element list) returns a DataFrame

Unnamed: 0,X
A,-0.583595
B,-0.104411
C,-0.438136
D,1.541605


In [37]:
# Pass a list of column names to select several columns at once
df[["Y", "Z"]]  # double brackets: the outer pair indexes, the inner pair is the list ["Y", "Z"]

Unnamed: 0,Y,Z
A,0.816847,0.672721
B,-0.53128,1.029733
C,-1.118318,1.618982
D,-0.251879,-0.842436


**Creating a new column:**


In [38]:
# New column: element-wise sum of X and Y
df["addition"] = df["X"].add(df["Y"])

In [39]:
# "addition" now shows up as the last column
df

Unnamed: 0,X,Y,Z,addition
A,-0.583595,0.816847,0.672721,0.233252
B,-0.104411,-0.53128,1.029733,-0.635692
C,-0.438136,-1.118318,1.618982,-1.556454
D,1.541605,-0.251879,-0.842436,1.289726


**Removing Columns**


In [40]:
# Returns a new frame without the "addition" column; df itself is left untouched
df.drop(columns="addition")

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


In [41]:
# Keep the result of drop() under its own name, then display it
df1 = df.drop(columns="addition")
df1

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


In [42]:
# drop() does not modify the frame in place unless asked to: df still has "addition"
df

Unnamed: 0,X,Y,Z,addition
A,-0.583595,0.816847,0.672721,0.233252
B,-0.104411,-0.53128,1.029733,-0.635692
C,-0.438136,-1.118318,1.618982,-1.556454
D,1.541605,-0.251879,-0.842436,1.289726


In [43]:
# inplace=True mutates df directly instead of returning a new frame.
# Re-running this cell fails with a KeyError, since the column is already gone.
df.drop(columns="addition", inplace=True)

In [44]:
# this time df itself has lost the "addition" column
df

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


Rows can also be dropped this way:


In [45]:
# Drop the row labelled "D"; a new frame is returned (inplace defaults to False)
df.drop(index="D")

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982


In [46]:
df  # unchanged: drop() defaults to inplace=False, so row "D" is still there

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


**Selecting Rows**


In [47]:
# Label-based row selection: the row comes back as a Series indexed by column name
df.loc["A"]

X   -0.583595
Y    0.816847
Z    0.672721
Name: A, dtype: float64

Or select based on position instead of label:


In [48]:
# Position-based row selection: integer position 2 is the third row (label "C")
df.iloc[2]

X   -0.438136
Y   -1.118318
Z    1.618982
Name: C, dtype: float64

**Selecting a subset of rows and columns**


In [49]:
# A row label together with a column label selects a single scalar value
df.loc["B", "Y"]

np.float64(-0.5312803768519098)

In [50]:
# Lists of row labels and column labels select a sub-DataFrame
df.loc[["A", "B"], ["X", "Y"]]

Unnamed: 0,X,Y
A,-0.583595,0.816847
B,-0.104411,-0.53128


### Conditional Selection

Conditional selection works much like boolean indexing in NumPy:


In [51]:
# the frame that the comparisons below operate on
df

Unnamed: 0,X,Y,Z
A,-0.583595,0.816847,0.672721
B,-0.104411,-0.53128,1.029733
C,-0.438136,-1.118318,1.618982
D,1.541605,-0.251879,-0.842436


In [52]:
df > 0  # element-wise comparison; returns a DataFrame of booleans with the same labels

Unnamed: 0,X,Y,Z
A,False,True,True
B,False,False,True
C,False,False,True
D,True,False,False


In [53]:
df[df > 0]  # boolean-frame mask: values where the mask is True are kept, the rest show as NaN

Unnamed: 0,X,Y,Z
A,,0.816847,0.672721
B,,,1.029733
C,,,1.618982
D,1.541605,,


### Reading a CSV file


In [54]:
import pandas as pd

In [None]:
# Read using a relative path (resolved against the current working directory).
# NOTE: this rebinds `df`, replacing the random frame used in the cells above.
df = pd.read_csv("iris.csv")

In [56]:
# Example of an absolute Windows path written with backslashes: every backslash
# must be escaped as "\\" (a raw string, r"...", is an alternative)
# df = pd.read_csv(
#     "C:\\Users\\chris\\Documents\\!G Training\\JDE05\\Week 1\\PROG-1 Introduction to Python\\iris.csv"
# )

In [57]:
# The same absolute path written with forward slashes, which need no escaping
# df = pd.read_csv(
#     "C:/Users/chris/Documents/!G Training/JDE05/Week 1/PROG-1 Introduction to Python/iris.csv"
# )

In [58]:
# Relative path into a sub-folder: assumes a folder called "data" exists in the same directory
# df = pd.read_csv("data/iris.csv")