# Introduction to Pandas

In [2]:
# Import Numpy and Pandas
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### Series

A series is a 1d array with axis labels (an index)

In [4]:
# Creating a series from a list
x = pd.Series([10,20,30])
x

0    10
1    20
2    30
dtype: int64

In [5]:
# We can access different components separetly
x.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
# Access values
x.values

array([10, 20, 30], dtype=int64)

In [7]:
# Acess data type
# ndarrya is homogenous, single data type
x.dtype

dtype('int64')

In [8]:
# Creating a Series with an index
data = [10,20,30]
sales = Series(data, index=['yes','no','okay'])

In [9]:
# Type of sales
type(sales)

pandas.core.series.Series

In [10]:
# When we check the index now, we get the new index
sales.index

Index(['yes', 'no', 'okay'], dtype='object')

In [15]:
# Accessing the values again
sales['yes']

10

# Checking for conditions

In [18]:
# We can filter our data based on conditions we specify
sales > 5

yes     True
no      True
okay    True
dtype: bool

In [19]:
# What happends when we use these booleans
sales[[False, True, True]]

no      20
okay    30
dtype: int64

In [20]:
# Checking if something is present in the index
'yes' in sales

True

In [21]:
10 in sales

False

# Working with dictionaries

In [22]:
# Converting series to dict
sales_dict = pd.Series(sales)

In [24]:
# Converting dict to series
sales = pd.Series(sales_dict)

# DataFrames

In [30]:
# Making a DF from a list
list = [["adrian", 20],["sam",30],["ally",40]]

# When we create a DF we can specify the column names
df = pd.DataFrame(list, columns=["'Name", "Age"])
df

Unnamed: 0,'Name,Age
0,adrian,20
1,sam,30
2,ally,40


In [33]:
# Creating a DF from a dict
new_dict = {"Name":["Tom", "Jane", "Steve", "Lucy"], "Sales": [250, 300, 350,400]}
df_dict = pd.DataFrame(new_dict)
df_dict

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [34]:
# You can assign a custom index
df_dict_index = pd.DataFrame(new_dict, index=['rank1','rank2','rank3','rank4'])
df_dict_index

Unnamed: 0,Name,Sales
rank1,Tom,250
rank2,Jane,300
rank3,Steve,350
rank4,Lucy,400


In [36]:
# Creating a DF from a list of dictionaries
# Useful when working with json
dict_list = [{'Name':'Tom','Sales':250},{'Name':'Jane','Sales':300},{'Name':'Steve','Sales':350},{'Name':'Lucy','Sales':400}]

df_dict_list = pd.DataFrame(dict_list)
df_dict_list

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


In [40]:
# Creating a DF from a dict of Series
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])

In [41]:
east

Q1    1000
Q2    1200
Q3    3400
dtype: int64

In [42]:
west

Q1    1100
Q2    1300
Q3    2400
Q4    3500
dtype: int64

In [62]:
# If we want to put a Series into a DF we combine them
df_region = pd.DataFrame({'East':east, 'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


In [63]:
# We can add more series
df_region['North'] = [2000,3000,2500,4000]
df_region['South'] = [1500,2000,1500,4000]
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


# Shifting and changin the index

In [64]:
# Sometimes we make a mistake and need to edit the index
# We can add a new column and set the column as the index
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df

Unnamed: 0,East,West,North,South
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [77]:
# Set the column as the index
df_region = df_region.set_index(['Years'])

KeyError: "None of ['Years'] are in the columns"

In [66]:
# We can also change the index values if we want
new_df = df_region.reindex(['2017','2018','2019','2020','2021'])
new_df

Unnamed: 0,East,West,North,South
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [67]:
# We can use re-index to shift/change columns
# Including adding new columns
re_indexed = new_df.reindex(columns=['North','East','South','New'])
re_indexed

Unnamed: 0,North,East,South,New
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


# Missing Data

In [68]:
# Filling missing values
# We may want to change all NaN values to 0
re_indexed.fillna(1)
re_indexed

Unnamed: 0,North,East,South,New
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [74]:
# Fill methods
# Backfill, BFfill, pad, ffill
re_indexed.ffill()

Unnamed: 0,North,East,South,New
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [75]:
# We can also use interpolation
re_indexed.interpolate()

Unnamed: 0,North,East,South,New
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


# Dropping items in DFs

In [76]:
# Instead of filling NaN values we can drop
# dropna() drops all NaN values
re_indexed.dropna()

Unnamed: 0,North,East,South,New


In [79]:
# We can specify columns and methods 

# axis 1 = columns, axis 0 = rows
re_indexed.dropna(axis=1,how="all")

2017
2018
2019
2020
2021


In [81]:
# We can set a threshold, numer is the amount of NaN values allowed to be present in each row
re_indexed.dropna(thresh=1)

Unnamed: 0,North,East,South,New


In [82]:
# Dropping based on index
re_indexed.drop("2019")

Unnamed: 0,North,East,South,New
2017,,,,
2018,,,,
2020,,,,
2021,,,,


In [86]:
# We can easily check duplicates and remove duplicated rows
df_dup = DataFrame([['A',1],['B',2],['A',1]])
df_dup.duplicated()

0    False
1    False
2     True
dtype: bool

In [89]:
# Drop dupicate rows
df_dup.drop_duplicates(inplace=True)
df_dup

Unnamed: 0,0,1
0,A,1
1,B,2


# Selecting entries

In [90]:
new_df

Unnamed: 0,East,West,North,South
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [91]:
# Select entire columns
new_df["North"]

2017   NaN
2018   NaN
2019   NaN
2020   NaN
2021   NaN
Name: North, dtype: float64

In [92]:
# Iloc
# Find record based on intereger indexing
new_df.iloc[2]

East    NaN
West    NaN
North   NaN
South   NaN
Name: 2019, dtype: float64

In [94]:
# Find specific values (row, index)
new_df.iloc[0,1]

nan

In [95]:
# Iloc also allows slicing (rows : 
new_df.iloc[1:3]

Unnamed: 0,East,West,North,South
2018,,,,
2019,,,,


In [96]:
# Loc allows us to access a group of rows and columns based on labels or a boolean array
new_df.loc['2019']

East    NaN
West    NaN
North   NaN
South   NaN
Name: 2019, dtype: float64

In [98]:
# Select mupltile rows
new_df.loc[['2019','2018']]

Unnamed: 0,East,West,North,South
2019,,,,
2018,,,,


In [99]:
# Boolean arrays with loc
# For each row, we specify a boolean values which denotes if we want the row retunred or not
new_df.loc[[False,False,True,True,True]]

Unnamed: 0,East,West,North,South
2019,,,,
2020,,,,
2021,,,,


In [100]:
# Use filtering to get specific info
new_df[new_df['West'] > 500]

Unnamed: 0,East,West,North,South


# Data Alignment

In [101]:
# Adding to columns in one DF
new_df['East'] + new_df ['North']

2017   NaN
2018   NaN
2019   NaN
2020   NaN
2021   NaN
dtype: float64

In [102]:
# We can use the .add method to specifiyy a fill value for NaN
new_df['East'].add(new_df['North'],fill_value=0)

2017   NaN
2018   NaN
2019   NaN
2020   NaN
2021   NaN
dtype: float64

# Sorting and Ranking

In [105]:
#Sort by index
# Ascending = 0 means False (so 1 = True)
new_df.sort_index(ascending=0)

Unnamed: 0,East,West,North,South
2021,,,,
2020,,,,
2019,,,,
2018,,,,
2017,,,,


In [106]:
# Sorting by column
# Defeualt is ascending
new_df.sort_values(by=['North'])

Unnamed: 0,East,West,North,South
2017,,,,
2018,,,,
2019,,,,
2020,,,,
2021,,,,


In [107]:
# Ranking columns
# Specify the coloumn and apply the ranki function to it
new_df['North'].rank(ascending=0)

2017   NaN
2018   NaN
2019   NaN
2020   NaN
2021   NaN
Name: North, dtype: float64

In [110]:
# We can save ranking as a new coloumn
new_df['rank_north'] = new_df['North'].rank(ascending=0)

# Summary Statistics

In [111]:
# Describe - gives general summary statistics for the entire df
new_df.describe()

Unnamed: 0,East,West,North,South,rank_north
count,0.0,0.0,0.0,0.0,0.0
mean,,,,,
std,,,,,
min,,,,,
25%,,,,,
50%,,,,,
75%,,,,,
max,,,,,


In [112]:
# Sum all columns
new_df.sum()

East          0.0
West          0.0
North         0.0
South         0.0
rank_north    0.0
dtype: float64

In [113]:
# Cumulative sum
new_df.cumsum()

Unnamed: 0,East,West,North,South,rank_north
2017,,,,,
2018,,,,,
2019,,,,,
2020,,,,,
2021,,,,,


In [114]:
# Min values in each column
new_df.min()

East         NaN
West         NaN
North        NaN
South        NaN
rank_north   NaN
dtype: float64

In [115]:
# Max values in each column
new_df.max()

East         NaN
West         NaN
North        NaN
South        NaN
rank_north   NaN
dtype: float64

# Index Hierarchy

In [117]:
# Creating a DataFrame with a hierarchical index
# We are using numpy arange to create a range of numbers, and re-shaping it so that the numbers will fit on a 4x4 grid
df_temp = DataFrame(np.arange(16).reshape(4,4), index = [['2018','2018','2019','2019'],['Jan','Feb','Jan','Feb']],columns = [['NY','NY','LA','LA'],['Cold','Hot','Cold','Hot']])
df_temp

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,LA
Unnamed: 0_level_1,Unnamed: 1_level_1,Cold,Hot,Cold,Hot
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [118]:
# Add index to column names
df_temp.index.names=['Year','Month']
df_temp.columns.names=['State','Temp']
df_temp

Unnamed: 0_level_0,State,NY,NY,LA,LA
Unnamed: 0_level_1,Temp,Cold,Hot,Cold,Hot
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [119]:
# It is possible to swap level for patricualr axis
df_temp.swaplevel('State','Temp',axis=1)

Unnamed: 0_level_0,Temp,Cold,Hot,Cold,Hot
Unnamed: 0_level_1,State,NY,NY,LA,LA
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Jan,0,1,2,3
2018,Feb,4,5,6,7
2019,Jan,8,9,10,11
2019,Feb,12,13,14,15


In [123]:
# Sorting based on the first level of the index
df_temp.sort_index(level=0)

Unnamed: 0_level_0,State,NY,NY,LA,LA
Unnamed: 0_level_1,Temp,Cold,Hot,Cold,Hot
Year,Month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2018,Feb,4,5,6,7
2018,Jan,0,1,2,3
2019,Feb,12,13,14,15
2019,Jan,8,9,10,11


In [124]:
# Access single coloumns in a multi-index DF
df_temp['NY']['Cold']

Year  Month
2018  Jan       0
      Feb       4
2019  Jan       8
      Feb      12
Name: Cold, dtype: int32

In [125]:
# Can still use iloc
df_temp.iloc[0]

State  Temp
NY     Cold    0
       Hot     1
LA     Cold    2
       Hot     3
Name: (2018, Jan), dtype: int32

In [126]:
# Can also still use loc
df_temp.loc['2018','Jan']

State  Temp
NY     Cold    0
       Hot     1
LA     Cold    2
       Hot     3
Name: (2018, Jan), dtype: int32