# Pandas Cheatsheet
Copyright 2021 [Compass Mentis IT Limited](https://www.compassmentis.com).\
May be freely distributed under [Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)](https://creativecommons.org/licenses/by-sa/4.0/).

## Setup

In [1]:
import pandas as pd
import numpy as np

## Pandas Series

A Pandas Series is a one-dimensional array, similar to a list. It has either a numeric index or a named index.

In [2]:
# Pandas Series, numeric index
sample_series = pd.Series([1, 2, 3, 4])
sample_series

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
# Pandas Series, named index
cities = ['Tokyo', 'Delhi', 'Shanghai', 'São Paulo']
population_numbers = [37, 28, 25, 21]
populations = pd.Series(data=population_numbers, index=cities)
populations

Tokyo        37
Delhi        28
Shanghai     25
São Paulo    21
dtype: int64

## Pandas DataFrames

A Pandas DataFrame is a two-dimensional array, a grid. It consists of one or more Series (columns) which all share the same (numeric or named) index.

In [4]:
# Pandas DataFrame, no column names, numeric index
df = pd.DataFrame([[1, 2], [3, 4]])
df

Unnamed: 0,0,1
0,1,2
1,3,4


In [5]:
# Pandas DataFrame with column names, numeric index
df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [6]:
# Pandas DataFrame with column names and named index
df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'], index=['V', 'W'])
df

Unnamed: 0,A,B
V,1,2
W,3,4


In [7]:
# Another Pandas DataFrame
cities = ['Tokyo', 'Delhi', 'Shanghai', 'São Paulo']
population_numbers = [37, 28, 25, 21]
area_numbers = [8230, 2232, 4068, 3116]
column_names = ['Population', 'Area']

# To create the dataframe we need to give it a list of rows. 
# We've got a list of columns instead
# For now we'll use this, and transpose it in the next step
# Notice how the cities are the column names instead of the row names
df = pd.DataFrame(data=[population_numbers, area_numbers], columns=cities, index=column_names)
df

Unnamed: 0,Tokyo,Delhi,Shanghai,São Paulo
Population,37,28,25,21
Area,8230,2232,4068,3116


In [8]:
# Transpose the DataFrame
df = df.transpose()
df

Unnamed: 0,Population,Area
Tokyo,37,8230
Delhi,28,2232
Shanghai,25,4068
São Paulo,21,3116


## Selecting rows and columns from a DataFrame

In [9]:
# The following examples all use this DataFrame
data = [[1, 2, 3, 4], [10, 11, 12, 13], [30, 31, 32, 33]]
columns = ['ColA', 'ColB', 'ColC', 'ColD']
rows = ['RowA', 'RowB', 'RowC']
df = pd.DataFrame(data=data, columns=columns, index=rows)
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [10]:
# Selecting a single column by name
df['ColA']

RowA     1
RowB    10
RowC    30
Name: ColA, dtype: int64

In [11]:
# Selecting multiple columns by name
columns = ['ColA', 'ColB']
df[columns]

Unnamed: 0,ColA,ColB
RowA,1,2
RowB,10,11
RowC,30,31


In [12]:
# Shorter version - Selecting multiple columns by name
# Note the double square brackets
df[['ColA', 'ColB']]

Unnamed: 0,ColA,ColB
RowA,1,2
RowB,10,11
RowC,30,31


In [13]:
# Selecting a single column by number - first column
# Note the zero indexing, column 0 is the first column
df.iloc[:, 0]

RowA     1
RowB    10
RowC    30
Name: ColA, dtype: int64

In [14]:
# Selecting a single row by name
# This returns a Series, with the column names as the named index
df.loc['RowA']

ColA    1
ColB    2
ColC    3
ColD    4
Name: RowA, dtype: int64

In [15]:
# Selecting multiple rows by name
rows = ['RowB', 'RowC']
df.loc[rows]

Unnamed: 0,ColA,ColB,ColC,ColD
RowB,10,11,12,13
RowC,30,31,32,33


In [16]:
# Shorter version - Selecting multiple rows by name
# Note the double square brackets
df.loc[['RowB', 'RowC']]

Unnamed: 0,ColA,ColB,ColC,ColD
RowB,10,11,12,13
RowC,30,31,32,33


In [17]:
# Selecting a single row by number
# Note the zero indexing, row 0 is the first row
df.iloc[0]

ColA    1
ColB    2
ColC    3
ColD    4
Name: RowA, dtype: int64

## Selecting individual cells from a DataFrame

In [18]:
# A reminder of the data used in these examples
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [19]:
# Selecting a cell by row and column name
df.loc['RowB', 'ColB']

11

In [20]:
# Selecting a cell by row and column number
# Note the zero indexing, row 0 is the first row
# ditto for columns
row_number = 0
column_number = 1
df.iloc[row_number, column_number]

2

## Selecting an area from a DataFrame

In [21]:
# A reminder of the data used in these examples
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [22]:
# Selecting an area by row range and column range, using names
# Note how both the start and end row are included
# ditto for columns
df.loc['RowA':'RowB', 'ColB':'ColC']

Unnamed: 0,ColB,ColC
RowA,2,3
RowB,11,12


In [23]:
# Selecting an area by row range and column range, using numbers
# Note how the end row is *not* included
# Ditto for the end column
df.iloc[0:2, 0:2]

Unnamed: 0,ColA,ColB
RowA,1,2
RowB,10,11


In [24]:
# Selecting every other column, using slicing
# To get all the rows, put a ':' as the first argument, i.e. before the comma
df.iloc[:, ::2]

Unnamed: 0,ColA,ColC
RowA,1,3
RowB,10,12
RowC,30,32


## Adding and deleting columns

In [25]:
# The following examples still use this DataFrame
data = [[1, 2, 3, 4], [10, 11, 12, 13], [30, 31, 32, 33]]
columns = ['ColA', 'ColB', 'ColC', 'ColD']
rows = ['RowA', 'RowB', 'RowC']
df = pd.DataFrame(data=data, columns=columns, index=rows)
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [26]:
# Adding a new column as a sum of two existing columns
df['ColAplusB'] = df['ColA'] + df['ColB']
df

Unnamed: 0,ColA,ColB,ColC,ColD,ColAplusB
RowA,1,2,3,4,3
RowB,10,11,12,13,21
RowC,30,31,32,33,61


In [27]:
# Removing (dropping) a column
# axis=1 means columns. axis=0 means rows
# Note: drop() returns a new DataFrame without ColD, which then gets shown.
#       df itself is left unchanged
df.drop('ColD', axis=1)

Unnamed: 0,ColA,ColB,ColC,ColAplusB
RowA,1,2,3,3
RowB,10,11,12,21
RowC,30,31,32,61


In [28]:
# Note how df wasn't actually changed, ColD is still there,
# in the previous cell it just wasn't being shown
df

Unnamed: 0,ColA,ColB,ColC,ColD,ColAplusB
RowA,1,2,3,4,3
RowB,10,11,12,13,21
RowC,30,31,32,33,61


In [29]:
# Permanently removing a column
df = df.drop('ColD', axis=1)
df

Unnamed: 0,ColA,ColB,ColC,ColAplusB
RowA,1,2,3,3
RowB,10,11,12,21
RowC,30,31,32,61


## Adding and removing rows

In [30]:
# The following examples use this DataFrame again
data = [[1, 2, 3, 4], [10, 11, 12, 13], [30, 31, 32, 33]]
columns = ['ColA', 'ColB', 'ColC', 'ColD']
rows = ['RowA', 'RowB', 'RowC']
df = pd.DataFrame(data=data, columns=columns, index=rows)
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [31]:
# Adding a row
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so build a one-row DataFrame and join it on with pd.concat() instead
# The new row is a dictionary with the column names as the keys
# Note how any cell which doesn't have a value is set to NaN - not a number
# Note: ignore_index=True replaces the row names with a fresh numeric index.
#       The result is a new DataFrame which then gets shown; df is not changed
new_row = pd.DataFrame([{'ColA': 100, 'ColB': 101, 'ColC': 102}])
pd.concat([df, new_row], ignore_index=True)

Unnamed: 0,ColA,ColB,ColC,ColD
0,1.0,2.0,3.0,4.0
1,10.0,11.0,12.0,13.0
2,30.0,31.0,32.0,33.0
3,100.0,101.0,102.0,


In [32]:
# Removing (dropping) a row
# axis=1 means columns. axis=0 means rows
# Note: drop() returns a new DataFrame without RowB, which then gets shown.
#       df itself is left unchanged
df.drop('RowB', axis=0)

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowC,30,31,32,33


In [33]:
# Note how df wasn't actually changed, RowB is still there,
# in the previous cell it just wasn't being shown
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowB,10,11,12,13
RowC,30,31,32,33


In [34]:
# Permanently removing a row
df = df.drop('RowB', axis=0)
df

Unnamed: 0,ColA,ColB,ColC,ColD
RowA,1,2,3,4
RowC,30,31,32,33


## Filtering a DataFrame

In [35]:
# Another Pandas DataFrame
# See above for more details on how this was created
cities = ['Tokyo', 'Delhi', 'Shanghai', 'São Paulo']
population_numbers = [37, 28, 25, 21]
area_numbers = [8230, 2232, 4068, 3116]
column_names = ['Population', 'Area']
df = pd.DataFrame(data=[population_numbers, area_numbers], columns=cities, index=column_names)
df = df.transpose()
df

Unnamed: 0,Population,Area
Tokyo,37,8230
Delhi,28,2232
Shanghai,25,4068
São Paulo,21,3116


In [36]:
# Filtering rows in a DataFrame by values from a specific column
df[ df['Area'] > 4000]

Unnamed: 0,Population,Area
Tokyo,37,8230
Shanghai,25,4068


## Getting information about a DataFrame

In [37]:
# Column names
df.columns

Index(['Population', 'Area'], dtype='object')

In [38]:
# Row names (shared index)
df.index

Index(['Tokyo', 'Delhi', 'Shanghai', 'São Paulo'], dtype='object')

In [39]:
# Shape - number of rows and columns
df.shape

(4, 2)

In [40]:
# Data types
df.dtypes

Population    int64
Area          int64
dtype: object

In [41]:
# Most of the info shown above, in a single summary
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Tokyo to São Paulo
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Population  4 non-null      int64
 1   Area        4 non-null      int64
dtypes: int64(2)
memory usage: 96.0+ bytes


In [42]:
# Statistics
df.describe()

Unnamed: 0,Population,Area
count,4.0,4.0
mean,27.75,4411.5
std,6.800735,2653.769332
min,21.0,2232.0
25%,24.0,2895.0
50%,26.5,3592.0
75%,30.25,5108.5
max,37.0,8230.0


## Loading and saving data - Excel and CSV

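To get a full list of the available functions, try tab completion:

* pd.read_\<tab key> for the loaders
* df.to_\<tab key> for the savers (these are DataFrame methods, so use a DataFrame rather than pd)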

In [43]:
# The next examples use the existing DataFrame
# Here it is again
df

Unnamed: 0,Population,Area
Tokyo,37,8230
Delhi,28,2232
Shanghai,25,4068
São Paulo,21,3116


In [44]:
# Save a DataFrame to an Excel spreadsheet
# Note: This requires openpyxl
# If you get a message: No module named 'openpyxl', run the following command in a cell:
# ! pip install openpyxl
# You only need to do this once
df.to_excel('cities.xlsx')

In [45]:
# Check to see if the spreadsheet was created
# You should see 'cities.xlsx' in the list below
! ls

 cities.csv			     'Numpy Cheatsheet.ipynb'
 cities.xlsx			     'Numpy Cheatsheet.pdf'
'Jupyter Notebook Cheatsheet.ipynb'  'Pandas Cheatsheet.ipynb'
'Jupyter Notebook Cheatsheet.pdf'     Sandbox.ipynb


In [46]:
# Load an Excel spreadsheet into a dataframe
# Note: without index_col=0 the city names are read back as an ordinary
# column (shown as 'Unnamed: 0') and a new numeric index is added
new_df = pd.read_excel('cities.xlsx')
new_df

Unnamed: 0.1,Unnamed: 0,Population,Area
0,Tokyo,37,8230
1,Delhi,28,2232
2,Shanghai,25,4068
3,São Paulo,21,3116


In [47]:
# Save a DataFrame to a CSV file
df.to_csv('cities.csv')

In [48]:
# Check to see if the CSV file was created
# You should see 'cities.csv' in the list below
! ls

 cities.csv			     'Numpy Cheatsheet.ipynb'
 cities.xlsx			     'Numpy Cheatsheet.pdf'
'Jupyter Notebook Cheatsheet.ipynb'  'Pandas Cheatsheet.ipynb'
'Jupyter Notebook Cheatsheet.pdf'     Sandbox.ipynb


In [49]:
# Load a CSV file into a dataframe
# use index_col=0 to use the first column in the CSV
# as the index (row names) of the DataFrame
another_new_df = pd.read_csv('cities.csv', index_col=0)
another_new_df

Unnamed: 0,Population,Area
Tokyo,37,8230
Delhi,28,2232
Shanghai,25,4068
São Paulo,21,3116
