# Pandas Intro

### Load basic libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Use fully qualified option names: pandas resolves abbreviated names by pattern
# matching and raises OptionError when the pattern matches more than one option
# ('max_columns' matches both 'display.max_columns' and 'styler.render.max_columns').
pd.set_option('display.max_columns', 15)
# Silence the chained-assignment warning for the purposes of this tutorial
pd.set_option('mode.chained_assignment', None)

### Pandas Creation


In [2]:
# Build a Series of random draws (one per label), indexed by the Roman numerals I to V
indices = ['I', 'II', 'III', 'IV', 'V']
s = pd.Series(data=np.random.randn(len(indices)), index=indices)
s

I      0.416125
II    -1.813219
III    0.166710
IV     0.366578
V      0.265027
dtype: float64

In [3]:
# Generate eight consecutive dates starting on 2020-01-01
dates = pd.date_range(start='20200101', periods=8)

# Show the type of the object returned, then the dates themselves
print(f"date type: {type(dates)}")
dates

date type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>


DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Build a DataFrame of random values with one row per date and one column
# per Roman-numeral label defined earlier (8 rows x 5 columns)
df = pd.DataFrame(
    data=np.random.randn(8, 5),
    index=dates,
    columns=indices,
)
df

Unnamed: 0,I,II,III,IV,V
2020-01-01,-0.20568,0.532244,-1.617994,0.837649,0.010214
2020-01-02,-1.399543,-0.966135,-0.130476,0.194911,-0.4865
2020-01-03,1.349497,-0.189682,1.703154,-0.737544,-0.755482
2020-01-04,0.895049,1.168853,0.777433,0.287427,-2.201884
2020-01-05,0.191527,-0.381723,1.171725,-0.106117,-0.670856
2020-01-06,-2.569348,-1.330691,0.032311,-0.428877,-0.026792
2020-01-07,1.182499,0.349626,-0.005139,0.881672,-0.368204
2020-01-08,1.094489,-1.105137,0.407892,0.747717,0.478146


Load a dictionary into pandas:

In [5]:
# Each key of the dictionary becomes a column; each list holds that column's values
data = dict(
    Country=['United Kingdom', 'Spain', 'France', 'USA', 'Australia', 'Bangladesh', 'Belgium'],
    Capital=['London', 'Madrid', 'Paris', 'washington', 'Canberra', 'Dhaka', 'Brussels'],
    Random=[3, 1, 4, 2, 8, 5, 6],
)
df_countries = pd.DataFrame(data)
df_countries

Unnamed: 0,Country,Capital,Random
0,United Kingdom,London,3
1,Spain,Madrid,1
2,France,Paris,4
3,USA,washington,2
4,Australia,Canberra,8
5,Bangladesh,Dhaka,5
6,Belgium,Brussels,6


In [6]:
# Check the dtype of each column:
df_countries.dtypes

Country    object
Capital    object
Random      int64
dtype: object

Selecting rows and columns:

In [7]:
# Select the 'Country' column with bracket notation
col = df_countries['Country']

# Print the selected values followed by the type of the selected object
print(col)
print(f"\nColumn type: {type(col)}")

0    United Kingdom
1             Spain
2            France
3               USA
4         Australia
5        Bangladesh
6           Belgium
Name: Country, dtype: object

Column type: <class 'pandas.core.series.Series'>


In [8]:
# Select the same 'Country' column using dot (attribute) notation
df_countries.Country

0    United Kingdom
1             Spain
2            France
3               USA
4         Australia
5        Bangladesh
6           Belgium
Name: Country, dtype: object

In [9]:
# Select the column with a list of names (double brackets) to get a DataFrame
df_col = df_countries[['Country']]

# Print the type of the selection, then display it
print(f"\nColumn type: {type(df_col)}")
df_col


Column type: <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Country
0,United Kingdom
1,Spain
2,France
3,USA
4,Australia
5,Bangladesh
6,Belgium


In [10]:
# Update the first value of the column with a single label-based .loc assignment.
# The chained form `df_col.Country[0] = ...` writes through an intermediate Series and
# is not guaranteed to modify df_col (it never does when Copy-on-Write is enabled).
df_col.loc[0, 'Country'] = 'Brexit'

# Verify the change
df_col

Unnamed: 0,Country
0,Brexit
1,Spain
2,France
3,USA
4,Australia
5,Bangladesh
6,Belgium


In [11]:
# Display the original dataframe to check whether the update made to df_col affected it
df_countries

Unnamed: 0,Country,Capital,Random
0,United Kingdom,London,3
1,Spain,Madrid,1
2,France,Paris,4
3,USA,washington,2
4,Australia,Canberra,8
5,Bangladesh,Dhaka,5
6,Belgium,Brussels,6


In [12]:
# Swap rows and columns: column names become the index and row labels become the columns
transposed = df_countries.transpose()
transposed

Unnamed: 0,0,1,2,3,4,5,6
Country,United Kingdom,Spain,France,USA,Australia,Bangladesh,Belgium
Capital,London,Madrid,Paris,washington,Canberra,Dhaka,Brussels
Random,3,1,4,2,8,5,6


In [13]:
# Get all the data as a numpy array
df_countries.to_numpy()

array([['United Kingdom', 'London', 3],
       ['Spain', 'Madrid', 1],
       ['France', 'Paris', 4],
       ['USA', 'washington', 2],
       ['Australia', 'Canberra', 8],
       ['Bangladesh', 'Dhaka', 5],
       ['Belgium', 'Brussels', 6]], dtype=object)

Notes:

- The output doesn't know anything about index or column information
- The behaviour changes if the columns are all of the same type vs. if they contain different types:
    - With the same type it is very fast as it is not a copy.
    - With different types it gets very expensive as it copies everything into a new object.

### Summary Info


In [14]:
# Show the first 3 rows
df_countries.head(3)

Unnamed: 0,Country,Capital,Random
0,United Kingdom,London,3
1,Spain,Madrid,1
2,France,Paris,4


In [15]:
# Show the last 2 rows
df_countries.tail(2)

Unnamed: 0,Country,Capital,Random
5,Bangladesh,Dhaka,5
6,Belgium,Brussels,6


In [16]:
# List the row index of the DataFrame
print(f"Indices: {df_countries.index}")

# What is the row index of the previously transposed DataFrame?
print(f"Transposed index: {transposed.index}")

Indices: RangeIndex(start=0, stop=7, step=1)
Transposed index: Index(['Country', 'Capital', 'Random'], dtype='object')


In [17]:
# List the columns of the DataFrame
# (the printed labels now say "columns"; they previously repeated the "Indices" wording)
print(f"Columns: {df_countries.columns}")

# What are the columns of the previously transposed DataFrame?
print(f"Transposed columns: {transposed.columns}")

Indices: Index(['Country', 'Capital', 'Random'], dtype='object')
Transposed index: RangeIndex(start=0, stop=7, step=1)


In [18]:
# Get summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df_countries.describe()

Unnamed: 0,Random
count,7.0
mean,4.142857
std,2.410295
min,1.0
25%,2.5
50%,4.0
75%,5.5
max,8.0


### More in Selection

We have seen  `df_countries.Country`  and `df_countries['Country']`  as possible ways to select a column.

What if we want to select specific rows?

In [19]:
# Select the rows at positions 2 and 4 (the third and fifth rows, as positions start at 0), with all columns:
df_countries.iloc[[2, 4], :]

Unnamed: 0,Country,Capital,Random
2,France,Paris,4
4,Australia,Canberra,8


In [20]:
# Select the rows labelled 2 and 4 again, this time with loc and keeping only the 'Country' and 'Capital' columns:
df_countries.loc[[2, 4], ['Country', 'Capital']]

Unnamed: 0,Country,Capital
2,France,Paris
4,Australia,Canberra


In [21]:
# If you specify a single column name (not a list), you get a Series instead
series_selection = df_countries.loc[[2, 4], 'Country']

# Print the type of the selection, then display it
print(f"Type is: {type(series_selection)}")
series_selection

Type is: <class 'pandas.core.series.Series'>


2       France
4    Australia
Name: Country, dtype: object

### Boolean indexing


In [22]:
# Keep only the rows whose 'Random' value is greater than 3
df_countries.loc[df_countries['Random'].gt(3)]

Unnamed: 0,Country,Capital,Random
2,France,Paris,4
4,Australia,Canberra,8
5,Bangladesh,Dhaka,5
6,Belgium,Brussels,6


In [23]:
# Keep only the rows whose 'Capital' appears in the given list
my_list = ['Paris', 'Madrid']
df_countries.loc[df_countries['Capital'].isin(my_list)]

Unnamed: 0,Country,Capital,Random
1,Spain,Madrid,1
2,France,Paris,4


### Sorting:

In [24]:
# Sort the columns by their names in descending order:
df_countries.sort_index(axis=1, ascending=False)

Unnamed: 0,Random,Country,Capital
0,3,United Kingdom,London
1,1,Spain,Madrid
2,4,France,Paris
3,2,USA,washington
4,8,Australia,Canberra
5,5,Bangladesh,Dhaka
6,6,Belgium,Brussels


In [25]:
# Order the rows by the 'Random' column, smallest value first (ascending is the default)
df_countries.sort_values('Random')

Unnamed: 0,Country,Capital,Random
1,Spain,Madrid,1
3,USA,washington,2
0,United Kingdom,London,3
2,France,Paris,4
5,Bangladesh,Dhaka,5
6,Belgium,Brussels,6
4,Australia,Canberra,8


### Modifying the data



In [26]:
# Create a new column called 'Continent' with every value missing (NaN).
# It is created with object dtype so that the string labels written below
# do not have to be forced into a float64 column.
df_countries['Continent'] = pd.Series(np.nan, index=df_countries.index, dtype='object')

# Given the following list, modify the values of 'Continent' for those countries to 'Europe'
isEurope = ['Belgium', 'France', 'Spain', 'United Kingdom']
df_countries.loc[df_countries['Country'].isin(isEurope), 'Continent'] = 'Europe'

df_countries

Unnamed: 0,Country,Capital,Random,Continent
0,United Kingdom,London,3,Europe
1,Spain,Madrid,1,Europe
2,France,Paris,4,Europe
3,USA,washington,2,
4,Australia,Canberra,8,
5,Bangladesh,Dhaka,5,
6,Belgium,Brussels,6,Europe


In [27]:
# Modify the value 'United Kingdom' to 'UK' by position (row 0, column 0)
df_countries.iat[0,0] = 'UK'
df_countries

Unnamed: 0,Country,Capital,Random,Continent
0,UK,London,3,Europe
1,Spain,Madrid,1,Europe
2,France,Paris,4,Europe
3,USA,washington,2,
4,Australia,Canberra,8,
5,Bangladesh,Dhaka,5,
6,Belgium,Brussels,6,Europe


In [28]:
# Drop every row that contains at least one missing value (the result is displayed, not assigned back):
df_countries.dropna(how='any')

Unnamed: 0,Country,Capital,Random,Continent
0,UK,London,3,Europe
1,Spain,Madrid,1,Europe
2,France,Paris,4,Europe
6,Belgium,Brussels,6,Europe


Note that `dropna` returns a new DataFrame and leaves `df_countries` unchanged unless `inplace=True` is specified.

In [29]:
# Fill missing data in 'Continent' with 'World'. Passing a dict restricts the fill to the
# named column, so missing values in any other column are left untouched.
# As before, filling data creates a new DataFrame by default.
df_countries.fillna({'Continent': 'World'})

Unnamed: 0,Country,Capital,Random,Continent
0,UK,London,3,Europe
1,Spain,Madrid,1,Europe
2,France,Paris,4,Europe
3,USA,washington,2,World
4,Australia,Canberra,8,World
5,Bangladesh,Dhaka,5,World
6,Belgium,Brussels,6,Europe


In [30]:
# Build a boolean mask with the same shape as the dataframe: True where a value is missing
df_countries.isna()

Unnamed: 0,Country,Capital,Random,Continent
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,True
4,False,False,False,True
5,False,False,False,True
6,False,False,False,False


### Stats

In [31]:
# Calculate the mean of the first column (df dataset), selected by position:
print(f"First column mean: {df.iloc[:, 0].mean()}")

# Calculate the mean of each column:
print(f"\nMean (all columns): \n{df.mean()}")

# Calculate the mean of each row (axis=1 averages across the columns):
print(f"\nMean (all rows): \n{df.mean(axis=1)}")

First column mean: 0.06731113717114418

Mean (all columns): 
I      0.067311
II    -0.240331
III    0.292363
IV     0.209605
V     -0.502670
dtype: float64

Mean (all rows): 
2020-01-01   -0.088713
2020-01-02   -0.557548
2020-01-03    0.273989
2020-01-04    0.185375
2020-01-05    0.040911
2020-01-06   -0.864679
2020-01-07    0.408091
2020-01-08    0.324621
Freq: D, dtype: float64


In [32]:
# Largest and smallest values found in the 'Random' column of df_countries
print(f"Max value in random: {df_countries['Random'].max()}")
print(f"min value in random: {df_countries['Random'].min()}")

Max value in random: 8
min value in random: 1


In [33]:
# Use apply to obtain the difference between the max and min of each column
df.apply(lambda x: x.max() - x.min())

I      3.918845
II     2.499544
III    3.321148
IV     1.619216
V      2.680031
dtype: float64

In [34]:
# Create a matrix 8x8 with random integers from 0 to 9.
# The upper bound of np.random.randint is exclusive, so 10 is needed to include 9.
m = pd.DataFrame(np.random.randint(0, 10, size=(8, 8)))
print(f"Matrix: \n{m}")

Matrix: 
   0  1  2  3  4  5  6  7
0  3  7  6  6  1  7  7  6
1  1  5  0  5  0  1  0  2
2  0  7  6  4  1  5  3  5
3  4  1  5  8  7  2  3  7
4  4  1  0  2  7  0  5  1
5  5  5  2  6  7  2  5  0
6  0  5  4  8  3  3  3  7
7  0  4  7  4  6  2  4  1


In [35]:
# Count how many times each value occurs in the 4th column (position 3)
m.iloc[:,3].value_counts()

6    2
4    2
8    2
5    1
2    1
Name: 3, dtype: int64

In [36]:
# Count the occurrences of each value within every row (one output row per input row)
m.apply(pd.Series.value_counts, axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,1.0,,1.0,,,3.0,3.0,
1,3.0,2.0,1.0,,,2.0,,,
2,1.0,1.0,,1.0,1.0,2.0,1.0,1.0,
3,,1.0,1.0,1.0,1.0,1.0,,2.0,1.0
4,2.0,2.0,1.0,,1.0,1.0,,1.0,
5,1.0,,2.0,,,3.0,1.0,1.0,
6,1.0,,,3.0,1.0,1.0,,1.0,1.0
7,1.0,1.0,1.0,,3.0,,1.0,1.0,
