#  Preprocessing with Pandas and Matplotlib

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer


In [2]:
names = ['Kelly', np.nan, 'Jon', 'Ken', 'Tim', 'Pel']
grades = [30,40,30,67,np.nan,55]
age = [15,np.nan,18,17,np.nan,16]

# Create data frame with lists above

df = pd.DataFrame({'Names': names,'Grades': grades, 'Age':age})
df

Unnamed: 0,Names,Grades,Age
0,Kelly,30.0,15.0
1,,40.0,
2,Jon,30.0,18.0
3,Ken,67.0,17.0
4,Tim,,
5,Pel,55.0,16.0


In [3]:
# Check for missing values

df.isna().sum()

Names     1
Grades    1
Age       2
dtype: int64

In [4]:
# Create copy of data frame

df_copy = df.copy()
df_copy

Unnamed: 0,Names,Grades,Age
0,Kelly,30.0,15.0
1,,40.0,
2,Jon,30.0,18.0
3,Ken,67.0,17.0
4,Tim,,
5,Pel,55.0,16.0


In [5]:
# Create instance of different strategies for SimpleImputer

imputer_names = SimpleImputer(strategy='constant',fill_value=0,)
imputer_grades = SimpleImputer(strategy='mean')
imputer_age = SimpleImputer(strategy='median')

In [6]:
# Fit and train SimpleImputer

df_copy[['Names']] = imputer_names.fit_transform(df_copy[['Names']])
df_copy[['Grades']] = imputer_names.fit_transform(df_copy[['Grades']])
df_copy[['Age']] = imputer_age.fit_transform(df_copy[['Age']])

df_copy

Unnamed: 0,Names,Grades,Age
0,Kelly,30.0,15.0
1,0,40.0,16.5
2,Jon,30.0,18.0
3,Ken,67.0,17.0
4,Tim,0.0,16.5
5,Pel,55.0,16.0


In [7]:
# Drop columns with more than 30% missing values

# Check for percentage of missing values
values_missing = df.isnull().mean()

# Columns with more than 30% missing values
columns_drop = values_missing[values_missing > 0.3].index

df_cleaned = df.drop(columns_drop,axis=1)
df_cleaned


Unnamed: 0,Names,Grades
0,Kelly,30.0
1,,40.0
2,Jon,30.0
3,Ken,67.0
4,Tim,
5,Pel,55.0


In [8]:
# Check for duplicate values in Names column

df_cleaned.duplicated("Names").sum()

0

In [9]:
# Group by Names column

df_copy_group_names = df_copy.groupby(by="Names").agg({'Grades':['mean','min','max']})
df_copy_group_names

Unnamed: 0_level_0,Grades,Grades,Grades
Unnamed: 0_level_1,mean,min,max
Names,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,40.0,40.0,40.0
Jon,30.0,30.0,30.0
Kelly,30.0,30.0,30.0
Ken,67.0,67.0,67.0
Pel,55.0,55.0,55.0
Tim,0.0,0.0,0.0
