In [25]:
import pathlib
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.model_selection import GridSearchCV,learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

%matplotlib inline

In [26]:
# Load the student-performance table from a CSV in the working directory.
df = pd.read_csv('StudentsPerformance.csv')

# Preview the first five rows as a sanity check on the parsed columns.
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [27]:
# Quick overview: dimensions first, then how many values are missing per column.
print(">>Dataset shape: ", df.shape)
print(">>null values in the data:")

# Left as the last expression so the notebook renders the resulting Series.
missing_per_column = df.isna().sum()
missing_per_column

>>Dataset shape:  (1000, 8)
>>null values in the data:


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

# Target-feature split

In [28]:
# The writing score is the target; every other column is kept as a feature.
y = df['writing score']
X = df.drop(columns='writing score')

# Encoding and Data Preprocessing

In [29]:
# Columns stored with object dtype are treated as the categorical features.
is_object_dtype = (X.dtypes == 'O').to_numpy()
cat_cols = X.columns[is_object_dtype].tolist()
cat_cols

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course']

In [30]:
# Pipelines to process numerical and categorical columns


def _dense_one_hot_encoder():
    """Return a OneHotEncoder that emits dense arrays and ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the new name is tried first and the old
    one is used only on releases that do not recognise it.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: only the original `sparse` keyword exists.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# Numerical columns: rescale each feature into the [0, 1] range.
pp_num = Pipeline([
    ('scaler', MinMaxScaler((0,1)))
])

# Nominal categorical columns: replace missing entries with the literal
# 'missing' category, then one-hot encode. The loaded data has no nulls, so
# the imputer only matters for data seen later through this preprocessor.
pp_catN = Pipeline([
    ('col_catN', SimpleImputer(strategy='constant', add_indicator=False, fill_value='missing')),
    ('catN', _dense_one_hot_encoder())
])

In [31]:
# Combine the per-type pipelines into a single preprocessor:
#   - the two score columns go through the numeric pipeline,
#   - the object-dtype columns go through the categorical pipeline,
#   - any column not listed is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('pp_num', pp_num, ['reading score', 'math score']),
        ('pp_catN', pp_catN, cat_cols),
    ],
    remainder='passthrough',
)

In [32]:
# Fit the preprocessor on the whole feature frame X; the call is the cell's
# last expression, so the fitted transformer's repr is displayed.
# NOTE(review): no train/test split is visible in this section - if a hold-out
# set is introduced, fit on the training portion only.
ct.fit(X)

ColumnTransformer(remainder='passthrough',
                  transformers=[('pp_num',
                                 Pipeline(steps=[('scaler', MinMaxScaler())]),
                                 ['reading score', 'math score']),
                                ('pp_catN',
                                 Pipeline(steps=[('col_catN',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('catN',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['gender', 'race/ethnicity',
                                  'parental level of education', 'lunch',
                                  'test preparation course'])])

# Saving the PreProcessor

In [43]:
# Persist the fitted ColumnTransformer into the current working directory.
# The file path is joined with pathlib instead of a hard-coded '\\', which is a
# path separator only on Windows; elsewhere the backslash would become part of
# the file name and the pickle would land outside the working directory.
path_to_write_output = str(pathlib.Path.cwd())  # current working directory, kept as str as before
preprocessor_path = pathlib.Path(path_to_write_output) / 'preprocessor.pkl'

# NOTE(review): pickled scikit-learn objects are meant to be loaded with the
# same library version that wrote them - record the version alongside the file.
with open(preprocessor_path, 'wb') as handle:
    pickle.dump(ct, handle)