In [10]:
import pickle
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [11]:
# Read the student performance records into a DataFrame and preview the top rows
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Target-feature split

In [12]:
# Separate the regression target ('writing score') from the predictors.
y = df['writing score']

# 'Unnamed: 0' appears to be the row index that was saved with the CSV rather
# than a real feature. Left in X it would be forwarded to the model by the
# preprocessor's remainder='passthrough', so it is removed here as well.
# errors='ignore' keeps the cell working if the CSV has no such column; a
# missing target column still raises on the line above.
X = df.drop(columns=['writing score', 'Unnamed: 0'], errors='ignore')

# Importing the preprocessor ColumnTransformer model using pickle

In [13]:
import pathlib

# Directory the preprocessor is read from: the current working directory.
path_to_read_model = str(pathlib.Path.cwd())

# Build the file path with pathlib so the separator is correct on every OS.
# A hard-coded '\\' only works on Windows; elsewhere it becomes part of the
# file name and the open() call fails.
preprocessor_path = pathlib.Path(path_to_read_model) / 'preprocessor.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a preprocessor.pkl that comes from a trusted source.
with preprocessor_path.open('rb') as f:
    ct = pickle.load(f)

In [14]:
# Display the unpickled preprocessor so its configuration can be inspected
ct

ColumnTransformer(remainder='passthrough',
                  transformers=[('pp_num',
                                 Pipeline(steps=[('scaler', MinMaxScaler())]),
                                 ['reading score', 'math score']),
                                ('pp_catN',
                                 Pipeline(steps=[('col_catN',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('catN',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['gender', 'race/ethnicity',
                                  'parental level of education', 'lunch',
                                  'test preparation course'])])

# Model tuning and fitting

In [15]:
# Chain the unpickled preprocessing step with a random-forest regressor
pipe = Pipeline(steps=[
    ('col_trans', ct),
    ('model', RandomForestRegressor(random_state=0)),
])

# Hyper-parameter grid; the 'model__' prefix routes each entry to the
# 'model' step of the pipeline
params = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[5, 9, 13],
    model__min_samples_split=[2, 4, 6, 8],
)

# 5-fold cross-validated grid search over every combination in `params`,
# scored with 'neg_root_mean_squared_error' and run with n_jobs=-1
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

In [16]:
# Run the cross-validated grid search on the features and the target
gs.fit(X, y=y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('col_trans',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pp_num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['reading '
                                                                          'score',
                                                                          'math '
                                                                          'score']),
                                                                        ('pp_catN',
                                                                         Pipeline(steps=[('col_catN',
                                

In [17]:
# Report the winning pipeline and its cross-validation score
for label, value in (
    (">> best estimator: ", gs.best_estimator_),
    (">> best_score: ", gs.best_score_),
):
    print(label, value)

>> best estimator:  Pipeline(steps=[('col_trans',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pp_num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['reading score',
                                                   'math score']),
                                                 ('pp_catN',
                                                  Pipeline(steps=[('col_catN',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('catN',
                                                                   OneHotEncoder(handle_unknown='ignore',
    

# Saving the best estimator Model

In [18]:
import pathlib

# Directory the model is written to: the current working directory.
path_to_write_output = str(pathlib.Path.cwd())

# Build the file path with pathlib so the separator is correct on every OS.
# A hard-coded '\\' only works on Windows; elsewhere it would create a file
# literally named '<cwd>\model.pkl'.
model_path = pathlib.Path(path_to_write_output) / 'model.pkl'

# Persist the best pipeline found by the grid search.
with model_path.open('wb') as handle:
    pickle.dump(gs.best_estimator_, handle)