# MODEL TRAINING

In [32]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [33]:
data_train = pd.read_csv('../Data/train.csv')
data_test = pd.read_csv('../Data/test.csv')

In [34]:
X, y = data_train.drop(columns="SalePrice"), data_train["SalePrice"].copy()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)
X_test = data_test.copy()

In [35]:
preprocessor = pickle.load(open('./preprocessor.pkl', 'rb'))

In [36]:
simple_model = LinearRegression()

In [37]:
simple_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', simple_model) 
])

In [38]:
simple_clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu

In [39]:
with open('./simple_classifier.pkl', 'wb') as pkl_file:
    pickle.dump(simple_clf, pkl_file)

In [40]:
model = RandomForestRegressor(random_state=42, n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [43]:
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [2, 5, 10, None],
}
search = GridSearchCV(clf, param_grid, cv=5, scoring='neg_mean_squared_log_error', error_score=-1, n_jobs=-1)
search.fit(X_train, y_train)
print(f"Best parameter (CV rmsle={-1 * search.best_score_}):")
print(search.best_params_)

Best parameter (CV rmsle=0.02339125531748941):
{'model__max_depth': None, 'model__n_estimators': 200}


In [44]:
model = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF...
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDr

In [45]:
with open('./classifier.pkl', 'wb') as pkl_file:
    pickle.dump(clf, pkl_file)