In [1]:
%load_ext autoreload
%autoreload 2

import joblib as jl

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVR

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.metrics import mean_squared_error

import functions as f
import classes as c

In [2]:
train_df = pd.read_csv("datasets/train.csv", na_values="np.NaN").set_index("Id")
test_df = pd.read_csv("datasets/test.csv", na_values=np.NaN).set_index("Id")
corr_matrix = train_df.corr()
# train_df[pd.isnull(train_df)] = np.NaN

train_df.loc[:, "LotAreaSqrt"] = np.sqrt(train_df["LotArea"].values)
train_df.loc[:, "GarageAreaSqrt"] = np.sqrt(train_df["GarageArea"].values)

col_of_interest = ["LotArea", "LotAreaSqrt", "GarageArea", "GarageAreaSqrt", "LotFrontage"]
train_df = train_df[col_of_interest].copy()
train_df.dropna(axis=0, inplace=True)

X = train_df.drop("LotFrontage", axis=1).copy()
y = train_df["LotFrontage"].copy()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [4]:
clf = GridSearchCV(SVR(), tuned_parameters, cv=5, scoring="neg_mean_squared_error", verbose=100, n_jobs=4)

In [5]:
clf.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(clf.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(4,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(4,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (s


Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(4,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    1.4s
Pic

Pickling array (shape=(644,), dtype=int32).
Pickling array (shape=(160,), dtype=int32).
Pickling array (shape=(4,), dtype=object).[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed:    1.8s

Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(644,), dtype=int32).
Pickling array (shape=(160,), dtype=int32).
Pickling a

[Parallel(n_jobs=4)]: Done  75 tasks      | elapsed:    2.6s
Pickling array (shape=(4,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(2, 804), dtype=int64).
Pickling array (shape=(2, 804), dtype=float64).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(2,), dtype=object).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(804,), dtype=float64).
Pickling array (shape=(804,), dtype=int64).
Pickling array (shape=(644,), dtype=int32).
Pickling array (shape=(160,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(643,), dtype=int32).
Pickling array (shape=(161,), dtype=int32).
Pickling array (shape=(4,), dtype=object).[Parallel(n_jobs=4)]: Done  81 tasks      | elapsed:    



In [6]:
model = clf.best_estimator_

In [9]:
model.fit(X, y)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=1e-06,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
jl.dump(model, "LotFrontageReg.joblib")

['LotAreaReg.joblib']

In [11]:
loded_model = jl.load("LotAreaReg.joblib")

In [7]:
y_train_pr = model.predict(X_train)
y_test_pr = model.predict(X_test)
mse_train = mean_squared_error(y_train, y_train_pr)
mse_test = mean_squared_error(y_test, y_test_pr)

In [8]:
np.sqrt(mse_train), np.sqrt(mse_test)

(19.370898183756445, 15.597789451750993)

In [12]:
y_train_pr = loded_model.predict(X_train)
y_test_pr = loded_model.predict(X_test)
mse_train = mean_squared_error(y_train, y_train_pr)
mse_test = mean_squared_error(y_test, y_test_pr)
np.sqrt(mse_train), np.sqrt(mse_test)

(19.232273628809942, 14.522899551373246)