# Mercari Price Prediction ML

## Importing Libraries

In [1]:
import scipy as sp
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.pipeline import make_pipeline
import scipy

#### Pulling in premade Kaggle competition data.

In [2]:
train = pd.read_csv("train.tsv", sep='\t', header=0)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
test = pd.read_csv("test.tsv", sep='\t', header=0)
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


## Data Cleaning

#### Need to remove null values from item_description, as they would interfere with the algorithm.

In [4]:
#Deleting null rows to clean up the data.
train = train[~train['item_description'].isnull()]
test = test[~test['item_description'].isnull()]

#### Combining datasets for train and test together so that all function performed in preparing the data will not skew the size of the matrix. Here, we are also going to remove 0 value entries for log price and other outliers as they would skew the data.

In [5]:
#using log price and pre-setting quantiles to delete outliers within the data.
train['log_price'] = np.log(train.price + 1)
q = train['log_price'].quantile(0.975)
train = train[train['log_price'] < q]
train = train[train['log_price'] > 0]

In [6]:
data = pd.concat([train, test], 0)
train_rows = train.shape[0]
print(train_rows)

1442262


#### Replacing 'No description yet' to no_desc for tfidf vectorizer to count these as stop words. Also transforming category_name into two distinct categories to introduce two features for the matrix

In [7]:
data.item_description = data.item_description.str.replace('No description yet', 'no_desc')
data['primary_cat'] = data.category_name.str.extract('([^/]+)/[^/]+/[^/]+')
data['secondary_cat'] = data.category_name.str.extract('[^/]+/([^/]+/[^/]+)')
data = data.apply(lambda x: x.astype(str).str.lower())

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men/tops/t-shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops/t-shirts
1,razer,electronics/computers & tablets/components & p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers & tablets/components & parts
2,target,women/tops & blouses/blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava-viv blouse,10.0,1,,2.0,women,tops & blouses/blouse
3,,home/home décor/home décor accents,1,new with tags. leather horses. retail for [rm]...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor/home décor accents
4,,women/jewelry/necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry/necklaces


In [9]:
text_features = ['name', 'brand_name', 'category_name', 'primary_cat', 'secondary_cat', 'item_description']
for t in text_features:
    data[t].replace(regex=True,inplace=True,to_replace=r'\W',value=r' ')
    
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


In [10]:
data['item_condition_id'] = pd.to_numeric(data.item_condition_id, errors = 'ignore')
data['shipping'] = pd.to_numeric(data.shipping, errors = 'ignore')
data['item_description'] = data['item_description'].fillna('')

In [11]:
data['brand_name'] = data['brand_name'].replace([np.nan,'nan'], 'negative', regex=True)
data['brand_name'] = data['brand_name'].str.replace('\s+', '')  # in case there are multiple white spaces
data['brand_name'] = 'brand_' + data['brand_name'].astype(str)


In [12]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,brand_negative,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,brand_razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,brand_target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,brand_negative,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,brand_negative,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


#### Combining text values into one single text  corpus

In [13]:
text = (data['name'] + ' '+ data['brand_name'] + ' ' +
          data['item_description'] + ' ' + data['primary_cat'] + ' ' + 
          data['secondary_cat']).values

In [14]:
from nltk import word_tokenize          
#from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import nltk

#class LemmaTokenizer(object):
#    def __init__(self):
##        self.wnl = WordNetLemmatizer()
#    def __call__(self, articles):
#        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [15]:
#separating the text data into train and test set again
text_train = text[:train_rows]
text_test = text[train_rows:]


dummies = scipy.sparse.csr_matrix(pd.get_dummies(data[[
    "item_condition_id", "shipping"]], sparse = True).values)
X_dummies = dummies[:train_rows]
y_dummies = dummies[train_rows:]
y = np.array(train['log_price'].values)

#### This gridsearch took at upwards of 12 hours for 405 candidates for a total of 1215 fits. The results of the gridsearch have been pickled and saved to a pkl file and the code has been converted to markdown format.

In [None]:
estimators = [("tf_idf", TfidfVectorizer(lowercase=True)), 
              ("ridge", Ridge())]
model = Pipeline(estimators)
model.fit(text_train, y)


params = {"ridge__alpha":[0.1, 0.3, 0.5, 1, 3], #regularization param
          "tf_idf__min_df": [1, 5, 10],#min count of words allowed
          "tf_idf__max_df": [0.7,0.8,0.9],
          "tf_idf__ngram_range": [(1,1), (1,2), (1,3)], #1-grams or 2-grams
          "tf_idf__max_features": [5000, 7500, 10000]#5000 features, 7500, or 10000
         }


grid = GridSearchCV(estimator=model, param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)
grid.fit(text_train, y)

import pickle

outfile = 'grid_ml.pkl'

with open(outfile, 'wb') as pickle_file:
    pickle.dump(grid, pickle_file)

#### The gridsearch results for tfidf vectorizer and the following best estimator/best score/best parameters from the pickle file are uploaded in the cell below:

In [16]:
import pickle
grid_ml = pickle.load(open('grid_ml.pkl', "rb"))

#best estimators
print("Best Estimator: ", grid_ml.best_estimator_)
#printing best scores
print("Best Score: ", grid_ml.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", grid_ml.best_params_)

Best Estimator:  Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=10000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best Score:  -0.222323799127
Best Parameters:  {'ridge__alpha': 1, 'tf_idf__max_df': 0.7, 'tf_idf__max_features': 10000, 'tf_idf__min_df': 1, 'tf_idf__ngram_range': (1, 2)}


#### Separating the training and test sets again after massaging the data for the distinct train and test sets with the train_rows mask. Also creating the y value with train['price'] as log as we had found during the exploratory data analysis portion of the project

#### Transforming the text data into a matrix and combinining dummies data (with shipping and item condition) into the matrix

In [17]:
tfidf = TfidfVectorizer(max_features = 10000, lowercase=True, max_df=0.7, min_df=1, ngram_range=(1,2), stop_words= 'english')
text_transformed = tfidf.fit_transform(text_train)

In [18]:
text_transformed = text_transformed.astype(int)

#### Checking to see the result of the combined matrix/ if the row features are in line with what we originally started with:

In [19]:
print(text_transformed.shape)
print(X_dummies.shape)

(1442262, 10000)
(1442262, 2)


In [20]:
X = scipy.sparse.hstack((X_dummies, text_transformed)).tocsr()

In [21]:
X.shape

(1442262, 10002)

#### Train Test Split of the data to create a sample train/test from the training data set.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [23]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 1.0)

In [24]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [25]:
ridge.score(X_test,y_test)

0.065007517529582781

In [26]:
predictions = ridge.predict(X_test)
#Would need to do a root mean squared error on the predictions vs real y_test.
from sklearn.metrics import mean_squared_error
from math import sqrt

rmsle = sqrt(mean_squared_error(y_test, predictions))

In [27]:
print(rmsle)

0.6380121199664495


In [39]:
from sklearn.ensemble import RandomForestRegressor
estimators = [("rfr", RandomForestRegressor())]
model = Pipeline(estimators)
model.fit(X, y)

params={"n_estimators": [5, 10, 20, 50],
        "max_features": [0.2, 0.4, 0.6, 0.8],
        "min_samples_leaf": [2, 10, 20, 50]
       }

In [40]:
gscv = GridSearchCV(estimator=RandomForestRegressor(), param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)

In [41]:
gscv.fit(X, y)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] max_features=0.2, min_samples_leaf=2, n_estimators=5 ............
[CV]  max_features=0.2, min_samples_leaf=2, n_estimators=5, score=-0.4063163052558484, total=   3.6s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=2, n_estimators=5 ............
[CV]  max_features=0.2, min_samples_leaf=2, n_estimators=5, score=-0.4072767040559621, total=   3.7s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.9s remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=2, n_estimators=5 ............
[CV]  max_features=0.2, min_samples_leaf=2, n_estimators=5, score=-0.40618530835960454, total=   3.7s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.9s remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=2, n_estimators=10 ...........
[CV]  max_features=0.2, min_samples_leaf=2, n_estimators=10, score=-0.40631106861410954, total=   7.8s
[Parall

[CV]  max_features=0.2, min_samples_leaf=20, n_estimators=20, score=-0.40764697187383503, total=   6.7s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:  6.1min remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=20, n_estimators=50 ..........
[CV]  max_features=0.2, min_samples_leaf=20, n_estimators=50, score=-0.4144761648977189, total=  13.9s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:  6.4min remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=20, n_estimators=50 ..........
[CV]  max_features=0.2, min_samples_leaf=20, n_estimators=50, score=-0.410444059659439, total=  17.9s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  6.7min remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=20, n_estimators=50 ..........
[CV]  max_features=0.2, min_samples_leaf=20, n_estimators=50, score=-0.40987641962865345, total=  16.4s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  7.0min remaining:    0.0s
[CV] max_features=0.2, min_samples_leaf=50, n_estimato

[CV]  max_features=0.4, min_samples_leaf=10, n_estimators=10, score=-0.40650809979700797, total=   5.5s
[Parallel(n_jobs=1)]: Done  66 out of  66 | elapsed: 12.3min remaining:    0.0s
[CV] max_features=0.4, min_samples_leaf=10, n_estimators=20 ..........
[CV]  max_features=0.4, min_samples_leaf=10, n_estimators=20, score=-0.40679426446490724, total=   9.1s
[Parallel(n_jobs=1)]: Done  67 out of  67 | elapsed: 12.5min remaining:    0.0s
[CV] max_features=0.4, min_samples_leaf=10, n_estimators=20 ..........
[CV]  max_features=0.4, min_samples_leaf=10, n_estimators=20, score=-0.40734536995510523, total=  11.3s
[Parallel(n_jobs=1)]: Done  68 out of  68 | elapsed: 12.7min remaining:    0.0s
[CV] max_features=0.4, min_samples_leaf=10, n_estimators=20 ..........
[CV]  max_features=0.4, min_samples_leaf=10, n_estimators=20, score=-0.4064454829084755, total=  11.9s
[Parallel(n_jobs=1)]: Done  69 out of  69 | elapsed: 12.9min remaining:    0.0s
[CV] max_features=0.4, min_samples_leaf=10, n_estima

[CV]  max_features=0.6, min_samples_leaf=2, n_estimators=5, score=-0.40616774898539404, total=   3.9s
[Parallel(n_jobs=1)]: Done  99 out of  99 | elapsed: 18.1min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=2, n_estimators=10 ...........
[CV]  max_features=0.6, min_samples_leaf=2, n_estimators=10, score=-0.40631325509868055, total=   7.3s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 18.2min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=2, n_estimators=10 ...........
[CV]  max_features=0.6, min_samples_leaf=2, n_estimators=10, score=-0.4072701299039894, total=   7.5s
[Parallel(n_jobs=1)]: Done 101 out of 101 | elapsed: 18.4min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=2, n_estimators=10 ...........
[CV]  max_features=0.6, min_samples_leaf=2, n_estimators=10, score=-0.40617692177543646, total=   8.6s
[Parallel(n_jobs=1)]: Done 102 out of 102 | elapsed: 18.5min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=2, n_estimators=2

[CV]  max_features=0.6, min_samples_leaf=20, n_estimators=50, score=-0.4068103654092018, total=  21.2s
[Parallel(n_jobs=1)]: Done 132 out of 132 | elapsed: 26.0min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=50, n_estimators=5 ...........
[CV]  max_features=0.6, min_samples_leaf=50, n_estimators=5, score=-0.40655224268858253, total=   3.1s
[Parallel(n_jobs=1)]: Done 133 out of 133 | elapsed: 26.0min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=50, n_estimators=5 ...........
[CV]  max_features=0.6, min_samples_leaf=50, n_estimators=5, score=-0.4078011699445228, total=   3.2s
[Parallel(n_jobs=1)]: Done 134 out of 134 | elapsed: 26.1min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=50, n_estimators=5 ...........
[CV]  max_features=0.6, min_samples_leaf=50, n_estimators=5, score=-0.406410444985832, total=   3.0s
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 26.2min remaining:    0.0s
[CV] max_features=0.6, min_samples_leaf=50, n_estimators=1

[CV]  max_features=0.8, min_samples_leaf=10, n_estimators=20, score=-0.4063755017752602, total=  10.5s
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed: 32.0min remaining:    0.0s
[CV] max_features=0.8, min_samples_leaf=10, n_estimators=50 ..........
[CV]  max_features=0.8, min_samples_leaf=10, n_estimators=50, score=-0.4064121808278779, total=  24.4s
[Parallel(n_jobs=1)]: Done 166 out of 166 | elapsed: 32.5min remaining:    0.0s
[CV] max_features=0.8, min_samples_leaf=10, n_estimators=50 ..........
[CV]  max_features=0.8, min_samples_leaf=10, n_estimators=50, score=-0.4074433999019346, total=  25.6s
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed: 32.9min remaining:    0.0s
[CV] max_features=0.8, min_samples_leaf=10, n_estimators=50 ..........
[CV]  max_features=0.8, min_samples_leaf=10, n_estimators=50, score=-0.4062317313349785, total=  25.9s
[Parallel(n_jobs=1)]: Done 168 out of 168 | elapsed: 33.4min remaining:    0.0s
[CV] max_features=0.8, min_samples_leaf=20, n_estimator

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 50], 'max_features': [0.2, 0.4, 0.6, 0.8], 'min_samples_leaf': [2, 10, 20, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=600)

In [43]:
outfile = 'grid_ml_rfr.pkl'

with open(outfile, 'wb') as pickle_file: pickle.dump(gscv, pickle_file)

In [42]:
#best estimators
print("Best Estimator: ", gscv.best_estimator_)
#printing best scores
print("Best Score: ", gscv.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", gscv.best_params_)

Best Estimator:  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Best Score:  -0.40658340241
Best Parameters:  {'max_features': 0.2, 'min_samples_leaf': 2, 'n_estimators': 10}


In [45]:
#load pickle file for the random forest regressor gridsearch
grid_ml_rfr = pickle.load(open('grid_ml_rfr.pkl', "rb"))

#### Applying the best estimators to the Random Forest Regressor Model for testing:

In [46]:
model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [47]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [48]:
model.score(X_test, y_test)

0.069174438612197742

In [49]:
predictions = model.predict(X_test)
rmsle = sqrt(mean_squared_error(y_test, predictions))
print(rmsle)

0.636588838568236


In [50]:
from sklearn.linear_model import LinearRegression
estimators = [("lin_reg", LinearRegression())]
model = Pipeline(estimators)
model.fit(X, y)

params={"fit_intercept": [True,False],
        "normalize": [True,False]
       }

In [51]:
linreg_gscv = GridSearchCV(estimator=LinearRegression(), param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)

In [52]:
linreg_gscv.fit(X, y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.4155683120652552, total=   0.9s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.4164438254390223, total=   1.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.41563038475556513, total=   0.9s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.1s remaining:    0.0s
[CV] fit_intercept=True, normalize=False .............................
[CV]  fit_intercept=True, normalize=False, score=-0.41556831206533507, total=   0.9s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.2s remaining:    0.0s
[

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=600)

In [53]:
#best estimators
print("Best Estimator: ", linreg_gscv.best_estimator_)
#printing best scores
print("Best Score: ", linreg_gscv.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", linreg_gscv.best_params_)

Best Estimator:  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Best Score:  -0.415880840753
Best Parameters:  {'fit_intercept': True, 'normalize': False}


In [54]:
outfile = 'grid_ml_linreg.pkl'

with open(outfile, 'wb') as pickle_file: pickle.dump(linreg_gscv, pickle_file)

In [55]:
grid_ml_linreg = pickle.load(open('grid_ml_linreg.pkl', "rb"))

In [56]:
model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [57]:
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [58]:
model.score(X_test, y_test)

0.047382471802861037

In [59]:
predictions = model.predict(X_test)
rmsle = sqrt(mean_squared_error(y_test, predictions))
print(rmsle)

0.6439974585233642
