# Mercari Price Prediction ML

## Importing Libraries:

In [1]:
import scipy as sp
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import scipy
import pickle


## RMSLE Function:

In [2]:
def rmsle(y_pred, y_test) : 
    assert len(y_test.astype(int)) == len(y_pred.astype(int))
    return(np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2)))

#### Pulling in the data

In [3]:
train = pd.read_csv("train.tsv", sep='\t', header=0)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
test = pd.read_csv("test.tsv", sep='\t', header=0)
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


## Data Cleaning

#### Need to remove null values from item_description, as they would interfere with the algorithm.

In [5]:
#Deleting null rows to clean up the data.
train = train[~train['item_description'].isnull()]
test = test[~test['item_description'].isnull()]

#### Combining datasets for train and test together so that all function performed in preparing the data will not skew the size of the matrix. Here, we are also going to remove 0 value entries for log price and other outliers as they would skew the data.

In [6]:
#using log price and pre-setting quantiles to delete outliers within the data.
train['log_price'] = np.log(train.price + 1)
q = train['log_price'].quantile(0.975)
train = train[train['log_price'] < q]
train = train[train['log_price'] > 0]

In [7]:
data = pd.concat([train, test], 0)
train_rows = train.shape[0]
print(train_rows)

1442262


#### Replacing 'No description yet' to no_desc for tfidf vectorizer to count these as stop words. Also transforming category_name into two distinct categories to introduce two features for the matrix

In [8]:
data.item_description = data.item_description.str.replace('No description yet', 'no_desc')
data['primary_cat'] = data.category_name.str.extract('([^/]+)/[^/]+/[^/]+')
data['secondary_cat'] = data.category_name.str.extract('[^/]+/([^/]+/[^/]+)')
data = data.apply(lambda x: x.astype(str).str.lower())

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men/tops/t-shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops/t-shirts
1,razer,electronics/computers & tablets/components & p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers & tablets/components & parts
2,target,women/tops & blouses/blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava-viv blouse,10.0,1,,2.0,women,tops & blouses/blouse
3,,home/home décor/home décor accents,1,new with tags. leather horses. retail for [rm]...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor/home décor accents
4,,women/jewelry/necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry/necklaces


In [10]:
text_features = ['name', 'brand_name', 'category_name', 'primary_cat', 'secondary_cat', 'item_description']
for t in text_features:
    data[t].replace(regex=True,inplace=True,to_replace=r'\W',value=r' ')
    
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


In [11]:
data['item_condition_id'] = pd.to_numeric(data.item_condition_id, errors = 'ignore')
data['shipping'] = pd.to_numeric(data.shipping, errors = 'ignore')
data['item_description'] = data['item_description'].fillna('')

In [12]:
data['brand_name'] = data['brand_name'].replace([np.nan,'nan'], 'negative', regex=True)
data['brand_name'] = data['brand_name'].str.replace('\s+', '')  # in case there are multiple white spaces
data['brand_name'] = 'brand_' + data['brand_name'].astype(str)


In [13]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,brand_negative,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,brand_razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,brand_target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,brand_negative,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,brand_negative,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


#### Combining text values into one single text  corpus

In [14]:
text = (data['name'] + ' '+ data['brand_name'] + ' ' +
          data['item_description'] + ' ' + data['primary_cat'] + ' ' + 
          data['secondary_cat']).values

In [15]:
#separating the text data into train and test set again
text_train = text[:train_rows]
text_test = text[train_rows:]


dummies = scipy.sparse.csr_matrix(pd.get_dummies(data[["shipping",'item_condition_id']], sparse = True).values)
X_dummies = dummies[:train_rows]
y_dummies = dummies[train_rows:]
y = np.array(train['log_price'].values)

#### This gridsearch took at upwards of 12 hours for 405 candidates for a total of 1215 fits. The results of the gridsearch have been pickled and saved to a pkl file and the code has been converted to markdown format. However, because gridsearch is a CPU intensive exercise and I am working on a local machine, the results will stand as is, but some features that have interfered with the outputs will be lef tout 

In [None]:
#estimators = [("tf_idf", TfidfVectorizer(lowercase=True)), 
#              ("ridge", Ridge())]
#model = Pipeline(estimators)
#model.fit(text_train, y)


#params = {"ridge__alpha":[0.1, 0.3, 0.5, 1, 3], #regularization param
#          "tf_idf__min_df": [1, 5, 10],#min count of words allowed
#          "tf_idf__max_df": [0.7,0.8,0.9],
#          "tf_idf__ngram_range": [(1,1), (1,2), (1,3)], #1-grams or 2-grams
#          "tf_idf__max_features": [5000, 7500, 10000]#5000 features, 7500, or 10000
#         }
#grid = GridSearchCV(estimator=model, param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)
#grid.fit(text_train, y)
#import pickle

#Pickling the output:
#outfile = 'grid_ml.pkl'

#with open(outfile, 'wb') as pickle_file:
#    pickle.dump(grid, pickle_file)

#### The gridsearch results for tfidf vectorizer and the following best estimator/best score/best parameters from the pickle file are uploaded in the cell below:

In [16]:
import pickle
grid_ml = pickle.load(open('grid_ml.pkl', "rb"))

#best estimators
print("Best Estimator: ", grid_ml.best_estimator_)
#printing best scores
print("Best Score: ", grid_ml.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", grid_ml.best_params_)

Best Estimator:  Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=10000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best Score:  -0.222323799127
Best Parameters:  {'ridge__alpha': 1, 'tf_idf__max_df': 0.7, 'tf_idf__max_features': 10000, 'tf_idf__min_df': 1, 'tf_idf__ngram_range': (1, 2)}


#### Separating the training and test sets again after massaging the data for the distinct train and test sets with the train_rows mask. Also creating the y value with train['price'] as log as we had found during the exploratory data analysis portion of the project.

#### Transforming the text data into a matrix and combinining dummies data (with shipping and item condition) into the matrix

In [17]:
tfidf = TfidfVectorizer(max_features = 10000, lowercase=True, max_df=0.7, min_df=1, ngram_range=(1,2), stop_words= 'english')
text_transformed = tfidf.fit_transform(text_train)

#### Checking to see the result of the combined matrix if the row counts are in line with what we originally started with for validation purposes:

In [18]:
print(text_transformed.shape)
print(X_dummies.shape)

(1442262, 10000)
(1442262, 2)


In [19]:
X = scipy.sparse.hstack((X_dummies, text_transformed)).tocsr()

In [20]:
X.shape

(1442262, 10002)

#### Train Test Split of the data to create a sample train/test from the training data set.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(text_transformed, y, test_size=0.4, random_state=42)

#### Trying to see what parts of a ridge regression could help reach the best score. by running grid search and pickling the results as done before, the run code is below in markdown format while the reference to the pickle file is below.

In [22]:
from sklearn.model_selection import GridSearchCV

#estimators = [("ridge", Ridge())]
#model = Pipeline(estimators)
#model.fit(X, y)


#params = {"ridge__alpha":[0.1, 0.3, 0.5, 1, 3], #regularization param
#         "ridge__fit_intercept": [True, False],
#          "ridge__normalize": [True, False]    
#         }
               
#grid = GridSearchCV(estimator=model, param_grid = params, verbose=600) 
#grid.fit(X, y)

#Pickling the output:
#outfile = 'grid_ridge.pkl'

#with open(outfile, 'wb') as pickle_file:
#    pickle.dump(grid, pickle_file)

In [23]:
grid_ridge = pickle.load(open('grid_ridge.pkl', "rb"))
#best estimators
print("Best Estimator: ", grid_ridge.best_estimator_)
#printing best scores
print("Best Score: ", grid_ridge.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", grid_ridge.best_params_)


Best Estimator:  Pipeline(memory=None,
     steps=[('ridge', Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best Score:  0.518507368615
Best Parameters:  {'ridge__alpha': 1, 'ridge__fit_intercept': True, 'ridge__normalize': False}


#### Applying the best estimators to the Ridge Regression Model for testing:

In [24]:
ridge = Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='atuo', tol=0.001)

In [25]:
ridge.fit(X_train, y_train)



Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='atuo', tol=0.001)

In [26]:
print("Ridge Regression Score: ",(ridge.score(X_test,y_test))*100,"%")

Ridge Regression Score:  49.4311120567 %


In [27]:
predictions = ridge.predict(X_test)
#Would need to do a root mean squared error on the predictions vs real y_test.
from sklearn.metrics import mean_squared_error
from math import sqrt

rmsle = rmsle(y_test, predictions)
print("Root Mean Squared Logarithmic Error for Ridge Regression: ",rmsle*100,"%")

Root Mean Squared Logarithmic Error for Ridge Regression:  12.2031294305 %


In [30]:
from sklearn.ensemble import RandomForestRegressor
estimators = [("rfr", RandomForestRegressor())]
model = Pipeline(estimators)
#model.fit(X, y)

params={"n_estimators": [10, 20, 50, 100],
        "max_features": [0.2, 0.4, 0.6, 0.8,1],
        "min_samples_leaf": [2, 10, 20, 50, 100]
       }


In [None]:
gscv = GridSearchCV(estimator=RandomForestRegressor(), param_grid = params, verbose=600)
gscv.fit(X, y)



In [None]:
#Pickling the output:
outfile = 'grid_ml_rfr.pkl'

with open(outfile, 'wb') as pickle_file: pickle.dump(gscv, pickle_file)

In [28]:
#load pickle file for the random forest regressor gridsearch
grid_ml_rfr = pickle.load(open('grid_ml_rfr.pkl', "rb"))
#best estimators
print("Best Estimator: ", grid_ml_rfr.best_estimator_)
#printing best scores
print("Best Score: ", grid_ml_rfr.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", grid_ml_rfr.best_params_)

Best Estimator:  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Best Score:  -0.40658340241
Best Parameters:  {'max_features': 0.2, 'min_samples_leaf': 2, 'n_estimators': 10}


#### Applying the best estimators to the Random Forest Regressor Model for testing:

In [31]:
model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [32]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [33]:
print("Random Forest Regressor Score: ",model.score(X_test, y_test)*100,"%")

Random Forest Regressor Score:  48.8483840713 %


In [34]:
predictions= model.predict(X_test)


In [35]:
rmsle = rmsle(y_test, predictions)
print("Root Mean Squared Logarithmic Error for Random Forest Regressor: ",rmsle*100,"%")

TypeError: 'numpy.float64' object is not callable

In [36]:
from sklearn.linear_model import LinearRegression
estimators = [("lin_reg", LinearRegression())]
model = Pipeline(estimators)
model.fit(X, y)

params={"fit_intercept": [True,False],
        "normalize": [True,False]
       }
linreg_gscv = GridSearchCV(estimator=LinearRegression(), param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)

linreg_gscv.fit(X, y)
#Pickling the output:
outfile = 'grid_ml_linreg.pkl'

with open(outfile, 'wb') as pickle_file: pickle.dump(linreg_gscv, pickle_file)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.2103649835455393, total= 4.2min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.2min remaining:    0.0s
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.21065075600566505, total= 4.4min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.6min remaining:    0.0s
[CV] fit_intercept=True, normalize=True ..............................
[CV]  fit_intercept=True, normalize=True, score=-0.2100811760610622, total= 4.2min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 12.8min remaining:    0.0s
[CV] fit_intercept=True, normalize=False .............................
[CV]  fit_intercept=True, normalize=False, score=-0.21036546741215942, total= 5.0min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 17.8min remaining:    0.0s
[

In [37]:
grid_ml_linreg = pickle.load(open('grid_ml_linreg.pkl', "rb"))

In [38]:
#best estimators
print("Best Estimator: ", grid_ml_linreg.best_estimator_)
#printing best scores
print("Best Score: ", grid_ml_linreg.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", grid_ml_linreg.best_params_)

Best Estimator:  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Best Score:  -0.210363433423
Best Parameters:  {'fit_intercept': True, 'normalize': False}


In [39]:
model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
print("Linear Regression Score: ",model.score(X_test, y_test)*100,"%")

Linear Regression Score:  49.1602373287 %


In [42]:
predictions = model.predict(X_test)
rmsle = rmsle(y_test, predictions)
print("Root Mean Squared Logarithmic Error for Linear Regression: ",rmsle*100,"%")

TypeError: 'numpy.float64' object is not callable