# Mercari Price Prediction ML

In [1]:
import scipy as sp
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import scipy

#### Pulling in premade Kaggle competition data.

In [2]:
train = pd.read_csv("train.tsv", sep='\t', header=0)
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
test = pd.read_csv("test.tsv", sep='\t', header=0)
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


#### Need to remove null values from item_description, as they would interfere with the algorithm.

In [4]:
train = train[~train['item_description'].isnull()]
test = test[~test['item_description'].isnull()]

#### Combining datasets for train and test together so that all function performed in preparing the data will not skew the size of the matrix

In [5]:
train['log_price'] = np.log(train.price + 1)
q = train['log_price'].quantile(0.975)
train = train[train['log_price'] < q]
train = train[train['log_price'] > 0]

In [6]:
data = pd.concat([train, test], 0)
train_rows = train.shape[0]
print(train_rows)

1442262


Replacing 'No description yet' to no_desc for tfidf vectorizer to count these as stop words. Also transforming category_name into two distinct categories to introduce two features for the matrix

In [7]:
data.item_description = data.item_description.str.replace('No description yet', 'no_desc')
data['primary_cat'] = data.category_name.str.extract('([^/]+)/[^/]+/[^/]+')
data['secondary_cat'] = data.category_name.str.extract('[^/]+/([^/]+/[^/]+)')
data = data.apply(lambda x: x.astype(str).str.lower())

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men/tops/t-shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops/t-shirts
1,razer,electronics/computers & tablets/components & p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers & tablets/components & parts
2,target,women/tops & blouses/blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava-viv blouse,10.0,1,,2.0,women,tops & blouses/blouse
3,,home/home décor/home décor accents,1,new with tags. leather horses. retail for [rm]...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor/home décor accents
4,,women/jewelry/necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry/necklaces


In [9]:
text_features = ['name', 'brand_name', 'category_name', 'primary_cat', 'secondary_cat', 'item_description']
for t in text_features:
    data[t].replace(regex=True,inplace=True,to_replace=r'\W',value=r' ')
    
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


In [10]:
data['price'] = pd.to_numeric(data.price, errors = 'ignore')
data['item_condition_id'] = pd.to_numeric(data.item_condition_id, errors = 'ignore')
data['shipping'] = pd.to_numeric(data.shipping, errors = 'ignore')
data['item_description'] = data['item_description'].fillna('')

In [11]:
data['brand_name'] = data['brand_name'].replace([np.nan,'nan'], 'negative', regex=True)
data['brand_name'] = data['brand_name'].str.replace('\s+', '')  # in case there are multiple white spaces
data['brand_name'] = 'brand_' + data['brand_name'].astype(str)


In [12]:
data.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,log_price,name,price,shipping,test_id,train_id,primary_cat,secondary_cat
0,brand_negative,men tops t shirts,3,no_desc,2.3978952728,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,men,tops t shirts
1,brand_razer,electronics computers tablets components p...,3,this keyboard is in great condition and works ...,3.97029191355,razer blackwidow chroma keyboard,52.0,0,,1.0,electronics,computers tablets components parts
2,brand_target,women tops blouses blouse,1,adorable top with a hint of lace and a key hol...,2.3978952728,ava viv blouse,10.0,1,,2.0,women,tops blouses blouse
3,brand_negative,home home décor home décor accents,1,new with tags leather horses retail for rm ...,3.58351893846,leather horse statues,35.0,1,,3.0,home,home décor home décor accents
4,brand_negative,women jewelry necklaces,1,complete with certificate of authenticity,3.80666248977,24k gold plated rose,44.0,0,,4.0,women,jewelry necklaces


data.shape

In [13]:
text = (data['name'] + ' '+ data['brand_name'] + ' ' +
          data['item_description'] + ' ' + data['primary_cat'] + ' ' + 
          data['secondary_cat']).values

In [38]:
text_train = text[:train_rows]
X_condition = data['item_condition_id'][:train_rows]
X_dummy = data['shipping'][:train_rows]

text_test = text[train_rows:]
y_condition = data['item_condition_id'][train_rows:]
y_dummy = data['shipping'][train_rows:]
y = np.array(train['price'].values)

In [15]:
from nltk import word_tokenize          
#from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import nltk

#class LemmaTokenizer(object):
#    def __init__(self):
##        self.wnl = WordNetLemmatizer()
#    def __call__(self, articles):
#        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [16]:
import nltk
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
token_dict = {}
stemmer = PorterStemmer()

In [17]:
estimators = [("tf_idf", TfidfVectorizer(max_features = 5000, lowercase=True)), 
              ("ridge", Ridge())]
model = Pipeline(estimators)
model.fit(text_train, y)


params = {"ridge__alpha":[0.1, 0.3, 0.5, 1, 3], #regularization param
          "tf_idf__min_df": [1, 5],#min count of words allowed
          "tf_idf__max_df": [0.7,0.8],
          "tf_idf__ngram_range": [(1,1), (1,2)], #1-grams or 2-grams
          "tf_idf__stop_words": [None, "english"]#use stopwords or don't
         }


In [18]:
grid = GridSearchCV(estimator=model, param_grid = params, scoring = "neg_mean_squared_error" , verbose=600)
grid.fit(text_train, y)

Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV] ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.59077975260786, total= 1.4min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.31198492094188, total= 1.7min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.3min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-18

[CV]  ridge__alpha=0.1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.69866949019078, total= 3.2min
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 84.4min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.5908830535301, total= 1.6min
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 86.6min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.31110690182092, total= 1.6min
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed: 88.8min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=1, t

[Parallel(n_jobs=1)]: Done  47 out of  47 | elapsed: 179.2min remaining:    0.0s
[CV] ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.6982706053097, total= 3.9min
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 186.1min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.58129540709209, total= 6.5min
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed: 194.9min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__

[CV]  ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.59460403609947, total= 3.6min
[Parallel(n_jobs=1)]: Done  71 out of  71 | elapsed: 279.2min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.68836065533685, total= 3.3min
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 283.5min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None, score=-181.5807281074883, total= 1.6min
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed: 285.8min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__mi

[Parallel(n_jobs=1)]: Done  94 out of  94 | elapsed: 360.0min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.5945982363634, total= 2.8min
[Parallel(n_jobs=1)]: Done  95 out of  95 | elapsed: 363.9min remaining:    0.0s
[CV] ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.68873127850475, total= 2.5min
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 367.4min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf_idf__stop_words=None 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=1, tf_idf__ngram_range=(1, 1), tf

[CV]  ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.75867821973114, total= 2.4min
[Parallel(n_jobs=1)]: Done 118 out of 118 | elapsed: 431.4min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.58626320437614, total= 2.4min
[Parallel(n_jobs=1)]: Done 119 out of 119 | elapsed: 434.8min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.68272607328947, total= 2.4min
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 438.1min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.8, tf_

[Parallel(n_jobs=1)]: Done 141 out of 141 | elapsed: 493.0min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.75825252887304, total= 2.4min
[Parallel(n_jobs=1)]: Done 142 out of 142 | elapsed: 496.3min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.58612252047047, total= 2.4min
[Parallel(n_jobs=1)]: Done 143 out of 143 | elapsed: 499.7min remaining:    0.0s
[CV] ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=0.5, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2)

[CV]  ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None, score=-182.53315843329236, total= 2.1min
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed: 556.2min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.74844084514544, total= 1.9min
[Parallel(n_jobs=1)]: Done 166 out of 166 | elapsed: 558.8min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.57820433729958, total= 1.9min
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed: 561.4min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.7, tf_idf__min_df=5, 

[CV]  ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None, score=-182.5325790152072, total= 2.1min
[Parallel(n_jobs=1)]: Done 189 out of 189 | elapsed: 606.9min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.7490289833977, total= 1.8min
[Parallel(n_jobs=1)]: Done 190 out of 190 | elapsed: 609.5min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.57791180431542, total= 1.9min
[Parallel(n_jobs=1)]: Done 191 out of 191 | elapsed: 612.2min remaining:    0.0s
[CV] ridge__alpha=1, tf_idf__max_df=0.8, tf_idf__min_df=5, tf

[Parallel(n_jobs=1)]: Done 212 out of 212 | elapsed: 653.2min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None 
[CV]  ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None, score=-182.54940164217481, total= 2.4min
[Parallel(n_jobs=1)]: Done 213 out of 213 | elapsed: 656.4min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.76373137632757, total= 1.9min
[Parallel(n_jobs=1)]: Done 214 out of 214 | elapsed: 659.4min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=3, tf_idf__max_df=0.7, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_wor

[CV]  ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None, score=-182.35343460219747, total= 2.2min
[Parallel(n_jobs=1)]: Done 236 out of 236 | elapsed: 708.2min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None 
[CV]  ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=None, score=-182.54944248310616, total= 2.2min
[Parallel(n_jobs=1)]: Done 237 out of 237 | elapsed: 711.3min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english 
[CV]  ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf__ngram_range=(1, 2), tf_idf__stop_words=english, score=-179.76420376432304, total= 1.9min
[Parallel(n_jobs=1)]: Done 238 out of 238 | elapsed: 714.1min remaining:    0.0s
[CV] ridge__alpha=3, tf_idf__max_df=0.8, tf_idf__min_df=5, tf_idf

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': [0.1, 0.3, 0.5, 1, 3], 'tf_idf__min_df': [1, 5], 'tf_idf__max_df': [0.7, 0.8], 'tf_idf__ngram_range': [(1, 1), (1, 2)], 'tf_idf__stop_words': [None, 'english']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=600)

In [39]:
grid.best_params_

{'ridge__alpha': 1,
 'tf_idf__max_df': 0.7,
 'tf_idf__min_df': 5,
 'tf_idf__ngram_range': (1, 2),
 'tf_idf__stop_words': 'english'}

#### Separating the training and test sets again after massaging the data for the distinct train and test sets with the train_rows mask. Also creating the y value with train['price'] as log as we had found during the exploratory data analysis portion of the project

In [42]:
tfidf = TfidfVectorizer(max_features = 5000, lowercase=True, max_df=0.7, min_df=5, ngram_range=(1,2), stop_words= 'english')
text_transformed = tfidf.fit_transform(text_train)

In [59]:
text_transformed = text_transformed.astype(int)

In [60]:
X_dummies = pd.get_dummies(X_condition, X_dummy, sparse=True)
X_dummies = X_dummies.values
X_dummies = scipy.sparse.csr_matrix(X_dummies).astype(int)

In [50]:
print(text_transformed.shape)
print(X_dummies.shape)

(1442262, 5000)
(1442262, 5)


In [62]:
X = scipy.sparse.csr_matrix(text_transformed,X_dummies)

ValueError: Only two-dimensional sparse arrays are supported.

In [None]:
X.shape

In [None]:
dummies = dummies.astype(int)

In [None]:
X_text = text_transformed[:train_rows]
Y_text = text_transformed[train_rows:]

X_dummies = dummies[:train_rows]
Y_dummies = dummies[train_rows:]

In [None]:
y = np.array(train['price'].values)

In [None]:
train = X[:train_rows]
test = X[train_rows:]

In [None]:
train_price = data['price'][:train_rows]
train_price = train_price.astype(float)

In [None]:
y= np.log1p(train_price)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

estimators = [("tf_idf", TfidfVectorizer()), 
              ("ridge", Ridge())]

model = Pipeline(estimators)

model.fit(train, y) 

Pipeline(steps=[('tf_idf', TfidfVectorizer(tokenizer = LemmaTokenzier(),analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=.6, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001))])


params = {"ridge__alpha":[0.1, 0.3, 1, 3, 10], #regularization param
          "tf_idf__min_df": [1, 3, 10],#min count of words allowed
          "tf_idf__max_df": [0.5,0.6,0.7],
          "tf_idf__ngram_range": [(1,1), (1,2), (1,3)], #1-grams or 2-grams
          "tf_idf__stop_words": [None, "english"]#use stopwords or don't
          "tf_idf__max_features": [10000, 25000, 50000]
         }


In [None]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.4, random_state=42)

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

for Model in [Ridge, RandomForestRegressor]:
    model = Model()
    print('%s: %s' % (Model.__name__,
                      cross_val_score(model, X_train, y_train).mean()))

In [None]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor as lgb
param_grid = {'learning_rate': [0.01, 0.1, 0.05, .5, 1], 
              'n_estimators': [20, 40, 60], 
              'num_leaves': [30, 35, 40]}

optimized_GBM = GridSearchCV(lgb(objective='regression', verbose=200),
                             param_grid = param_grid,
                             cv=5,
                             n_jobs=-1
                             )

optimized_GBM.fit(X_train, y_train)

In [None]:

optimized_GBM.score(X_test, y_test)

In [None]:
from sklearn.linear_model import RidgeCV

model = Ridge()
params={'alpha': [.0001,.00001]}
gscv = GridSearchCV(estimator=model,
                  param_grid=params,
                  verbose=200)

In [None]:
from sklearn.metrics import mean_squared_error

ridge = RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, 
        normalize=False, scoring='mean_squared_error', cv=5, gcv_mode='auto',
        store_cv_values=False)


In [None]:
ridge.fit(X_train, y_train).score(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
params={'n_estimators': [5,10,15,20,25]}
gscv = GridSearchCV(estimator=model,
                  param_grid=params,
                  scoring='mean_squared_error',
                  n_jobs=-1,
                  cv=5,
                  verbose=200)

gscv.fit(X_train, y_train)
#best estimators
print("Best Estimator: ", gscv.best_estimator_)
#printing best scores
print("Best Score: ", gscv.best_score_)
#printing best parameters for optimal parameter tuning
print("Best Parameters: ", gscv.best_params_)


In [None]:
predictions = gscv.predict(X_test)
#Would need to do a root mean squared error on the predictions vs real y_test.
from sklearn.metrics import mean_squared_error
from math import sqrt

rmsle = sqrt(mean_squared_error(y_test, predictions))
