In [1]:
import gzip
from collections import defaultdict
import math
import random
import numpy as np
import scipy.optimize

def readGz(f):
  """Yield one parsed record per line of the gzip-compressed file `f`.

  Every line is evaluated as a Python expression, so the file is expected to
  hold one Python literal (e.g. a dict) per line.

  Args:
    f: Path or file object accepted by `gzip.open`.

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager closes the archive as soon as the generator is
  # exhausted or closed; a bare gzip.open() call leaves the handle open.
  with gzip.open(f) as handle:
    for line in handle:
      # SECURITY: eval() executes whatever code the file contains. Only use
      # this on trusted dumps; prefer ast.literal_eval or json.loads when the
      # file format allows it.
      yield eval(line)

# Read every record of the gzipped review dump into an in-memory list.
rawdata=[record for record in readGz("reviews_Electronics_5.json.gz")]

In [205]:
import string
def pad(c):
    """Return `c` with a single space prepended."""
    return ' '+c

# Translation table that maps every punctuation character to None, so
# str.translate() deletes it.
remove_table = str.maketrans('', '', string.punctuation)
# Translation table that maps every punctuation character to itself preceded
# by a space, so punctuation is split off from the neighbouring word.
keep_table = str.maketrans(
    dict(zip(string.punctuation, map(pad, string.punctuation)))
)
# Lookup keyed by a "remove punctuation?" flag: True deletes, False pads.
table = {True: remove_table, False: keep_table}

# Text normalisation applied to each review before fitting.
def clean(text,table,remove):
    """Lower-case `text` and translate it with the table chosen by `remove`.

    Args:
        text: String to normalise.
        table: Mapping from the `remove` flag to a `str.translate` table.
        remove: Key used to pick the translation table out of `table`.

    Returns:
        The lower-cased, translated string.
    """
    lowered = text.lower()
    translation = table[remove]
    return lowered.translate(translation)

# `data` is bound to the same list object as `rawdata` (no copy is made), so
# the loop below rewrites the review text of the raw records in place.
data=rawdata
for d in data:
    # Lower-case the review and strip its punctuation (remove=True).
    d.update(reviewText=clean(d['reviewText'],table,True))

In [206]:
# Work on the first 20000 reviews only.
data=data[:20000]
import pandas as pd
# Columns dropped from the frame; they are not used by the cells below.
unused_columns=['helpful','reviewTime','reviewerID','reviewerName',
                'summary','unixReviewTime']
# Build the frame and discard the unused columns in a single expression
# instead of mutating the frame with inplace=True.
df=pd.DataFrame.from_dict(data).drop(columns=unused_columns)

In [213]:
from sklearn.metrics import mean_squared_error as mse
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words counts restricted to the 2000 most frequent terms.
vectorizer=CountVectorizer(max_features=2000)
X=vectorizer.fit_transform(df.reviewText)
y=df.overall

# Fit ridge regression (alpha=1.0) first without and then with an intercept,
# printing the mean squared error on the same data the model was fitted on.
for use_intercept in (False, True):
    clf=linear_model.Ridge(1.0, fit_intercept=use_intercept)
    clf.fit(X.toarray(), y)
    predictions=clf.predict(X)
    print(mse(y,predictions))

5.1872416244052335
0.7448179899762848


In [220]:
# TF-IDF weights restricted to the 2000 most frequent terms.
vectorizer=TfidfVectorizer(max_features=2000)
X=vectorizer.fit_transform(df.reviewText)
y=df.overall

# Fit ridge regression (alpha=1.0) first without and then with an intercept,
# printing the mean squared error on the same data the model was fitted on.
for use_intercept in (False, True):
    clf=linear_model.Ridge(1.0, fit_intercept=use_intercept)
    clf.fit(X.toarray(), y)
    predictions=clf.predict(X)
    print(mse(y,predictions))

1.3945919324032563
0.6884491678880381


In [221]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Two-step pipeline: TF-IDF features ('vect') feeding a ridge regressor ('clf').
pipeline=Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', linear_model.Ridge()),
])
# Configure both steps in one call using the <step>__<parameter> naming.
pipeline.set_params(
    vect__max_features=2000,
    clf__alpha=1,
    clf__fit_intercept=False,
)
# Fit on the raw review text and print the error on that same text.
pipeline.fit(df.reviewText,y)
predictions=pipeline.predict(df.reviewText)
print(mse(y,predictions))

1.3975678867929218


In [222]:
from sklearn.model_selection import GridSearchCV
# Search space, keyed by <pipeline step>__<parameter>:
# 3 alphas x 1 intercept setting x 3 vocabulary sizes x 2 stop-word settings
# x 3 n-gram ranges = 54 candidate configurations.
params=dict(
    clf__alpha=[1,10,100],
    clf__fit_intercept=[True],
    vect__max_features=[1000,2000,3000],
    vect__stop_words=['english',None],
    vect__ngram_range=[(1,1),(1,2),(1,3)],
)
# NOTE(review): the output recorded under the fit cell reports 2 folds,
# 48 candidates and a different param_grid, so it was presumably produced by
# an earlier version of this cell - re-run to refresh it.
grid_search=GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_error',
)

In [223]:
# Run the grid search: the review text is the input and `y` is the target.
grid_search.fit(df.reviewText,y)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:  7.6min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...t_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__alpha': [0.1, 1, 10, 100], 'clf__fit_intercept': [True], 'vect__max_features': [1000, 2000], 'vect__stop_words': ['english', None], 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [224]:
# Display the best score found by the search. The search was configured with
# scoring='neg_mean_squared_error', so this is presumably the negated MSE
# (closer to 0 is better) - confirm against the scikit-learn docs.
grid_search.best_score_

-0.9093955528825496

In [225]:
# Display the parameter combination that achieved the best score.
grid_search.best_params_

{'clf__alpha': 1,
 'clf__fit_intercept': True,
 'vect__max_features': 2000,
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None}