In [1]:
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the reviews CSV and discard every row that contains a missing value,
# building the cleaned frame in one expression instead of mutating it in place.
data = pd.read_csv("./reviews_watches_300k.csv").dropna(axis=0)
data.head()

Unnamed: 0,star_rating,review_body
0,5,Absolutely love this watch! Get compliments al...
1,5,I love this watch it keeps time wonderfully.
2,2,Scratches
3,5,"It works well on me. However, I found cheaper ..."
4,4,Beautiful watch face. The band looks nice all...


In [3]:
def parse_single_input(description):
    """Normalize one review string into a space-separated string of lemmas.

    Steps, in order: every character outside ``a-z``/``A-Z`` is replaced by a
    space, the text is lower-cased, split into tokens with
    ``nltk.word_tokenize``, each token is passed through a WordNet
    lemmatizer, and the results are re-joined with single spaces.

    Parameters
    ----------
    description : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lemmatized text.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", description).lower()
    lemmatizer = nltk.WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
    )
    return " ".join(lemmas)

In [4]:
## CLEAN DATA - TAKES ABOUT 6 MINUTES FOR 300K RECORDS
def clean_data(input):
    """Clean every entry of ``input.review_body`` with ``parse_single_input``.

    Parameters
    ----------
    input : object exposing a ``review_body`` iterable
        The notebook passes the reviews DataFrame.

    Returns
    -------
    list
        One cleaned string per entry, in the original iteration order.
    """
    return [parse_single_input(review) for review in input.review_body]

In [5]:
# Target: the star rating column. Features: the cleaned review text
# (the cleaning pass is the slow step of this notebook).
y = data['star_rating']
x = clean_data(data)

In [6]:
# Turn the cleaned reviews into a word-count matrix, dropping scikit-learn's
# built-in English stop words and keeping the full vocabulary.
vectorizer = CountVectorizer(
    analyzer='word',
    stop_words="english",
    max_features=None,
)
pp_data = vectorizer.fit_transform(x)

In [7]:
# Persist the fitted vectorizer. The context manager closes the file handle
# deterministically; the bare open() call left it open until garbage
# collection, which can leave the pickle unflushed or locked.
with open('vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [8]:
# Alias the vectorized features and the ratings under the names used by the
# train/test split below.
result_x, result_y = pp_data, y

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. train_test_split is already imported in the first cell,
# so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    result_x,
    result_y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Write the sparse training feature matrix to disk in .npz format.
sparse.save_npz(file="x_train.npz", matrix=X_train)

In [11]:
# Fit a ridge regression of star rating on the word-count features.
# alpha is the regularization strength passed to Ridge.
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [12]:
# Persist the fitted model. The context manager closes the file handle
# deterministically; the bare open() call left it open until garbage
# collection, which can leave the pickle unflushed or locked.
with open("prediction_model.sav", 'wb') as model_file:
    pickle.dump(model, model_file)