In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import seaborn as sns

# Load the raw review export; pandas reads the zipped CSV directly.
df_reviews = pd.read_csv('reviews.csv.zip', encoding='utf-8')

# Keep only the rating and the review text. The explicit .copy() makes df_rev
# an independent frame, so the columns added to it later are ordinary
# assignments rather than writes to a slice of df_reviews (which pandas flags
# with SettingWithCopyWarning).
df_rev = df_reviews[['score', 'content']].copy()

def lower_and_remove_punctuation(review):
    """Return *review* lowercased with every ``string.punctuation`` character removed.

    Characters that are not in ``string.punctuation`` (letters, digits,
    whitespace, ...) are kept in their original order.
    """
    punctuation = frozenset(string.punctuation)
    kept_chars = [char for char in review.lower() if char not in punctuation]
    return "".join(kept_chars)

# Add a 'review_no_punct' column: every 'content' value passed through
# lower_and_remove_punctuation.
df_rev['review_no_punct'] = df_rev['content'].apply(lower_and_remove_punctuation)

def split_and_remove_stopwords(words, stop_words=None):
    """Return *words* lowercased with all stop words removed.

    Parameters
    ----------
    words : str
        Text to filter; it is lowercased and split on whitespace.
    stop_words : iterable of str, optional
        Lowercase tokens to drop. When omitted, the NLTK English stop-word
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    str
        The surviving tokens in their original order, each followed by a
        single space. An input with no surviving tokens yields ``''``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: a set gives O(1) membership tests and
    # avoids re-reading the stop-word list for every token.
    excluded = set(stop_words)

    kept = [token for token in words.lower().split() if token not in excluded]
    # The result is produced only after every token has been examined.
    return "".join(token + " " for token in kept)

# Store the result of split_and_remove_stopwords for each punctuation-free
# review in a 'review_no_stop' column.
# NOTE(review): nothing in the visible code reads 'review_no_stop' afterwards.
df_rev['review_no_stop'] = df_rev['review_no_punct'].apply(split_and_remove_stopwords)

# Single shared Porter stemmer instance, used by stems().
stemming = PorterStemmer()

def stems(string):
    """Return *string* with each space-separated token replaced by its stem.

    Tokens are stemmed with the module-level ``stemming`` object and every
    stemmed token is followed by a single space.
    """
    # Split on a literal single space (not on arbitrary whitespace), so
    # consecutive spaces produce empty tokens that are stemmed as well.
    tokens = string.split(' ')
    return "".join(stemming.stem(token) + " " for token in tokens)

# Stem every review and keep the result in 'review_processed'.
# NOTE(review): the input here is 'review_no_punct', not 'review_no_stop', so
# stop words are still present in the stemmed text -- confirm this is intended.
df_rev['review_processed'] = df_rev['review_no_punct'].apply(stems)

# Target: the 'score' column; feature text: the stemmed reviews.
y = df_rev['score']
x = df_rev['review_processed']

# Hold out a test set. No test_size or random_state is given, so scikit-learn's
# defaults apply and the split is not reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Bag-of-words token counts with CountVectorizer's default settings.
cv = CountVectorizer()

# Learn the vocabulary from the training text only, then expand the sparse
# count matrix into a dense DataFrame with one column per vocabulary term.
df_train  = pd.DataFrame(cv.fit_transform(x_train).todense(),
             columns=cv.get_feature_names())

# Encode the test text with the vocabulary fitted above (transform, not
# fit_transform), so both frames share the same columns.
df_test = pd.DataFrame(cv.transform(x_test).todense(),
             columns=cv.get_feature_names())

# Random forest regressor with default hyper-parameters.
rfr = RandomForestRegressor()

# Fit on the dense training counts.
rfr.fit(df_train, y_train)

# Score on the same data the model was fitted on (for a scikit-learn
# regressor, score() is the R^2 coefficient of determination).
rfr.score(df_train, y_train)

# Score on the held-out test data.
rfr.score(df_test, y_test)

def run_model(x_train, y_train, x_test, y_test, model):
    """Fit *model* on the training data and print three summary figures.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, passed to ``model.fit``.
    x_test, y_test
        Held-out features and targets.
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Prints the mean of ``y_test`` (labelled "Base model score"), followed by
    ``model.score`` on the training data and on the test data. Each figure is
    cut to the first six characters of its string form. Returns ``None``.
    """
    # Operate on the estimator that was passed in, so the function works for
    # whichever model the caller supplies.
    model.fit(x_train, y_train)

    # One pre-built string per print call: this form emits the same text under
    # the Python 2 print statement and the Python 3 print function.
    print("Base model score: " + str(np.mean(y_test))[:6])
    print("Training set score:  " + str(model.score(x_train, y_train))[:6])
    print("Test set score:  " + str(model.score(x_test, y_test))[:6])

# Refit the forest on the bag-of-words features and print the summary figures.
run_model(df_train, y_train, df_test, y_test, rfr)