In [52]:
import pandas as pd
import string
import nltk
from collections import defaultdict
from gensim import corpora
from scipy.sparse import csr_matrix

# English stopwords that are filtered out of the tokenized reviews.
# NOTE(review): many entries contain an apostrophe ("he'd", "don't", ...), but
# the review text has every string.punctuation character stripped before the
# stopword filter runs, so those entries presumably never match a token --
# confirm whether they should be normalized the same way as the reviews.
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
            "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
            "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "of",
            "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "shan't", "she", "she'd",
            "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
            "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
            "they've", "this", "those", "through", "to", "until", "up", "very", "was", "wasn't", "we", "we'd",
            "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
            "which", "while", "who", "who's", "whom", "with", "would", "you", "you'd", "you'll", "you're", "you've",
            "your", "yours", "yourself", "yourselves", "above", "again", "against", "aren't", "below", "but", "can't",
            "cannot", "couldn't", "didn't", "doesn't", "don't", "down", "few", "hadn't", "hasn't", "haven't", "if",
            "isn't", "mustn't", "no", "nor", "not", "off", "out", "over", "shouldn't", "same", "too", "under", "why",
            "why's", "won't", "wouldn't"]

# Read the comma-separated training and test sets from the working directory.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("train.csv", "test.csv")
)

In [53]:
#remove punctuation
# Translation table that deletes every character listed in string.punctuation.
translator = str.maketrans('', '', string.punctuation)


def strip_punctuation(review):
    """Return `review` with all string.punctuation characters removed."""
    return review.translate(translator)


# dropna() discards rows with a missing value in ANY column, not only 'Reviews';
# the surviving rows keep their original index labels.
no_na_train = train.dropna()
no_na_test = test.dropna()

non_punctuated_reviews_train = no_na_train['Reviews'].apply(strip_punctuation)
non_punctuated_reviews_test = no_na_test['Reviews'].apply(strip_punctuation)

In [54]:
#tokenization
def tokenize_review(text):
    """Lower-case `text` and split it into a list of tokens with nltk.word_tokenize."""
    return nltk.word_tokenize(text.lower())


# Each result is a Series holding one token list per review.
tokenized_reviews_train = non_punctuated_reviews_train.apply(tokenize_review)
tokenized_reviews_test = non_punctuated_reviews_test.apply(tokenize_review)

In [55]:
#remove stopwords
# A set gives constant-time membership tests instead of scanning the whole
# stopword list once per token.
stopword_set = set(stopwords)
# NOTE(review): the reviews had their punctuation stripped before tokenization,
# so stopwords that contain an apostrophe ("don't", "he'd", ...) presumably
# never match here. Normalizing them would also remove real words such as
# "well" or "ill", so the filter is deliberately left as-is -- confirm intent.


def remove_stopwords(tokenized_reviews):
    """Drop stopwords from every review.

    Parameters
    ----------
    tokenized_reviews : iterable of list of str
        One token list per review.

    Returns
    -------
    list of list of str
        For each review, its tokens that are not in `stopword_set`, in their
        original order.
    """
    return [[word for word in tokenized_review if word not in stopword_set]
            for tokenized_review in tokenized_reviews]


no_stopwords_reviews_train = remove_stopwords(tokenized_reviews_train)
no_stopwords_reviews_test = remove_stopwords(tokenized_reviews_test)

In [56]:
#calculate frequency and remove words occurring less than 5 times

# Minimum number of occurrences (over all training reviews) a token needs in
# order to be kept.
MIN_TOKEN_FREQUENCY = 5

# Count every token of the tokenized training reviews (stopwords included;
# their counts are simply never looked up below).
frequency = defaultdict(int)
for review in tokenized_reviews_train:
    for token in review:
        frequency[token] += 1

# Keep tokens seen at least MIN_TOKEN_FREQUENCY times, so that only words
# occurring less than 5 times are removed (a strict `> 5` comparison would
# also discard words occurring exactly 5 times).
reviews = [[token for token in review if frequency[token] >= MIN_TOKEN_FREQUENCY]
        for review in no_stopwords_reviews_train]

In [57]:
# Build the gensim Dictionary (token <-> integer id mapping) from the filtered
# training reviews; only tokens present in `reviews` receive an id.
id2word = corpora.Dictionary(documents=reviews)

In [58]:
#create sparse count matrices from the token dictionary
def build_count_matrix(token_lists, token2id, row_positions, n_rows, n_cols):
    """Build a sparse document-term count matrix.

    Parameters
    ----------
    token_lists : iterable of list of str
        One token list per review.
    token2id : dict
        Maps a token to its column index; tokens missing from it are skipped.
    row_positions : iterable of int
        Row of the matrix that each entry of `token_lists` is written to.
    n_rows, n_cols : int
        Shape of the returned matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (n_rows, n_cols). Every token occurrence contributes a
        1 at (row, column); csr_matrix sums duplicate coordinates, so a cell
        holds how often the token occurs in the review.
    """
    rows, cols, data = [], [], []
    for row, tokens in zip(row_positions, token_lists):
        for token in tokens:
            col = token2id.get(token)
            if col is not None:
                rows.append(int(row))
                cols.append(col)
                data.append(1)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))


#train
# `reviews` was derived from no_na_train (train after dropna()), so both the
# number of rows and the labels must come from no_na_train. Sizing the matrix
# with len(train) and taking labels from `train` pairs reviews with the labels
# of other rows as soon as a single row has been dropped.
assert len(reviews) == len(no_na_train), "features and labels are out of sync"
x_train = build_count_matrix(reviews, id2word.token2id,
                             range(len(reviews)), len(reviews), len(id2word))
y_train = no_na_train.iloc[:, 5]  # presumably the rating column -- TODO confirm

#test
# Write each surviving test review to the position of its ORIGINAL row in
# `test`. Rows removed by dropna() stay all-zero, so the matrix keeps
# len(test) rows and the predictions line up with `test` row by row.
test_row_positions = test.index.get_indexer(no_na_test.index)
x_test = build_count_matrix(no_stopwords_reviews_test, id2word.token2id,
                            test_row_positions, len(test), len(id2word))

In [59]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed for reproducible forests, all CPU cores, progress logging on.
forest_params = {
    "random_state": 111,
    "n_jobs": -1,
    "verbose": True,
}
classifier = RandomForestClassifier(**forest_params)
# Train on the count matrix and labels built from the training reviews.
classifier.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  3.1min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=111, verbose=True,
            warm_start=False)

In [60]:
# Predict one label per row of the test matrix and show a preview of them.
predicted = classifier.predict(X=x_test)
print(predicted)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.6s remaining:    0.4s


[5 5 5 ... 5 5 5]


[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    1.2s finished


In [61]:
# Attach the predictions to the test frame as a 'Rating' column (one value per
# row of `test`), then export the Id/Rating pairs without the index column.
test['Rating'] = predicted
results = test.loc[:, ['Id', 'Rating']]
results.to_csv(path_or_buf="results.csv", index=False)