In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train = pd.read_csv("./data/X_train.csv")
test = pd.read_csv("./data/X_test.csv")

movieNums= train['ProductId'].value_counts()
movieNums= movieNums.loc[movieNums.values > 150]
movieNums = movieNums.keys().tolist()
reducedTrain = train[train['ProductId'].isin(movieNums)]

userNums= reducedTrain['UserId'].value_counts()
userNums= userNums.loc[userNums.values > 8]
userNums = userNums.keys().tolist()
reducedTrain = reducedTrain[reducedTrain['UserId'].isin(userNums)]

txtTrain= reducedTrain[['Summary', 'Score']]
len(txtTrain)
txtTrain

Unnamed: 0,Summary,Score
445,"""It Is as It Was!"" True, Even If Not Said!",5.0
450,Tough Passion,5.0
452,The Passion of Mel Gibson,1.0
453,2.5 stars,2.0
454,Powerful.,4.0
...,...,...
1396617,Anderson's movies keep getting better (althoug...,3.0
1396623,"Many stars, but it works well with the film",5.0
1396625,Ornate Silliness,2.0
1396627,Meandering story,3.0


In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk.data
import string
import nltk
import re

def review_to_words (review):
    rev = re.sub('[^a-zA-Z]', ' ', review)
    words = rev.lower().split()
    stops = set(stopwords.words('english'))
    words = [w for w in words if not w in stops]
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(w) for w in words]
    return (' '.join(words))

review_to_words(txtTrain['Summary'].iloc[0])

'true even said'

In [None]:
clean_train = []
for i in range(0, len(txtTrain)):
    if (i+1)%1000 == 0:
        print('Review {} of {}'.format(i+1, len(txtTrain)))
    clean_train.append(review_to_words(txtTrain['Summary'].iloc[i]))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer= "word",
                                    tokenizer= None,
                                    preprocessor = None,
                                    max_df = 0.5,
                                    min_df = 2,
                                    ngram_range=(1,3),
                                    stop_words = None,
                                    max_features = 6000)

train_data_features = vectorizer.fit_transform(clean_train)


In [5]:
print(train_data_features.shape)

(190254, 6000)


In [6]:
vocab = vectorizer.get_feature_names()
vocab[:10]



['abbey',
 'abbey season',
 'abduct',
 'abid',
 'abl',
 'aboard',
 'abound',
 'abraham',
 'abraham lincoln',
 'abram']

In [7]:
dist = np.sum(train_data_features, axis = 0)
pd.DataFrame(dist, columns = vocab)

Unnamed: 0,abbey,abbey season,abduct,abid,abl,aboard,abound,abraham,abraham lincoln,abram,...,zero star,zodiac,zombi,zombi film,zombi flick,zombi movi,zombi movi ever,zombieland,zone,zoo
0,58,20,18,19,27,19,18,25,20,22,...,51,20,640,53,51,123,18,23,57,31


In [8]:
def score_preprocessing(value):
    if value <= 2:
        return '-1'
    elif value == 3:
        return '0'
    else: 
        return '1'

txtTrain['Sentiment'] = txtTrain['Score'].apply(score_preprocessing) 
txtTrain.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  txtTrain['Sentiment'] = txtTrain['Score'].apply(score_preprocessing)


Unnamed: 0,Summary,Score,Sentiment
445,"""It Is as It Was!"" True, Even If Not Said!",5.0,1
450,Tough Passion,5.0,1
452,The Passion of Mel Gibson,1.0,-1
453,2.5 stars,2.0,-1
454,Powerful.,4.0,1


In [9]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(
    n_estimators = 100, n_jobs = -1, random_state = 13, max_depth = 8)

forest = forest.fit(train_data_features, txtTrain['Sentiment'])

In [10]:
clean_test = []
for i in range(0, len(test)):
    if (i+1)%5000 == 0:
        print('Review {} of {}'.format(i+1, len(test)))
    clean_test.append(review_to_words(test['Summary'].iloc[i]))

Review 5000 of 300000
Review 10000 of 300000
Review 15000 of 300000
Review 20000 of 300000
Review 25000 of 300000
Review 30000 of 300000
Review 35000 of 300000
Review 40000 of 300000
Review 45000 of 300000
Review 50000 of 300000
Review 55000 of 300000
Review 60000 of 300000
Review 65000 of 300000
Review 70000 of 300000
Review 75000 of 300000
Review 80000 of 300000
Review 85000 of 300000
Review 90000 of 300000
Review 95000 of 300000
Review 100000 of 300000
Review 105000 of 300000
Review 110000 of 300000
Review 115000 of 300000
Review 120000 of 300000
Review 125000 of 300000
Review 130000 of 300000
Review 135000 of 300000
Review 140000 of 300000
Review 145000 of 300000
Review 150000 of 300000
Review 155000 of 300000
Review 160000 of 300000
Review 165000 of 300000
Review 170000 of 300000
Review 175000 of 300000
Review 180000 of 300000
Review 185000 of 300000
Review 190000 of 300000
Review 195000 of 300000
Review 200000 of 300000
Review 205000 of 300000
Review 210000 of 300000
Review 21500

In [11]:
clean_1 = clean_test[:100000]
clean_2 = clean_test[100000:200000]
clean_3 = clean_test[200000:]

In [12]:
test_data_features1 = vectorizer.transform(clean_test)
test_data_features1 = test_data_features1.toarray()

result1 = forest.predict(test_data_features1)

output = pd.DataFrame(data = {"Id": test['Id'], "Sentiment": result1})
output.to_csv("summarySentiment2.csv", index= None)

In [13]:
# test_data_features2 = vectorizer.transform(clean_2)
# test_data_features2 = test_data_features2.toarray()

# result2 = forest.predict(test_data_features2)

# output = pd.DataFrame(data = {"Id": test['Id'][100000:200000], "Sentiment": result2})
# output.to_csv("newresult2_sum.csv", index= None)

In [14]:
# test_data_features3 = vectorizer.transform(clean_3)
# test_data_features3 = test_data_features3.toarray()

# result3 = forest.predict(test_data_features3)

# output = pd.DataFrame(data = {"Id": test['Id'][200000:], "Sentiment": result3})
# output.to_csv("newresult3_sum.csv", index= None)

In [15]:
# result1 = pd.read_csv("newresult1.csv")
# result2 = pd.read_csv("newresult2.csv")
# result3 = pd.read_csv("newresult3.csv")
# result = pd.concat([result1, result2])
# result = pd.concat([result, result3])

In [16]:
# result.to_csv("textSentiment2.csv", index= None)
# result.shape

In [17]:
sampleTest = train.sample(n=10000)
sampleTest

clean_sample = []
for i in range(0, len(sampleTest)):
    if (i+1)%5000 == 0:
        print('Review {} of {}'.format(i+1, len(sampleTest)))
    clean_sample.append(review_to_words(sampleTest['Summary'].iloc[i]))

test_data_features = vectorizer.transform(clean_sample)
test_data_features = test_data_features.toarray()

result = forest.predict(test_data_features)
result
output = pd.DataFrame(data = {"Id": sampleTest['Id'], "Summary": sampleTest['Summary'], "Original_Sentiment": sampleTest['summSentiment'], "Result_Sentiment": result})
output.to_csv("sampleSummary.csv", index= None)    

Review 5000 of 10000
Review 10000 of 10000
