In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train = pd.read_csv("./data/X_train.csv")
test = pd.read_csv("./data/X_test.csv")

newtrain= train[['Text', 'Score']]


In [3]:
newtrain.Score.value_counts()

5.0    746520
4.0    315587
3.0    165727
1.0     85615
2.0     84084
Name: Score, dtype: int64

In [4]:
movieNums= newtrain['ProductId'].value_counts()
movieNums= movieNums.loc[movieNums.values > 150]
movieNums = movieNums.keys().tolist()
reducedTrain = train[train['ProductId'].isin(movieNums)]

userNums= reducedTrain['UserId'].value_counts()
userNums= userNums.loc[userNums.values > 5]
userNums = userNums.keys().tolist()
txtTrain = reducedTrain[reducedTrain['UserId'].isin(userNums)]
txtTrain

Unnamed: 0,Text,Score
207,This is so lame! The songs are terrible! The v...,1.0
209,This is with out a doubt is one of the worst C...,1.0
210,Narration is OK but the story is weak and the ...,1.0
237,If you love the Rankin-Bass version of this cl...,1.0
336,What's all this complaining about Destiny's Ch...,1.0
...,...,...
41718,"I have always loved this movie, but the DVD ho...",5.0
41719,This B/W film is simply one of the best movies...,5.0
41720,"Of course, Gregory Peck will not disappoint yo...",5.0
41722,"I have always enjoyed this movie, I now own th...",5.0


In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk.data
import string
import nltk
import re

def review_to_words (review):
    rev = re.sub('[^a-zA-Z]', ' ', review)
    words = rev.lower().split()
    stops = set(stopwords.words('english'))
    words = [w for w in words if not w in stops]
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(w) for w in words]
    return (' '.join(words))

review_to_words(txtTrain['Text'].iloc[0])

'lame song terribl villan ripoff mr burn simpson kid look like loan charli brown show biggest lowpoint polit correct rant bad cbs insist run back back classic'

In [None]:
clean_train = []
for i in range(0, len(txtTrain)):
    if (i+1)%1000 == 0:
        print('Review {} of {}'.format(i+1, len(txtTrain)))
    clean_train.append(review_to_words(txtTrain['Text'].iloc[i]))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer= "word",
                                    tokenizer= None,
                                    preprocessor = None,
                                    max_df = 0.6,
                                    ngram_range=(1,3),
                                    stop_words = None,
                                    max_features = 15000)

train_data_features = vectorizer.fit_transform(clean_train)


In [8]:
print(train_data_features.shape)

(125000, 15000)


In [None]:
vocab = vectorizer.get_feature_names()
vocab[:10]

In [10]:
dist = np.sum(train_data_features, axis = 0)
pd.DataFrame(dist, columns = vocab)

Unnamed: 0,aaron,ab,abandon,abbey,abbi,abbott,abc,abduct,abil,abl,...,zero star,zeta,zeta jone,zhang,zip,zombi,zombi movi,zone,zoom,zorro
0,172,116,999,179,162,159,151,197,2290,4919,...,205,274,260,114,117,2014,193,546,293,452


In [11]:
def score_preprocessing(value):
    if value <= 2:
        return '-1'
    elif value == 3:
        return '0'
    else: 
        return '1'

txtTrain['Sentiment'] = txtTrain['Score'].apply(score_preprocessing) 
txtTrain.head()

Unnamed: 0,Text,Score,Sentiment
207,This is so lame! The songs are terrible! The v...,1.0,-1
209,This is with out a doubt is one of the worst C...,1.0,-1
210,Narration is OK but the story is weak and the ...,1.0,-1
237,If you love the Rankin-Bass version of this cl...,1.0,-1
336,What's all this complaining about Destiny's Ch...,1.0,-1


In [13]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(
    n_estimators = 100, n_jobs = -1, random_state = 13)

forest = forest.fit(train_data_features, txtTrain['Sentiment'])

In [14]:
clean_test = []
for i in range(0, len(test)):
    if (i+1)%5000 == 0:
        print('Review {} of {}'.format(i+1, len(test)))
    clean_test.append(review_to_words(test['Text'].iloc[i]))

Review 5000 of 300000
Review 10000 of 300000
Review 15000 of 300000
Review 20000 of 300000
Review 25000 of 300000
Review 30000 of 300000
Review 35000 of 300000
Review 40000 of 300000
Review 45000 of 300000
Review 50000 of 300000
Review 55000 of 300000
Review 60000 of 300000
Review 65000 of 300000
Review 70000 of 300000
Review 75000 of 300000
Review 80000 of 300000
Review 85000 of 300000
Review 90000 of 300000
Review 95000 of 300000
Review 100000 of 300000
Review 105000 of 300000
Review 110000 of 300000
Review 115000 of 300000
Review 120000 of 300000
Review 125000 of 300000
Review 130000 of 300000
Review 135000 of 300000
Review 140000 of 300000
Review 145000 of 300000
Review 150000 of 300000
Review 155000 of 300000
Review 160000 of 300000
Review 165000 of 300000
Review 170000 of 300000
Review 175000 of 300000
Review 180000 of 300000
Review 185000 of 300000
Review 190000 of 300000
Review 195000 of 300000
Review 200000 of 300000
Review 205000 of 300000
Review 210000 of 300000
Review 21500

In [15]:
clean_1 = clean_test[:30000]
clean_2 = clean_test[30000:60000]
clean_3 = clean_test[60000:90000]
clean_4 = clean_test[90000:120000]
clean_5 = clean_test[120000:150000]
clean_6 = clean_test[150000:180000]
clean_7 = clean_test[180000:210000]
clean_8 = clean_test[210000:240000]
clean_9 = clean_test[240000:270000]
clean_10 = clean_test[270000:]


In [16]:
# test_data_features1 = vectorizer.transform(clean_1)
# test_data_features1 = test_data_features1.toarray()

# result1 = forest.predict(test_data_features1)

# output = pd.DataFrame(data = {"Id": test['Id'][:30000], "Sentiment": result1})
# output.to_csv("newresult1.csv", index= None)

In [17]:
# test_data_features2 = vectorizer.transform(clean_2)
# test_data_features2 = test_data_features2.toarray()

# result2 = forest.predict(test_data_features2)

# output = pd.DataFrame(data = {"Id": test['Id'][30000:60000], "Sentiment": result2})
# output.to_csv("newresult2.csv", index= None)

In [18]:
# test_data_features3 = vectorizer.transform(clean_3)
# test_data_features3 = test_data_features3.toarray()

# result3 = forest.predict(test_data_features3)

# output = pd.DataFrame(data = {"Id": test['Id'][60000:90000], "Sentiment": result3})
# output.to_csv("newresult3.csv", index= None)

In [19]:
# test_data_features4 = vectorizer.transform(clean_4)
# test_data_features4 = test_data_features4.toarray()

# result4 = forest.predict(test_data_features4)

# output = pd.DataFrame(data = {"Id": test['Id'][90000:120000], "Sentiment": result4})
# output.to_csv("newresult4.csv", index= None)

In [20]:
# test_data_features5 = vectorizer.transform(clean_5)
# test_data_features5 = test_data_features5.toarray()

# result5 = forest.predict(test_data_features5)

# output = pd.DataFrame(data = {"Id": test['Id'][120000:150000], "Sentiment": result5})
# output.to_csv("newresult5.csv", index= None)

In [21]:
# test_data_features6 = vectorizer.transform(clean_6)
# test_data_features6 = test_data_features6.toarray()

# result6 = forest.predict(test_data_features6)

# output = pd.DataFrame(data = {"Id": test['Id'][150000:180000], "Sentiment": result6})
# output.to_csv("newresult6.csv", index= None)

In [22]:
# test_data_features7 = vectorizer.transform(clean_7)
# test_data_features7 = test_data_features7.toarray()

# result7 = forest.predict(test_data_features7)

# output = pd.DataFrame(data = {"Id": test['Id'][180000:210000], "Sentiment": result7})
# output.to_csv("newresult7.csv", index= None)

In [23]:
# test_data_features8 = vectorizer.transform(clean_8)
# test_data_features8 = test_data_features8.toarray()


# result8 = forest.predict(test_data_features8)

# output = pd.DataFrame(data = {"Id": test['Id'][210000:240000], "Sentiment": result8})
# output.to_csv("newresult8.csv", index= None)

In [26]:
sampleTest = train.sample(n=10000)
sampleTest

Unnamed: 0,Id,ProductId,UserId,Score,Summary,Text,Helpfulness,user_mean_score,product_mean_score,YYYYMM,user_num_reviews,reliableUser,maxHelpNum,Sentiment
351356,426993,6304504012,A3N2XVTC8OFKVX,3.0,I love Evita and wish they had made a movie of it,Madonna was excellent. Even Antonio Bandaras ...,0.750000,2.333333,4.220588,200110,6,True,5.0,0
1193553,1450300,B0059XTU3G,A33F3L2E0POEC4,5.0,Great Entertainment,Loved the movie in every way and cannot wait f...,0.500000,4.500000,4.152616,201406,6,False,5.0,1
519251,630873,B00005JNHT,ACT1EPHFS9E7V,4.0,true crime,If you like true crime movies & books this is ...,0.000000,4.290076,4.384615,201312,158,False,5.0,1
1322241,1606266,B00A7E8PA6,A2VZIH75IMKB5L,1.0,overrated movie featuring a washed up disney a...,like i said in my last review i only liked mil...,0.500000,2.079365,3.281250,201304,79,False,3.0,-1
1210749,1471136,B005LAIHPE,A22TRI3C3OI8QV,5.0,Great Popcorn Movie!,When I first rented this movie I expected to s...,0.000000,4.342857,3.819512,201211,38,False,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963073,1170293,B001B3LIOC,A2ZXYCWPNS6KX4,5.0,great movies,Two of my favorite Steven Seagal movies. Two g...,0.000000,4.944444,4.310345,201403,22,False,4.0,1
383334,465776,6305470448,A2PYVF3IFIHIYI,5.0,Forbibben Love,Beautiful film about forbidden love wonderfull...,0.666667,3.363636,4.368182,200012,12,True,5.0,1
443622,538947,B00003CXXF,A152C8GYY25HAH,2.0,Gorgeous Greek island plays host to turgid movie,I have read that the film version of Captain C...,0.750000,3.485944,3.555556,200203,612,True,5.0,-1
696523,846480,B0002VETFO,A32ZOSZVX052IJ,5.0,cool!,Love this sitcom it is funny as crap! I would ...,0.000000,4.966942,4.603774,201306,151,False,5.0,1


In [33]:
clean_sample = []
for i in range(0, len(sampleTest)):
    if (i+1)%5000 == 0:
        print('Review {} of {}'.format(i+1, len(sampleTest)))
    clean_sample.append(review_to_words(sampleTest['Text'].iloc[i]))

Review 5000 of 10000
Review 10000 of 10000


In [34]:
test_data_features = vectorizer.transform(clean_sample)
test_data_features = test_data_features.toarray()

result = forest.predict(test_data_features)
result
output = pd.DataFrame(data = {"Id": sampleTest['Id'], "Text": sampleTest['Text'], "Original_Sentiment": sampleTest['Sentiment'], "Result_Sentiment": result})
output.to_csv("newresult.csv", index= None)

In [35]:
result.shape

(10000,)

In [36]:
output = pd.DataFrame(data = {"Id": sampleTest['Id'], "Result_Sentiment": result})
output.to_csv("newresult.csv", index= None)

In [37]:
test_data_features9 = vectorizer.transform(clean_9)
test_data_features9 = test_data_features9.toarray()

result9 = forest.predict(test_data_features9)

output = pd.DataFrame(data = {"Id": test['Id'][240000:270000], "Sentiment": result9})
output.to_csv("newresult9.csv", index= None)

In [38]:
test_data_features10 = vectorizer.transform(clean_10)
test_data_features10 = test_data_features10.toarray()

test_data_features10.shape

(30000, 15000)

In [39]:
result10 = forest.predict(test_data_features10)
result10[:10]

array(['1', '1', '1', '1', '1', '1', '1', '1', '1', '1'], dtype=object)

In [40]:
output = pd.DataFrame(data = {"Id": test['Id'][270000:], "Sentiment": result10})
output.to_csv("newresult10.csv", index= None)

In [41]:
result1 = pd.read_csv("newresult1.csv")
result2 = pd.read_csv("newresult2.csv")
result3 = pd.read_csv("newresult3.csv")
result4 = pd.read_csv("newresult4.csv")
result5 = pd.read_csv("newresult5.csv")
result6 = pd.read_csv("newresult6.csv")
result7 = pd.read_csv("newresult7.csv")
result8 = pd.read_csv("newresult8.csv")
result9 = pd.read_csv("newresult9.csv")
result10 = pd.read_csv("newresult10.csv")

In [42]:
result1.shape
result2.shape

(30000, 2)

In [43]:
result = pd.concat([result1, result2])
result = pd.concat([result, result3])
result = pd.concat([result, result4])
result = pd.concat([result, result5])
result = pd.concat([result, result6])
result = pd.concat([result, result7])
result = pd.concat([result, result8])
result = pd.concat([result, result9])
result = pd.concat([result, result10])

In [45]:
result.to_csv("textSentiment2.csv", index= None)
result.shape

(300000, 2)