In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from skift import FirstColFtClassifier
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords

In [12]:
# Download stopwords dictionary
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marek\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
def show_score(_score):
    print(_score)
    print(f"Average Score: {np.mean(_score)}")

# EXPERIMENT 2

## YELP

In [3]:
yelpData = pd.read_csv('../data/yelp_labelled.txt', sep='\t', header=0, encoding="utf-8")
row_sizes = yelpData['SentimentText'].str.split().str.len()
yelpData['SentimentText'] = yelpData['SentimentText'].str.lower()
print(f"Words count: {pd.Series.sum(row_sizes)}")
yelpData

Words count: 10894


Unnamed: 0,SentimentText,Sentiment
0,wow... loved this place.,1
1,crust is not good.,0
2,not tasty and the texture was just nasty.,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
...,...,...
995,i think food should have flavor and texture an...,0
996,appetite instantly gone.,0
997,overall i was not impressed and would not go b...,0
998,"the whole experience was underwhelming, and i ...",0


## Remove stop words

In [3]:
stop_words = set(stopwords.words("english")) 

def remove_stop_words(text):
    text = [word for word in text.split() if not word in stop_words]
    text = " ".join(text)
    return text

In [6]:
def preprocess_train(data, functions, word_ngrams=1):
    _data = pd.DataFrame(data['SentimentText'])
    for function in functions:
        _data['SentimentText'] = _data['SentimentText'].apply(lambda x: function(x))
    _row_sizes = _data['SentimentText'].str.split().str.len()
    print(f"Words count: {pd.Series.sum(_row_sizes)}")
    print(_data)
    _sk_clf = FirstColFtClassifier(wordNgrams=word_ngrams, thread=1)  # lr=0.3, epoch=10
    _scores = cross_val_score(_sk_clf, _data[['SentimentText']], data['Sentiment'], cv=5, scoring='accuracy')
    print(f"Words ngrams: {word_ngrams}")
    return _scores

In [6]:
scores = preprocess_train(yelpData, [remove_stop_words])

                                         SentimentText
0                                  wow... loved place.
1                                          crust good.
2                                 tasty texture nasty.
3    stopped late may bank holiday rick steve recom...
4                         selection menu great prices.
..                                                 ...
995                 think food flavor texture lacking.
996                           appetite instantly gone.
997                   overall impressed would go back.
998  whole experience underwhelming, think we'll go...
999  then, wasted enough life there, poured salt wo...

[1000 rows x 1 columns]
Words ngrams: 1


In [7]:
show_score(scores)

[0.71  0.665 0.535 0.645 0.535]
Average Score: 0.6180000000000001


## Remove punctuation

In [8]:
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    return text

In [20]:
scores = preprocess_train(yelpData, [remove_punctuation])
show_score(scores)

                                         SentimentText
0                                 wow loved this place
1                                    crust is not good
2             not tasty and the texture was just nasty
3    stopped by during the late may bank holiday of...
4    the selection on the menu was great and so wer...
..                                                 ...
995  i think food should have flavor and texture an...
996                            appetite instantly gone
997  overall i was not impressed and would not go back
998  the whole experience was underwhelming and i t...
999  then as if i hadnt wasted enough of my life th...

[1000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.605 0.63  0.575 0.62  0.565]
Average Score: 0.599


## Lemmatization

In [21]:
yelpDataLem = pd.read_csv('../data/YelpLemmatized.txt', sep='\t', header=0, encoding="utf-8")
row_sizes = yelpDataLem['SentimentText'].str.split().str.len()
print(f"Words count: {pd.Series.sum(row_sizes)}")
yelpDataLem

Words count: 10922


Unnamed: 0,SentimentText,Sentiment
0,wow love this place,1
1,crust be not good,0
2,not tasty and the texture be just nasty,0
3,stop by during the late may bank holiday off r...,1
4,the selection on the menu be great and so be t...,1
...,...,...
995,i think food should have flavor and texture an...,0
996,appetite instantly go,0
997,overall i be not impress and would not go back,0
998,the whole experience be underwhelm and i think...,0


In [22]:
scores = preprocess_train(yelpDataLem, [])
show_score(scores)

                                         SentimentText
0                                 wow love this place 
1                                   crust be not good 
2             not tasty and the texture be just nasty 
3    stop by during the late may bank holiday off r...
4    the selection on the menu be great and so be t...
..                                                 ...
995  i think food should have flavor and texture an...
996                             appetite instantly go 
997    overall i be not impress and would not go back 
998  the whole experience be underwhelm and i think...
999  then as if i have+not waste enough of my life ...

[1000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.59  0.6   0.545 0.585 0.535]
Average Score: 0.571


## Remove stop words AND remove punctuation

In [14]:
scores = preprocess_train(yelpData, [remove_stop_words, remove_punctuation])
show_score(scores)

                                         SentimentText
0                                      wow loved place
1                                           crust good
2                                  tasty texture nasty
3    stopped late may bank holiday rick steve recom...
4                          selection menu great prices
..                                                 ...
995                  think food flavor texture lacking
996                            appetite instantly gone
997                    overall impressed would go back
998  whole experience underwhelming think well go n...
999  then wasted enough life there poured salt woun...

[1000 rows x 1 columns]
Words ngrams: 1
[0.71  0.765 0.64  0.665 0.655]
Average Score: 0.687


## Remove stop words AND Lemmatization

In [26]:
scores = preprocess_train(yelpDataLem, [remove_stop_words])
show_score(scores)

                                         SentimentText
0                                       wow love place
1                                           crust good
2                                  tasty texture nasty
3    stop late may bank holiday rick steve recommen...
4                           selection menu great price
..                                                 ...
995                     think food flavor texture lack
996                              appetite instantly go
997                      overall impress would go back
998  whole experience underwhelm think wewill go ni...
999  have+not waste enough life pour salt wound dra...

[1000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.715 0.72  0.56  0.645 0.62 ]
Average Score: 0.652


## N-GRAMS

In [10]:
for i in range(1, 4):  # word_ngrams
    scores = preprocess_train(yelpData, [remove_stop_words, remove_punctuation], word_ngrams=i)
    show_score(scores)

                                         SentimentText
0                                      wow loved place
1                                           crust good
2                                  tasty texture nasty
3    stopped late may bank holiday rick steve recom...
4                          selection menu great prices
..                                                 ...
995                  think food flavor texture lacking
996                            appetite instantly gone
997                    overall impressed would go back
998  whole experience underwhelming think well go n...
999  then wasted enough life there poured salt woun...

[1000 rows x 1 columns]
Words ngrams: 1
[0.71  0.765 0.64  0.665 0.655]
Average Score: 0.687
                                         SentimentText
0                                      wow loved place
1                                           crust good
2                                  tasty texture nasty
3    stopped late may bank

## IMDB

In [4]:
imdbData = pd.read_csv('../data/imdb_50k.tsv', sep='\t', header=0, encoding="utf-8", doublequote=False, escapechar="\\")
imdbData = imdbData.drop(['id'], axis=1)
row_sizes = imdbData['SentimentText'].str.split().str.len()
imdbData['SentimentText'] = imdbData['SentimentText'].str.lower()
print(f"Words count: {pd.Series.sum(row_sizes)}")
imdbData

Words count: 11557842


Unnamed: 0,Sentiment,SentimentText
0,0,"a bit of a disappointing film, i'd say: the ac..."
1,0,"the acting was terrible, the cheesy, fake, che..."
2,1,"plenty has been written about mamet's ""the hou..."
3,1,"""journey to the far side of the sun"" (aka ""dop..."
4,1,i lived in that area (hoboken and jersey city)...
...,...,...
49995,1,"as a big dostoyevsky fan, i had always been di..."
49996,0,i didn't watch this show that much when i was ...
49997,1,for people who are first timers in film making...
49998,0,pumpkinhead was in itself a decent 80s horror ...


## Remove stopwords

In [7]:
scores = preprocess_train(imdbData, [remove_stop_words])

Words count: 6365334
                                           SentimentText
0      bit disappointing film, i'd say: acting stilte...
1      acting terrible, cheesy, fake, cheap green scr...
2      plenty written mamet's "the house games"; good...
3      "journey far side sun" (aka "doppelganger") en...
4      lived area (hoboken jersey city)for ten years....
...                                                  ...
49995  big dostoyevsky fan, always disappointed holly...
49996  watch show much little. think watched 1 episod...
49997  people first timers film making, think excelle...
49998  pumpkinhead decent 80s horror flick. classic m...
49999  would like start saying hope makers movie sist...

[50000 rows x 1 columns]
Words ngrams: 1


In [32]:
show_score(scores)

[0.8876 0.8923 0.8878 0.8901 0.8881]
Average Score: 0.88918


## Remove punctuation

In [33]:
scores = preprocess_train(imdbData, [remove_punctuation])
show_score(scores)

                                           SentimentText
0      a bit of a disappointing film id say the actin...
1      the acting was terrible the cheesy fake cheap ...
2      plenty has been written about mamets the house...
3      journey to the far side of the sun aka doppelg...
4      i lived in that area hoboken and jersey cityfo...
...                                                  ...
49995  as a big dostoyevsky fan i had always been dis...
49996  i didnt watch this show that much when i was l...
49997  for people who are first timers in film making...
49998  pumpkinhead was in itself a decent 80s horror ...
49999  i would like to start by saying i can only hop...

[50000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.8847 0.885  0.8867 0.8836 0.883 ]
Average Score: 0.8846


## Lemmatization

In [15]:
imdbDataLem = pd.read_csv('../data/Imdb50KLemmatized.tsv', sep='\t', header=0, encoding="utf-8", doublequote=False, escapechar="\\")
imdbDataLem = imdbDataLem.drop(['id'], axis=1)
row_sizes = imdbDataLem['SentimentText'].str.split().str.len()
print(f"Words count: {pd.Series.sum(row_sizes)}")
imdbDataLem

Words count: 11680609


Unnamed: 0,Sentiment,SentimentText
0,0,a bit of a disappoint film i'd say: the acting...
1,0,the acting be terrible the cheesy fake cheap g...
2,1,"plenty have be write about mamet ""the house of..."
3,1,"""journey to the far side of the sun"" aka ""dopp..."
4,1,i live in that area hoboken and jersey city fo...
...,...,...
49995,1,a a big dostoyevsky fan i have always be disap...
49996,0,i do+not watch this show that much when i be l...
49997,1,for people who be first timer in film making i...
49998,0,pumpkinhead be in itself a decent 80 horror fl...


In [37]:
scores = preprocess_train(imdbDataLem, [])
show_score(scores)

                                           SentimentText
0      a bit of a disappoint film i'd say: the acting...
1      the acting be terrible the cheesy fake cheap g...
2      plenty have be write about mamet "the house of...
3      "journey to the far side of the sun" aka "dopp...
4      i live in that area hoboken and jersey city fo...
...                                                  ...
49995  a a big dostoyevsky fan i have always be disap...
49996  i do+not watch this show that much when i be l...
49997  for people who be first timer in film making i...
49998  pumpkinhead be in itself a decent 80 horror fl...
49999  i would like to start by say i can only hope t...

[50000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.8892 0.8849 0.8909 0.8853 0.8839]
Average Score: 0.88684


## Remove stop words AND remove punctuation

In [34]:
scores = preprocess_train(imdbData, [remove_stop_words, remove_punctuation])
show_score(scores)

                                           SentimentText
0      bit disappointing film id say acting stilted s...
1      acting terrible cheesy fake cheap green screen...
2      plenty written mamets the house games good dec...
3      journey far side sun aka doppelganger entertai...
4      lived area hoboken jersey cityfor ten years fi...
...                                                  ...
49995  big dostoyevsky fan always disappointed hollyw...
49996  watch show much little think watched 1 episode...
49997  people first timers film making think excellen...
49998  pumpkinhead decent 80s horror flick classic me...
49999  would like start saying hope makers movie sist...

[50000 rows x 1 columns]
Words ngrams: 1
Window size: 5
[0.8914 0.8898 0.8891 0.8902 0.8866]
Average Score: 0.8894200000000001


## Remove stop words AND Lemmatization

In [17]:
scores = preprocess_train(imdbDataLem, [remove_stop_words])
show_score(scores)

Words count: 6328911
                                           SentimentText
0      bit disappoint film i'd say: acting stilted so...
1      acting terrible cheesy fake cheap green screen...
2      plenty write mamet "the house games"; good dec...
3      "journey far side sun" aka "doppelganger" ente...
4      live area hoboken jersey city ten year film ce...
...                                                  ...
49995  big dostoyevsky fan always disappoint hollywoo...
49996  do+not watch show much little think watch 1 ep...
49997  people first timer film making think excellent...
49998  pumpkinhead decent 80 horror flick classic mea...
49999  would like start say hope maker movie sister f...

[50000 rows x 1 columns]
Words ngrams: 1
[0.8939 0.893  0.8927 0.8925 0.8894]
Average Score: 0.8923


## N-GRAMS

In [36]:
for i in range(1, 4):  # word_ngrams
    scores = preprocess_train(imdbDataLem, [remove_stop_words], word_ngrams=i)
    show_score(scores)

                                           SentimentText
0      bit disappoint film i'd say: acting stilted so...
1      acting terrible cheesy fake cheap green screen...
2      plenty write mamet "the house games"; good dec...
3      "journey far side sun" aka "doppelganger" ente...
4      live area hoboken jersey city ten year film ce...
...                                                  ...
49995  big dostoyevsky fan always disappoint hollywoo...
49996  do+not watch show much little think watch 1 ep...
49997  people first timer film making think excellent...
49998  pumpkinhead decent 80 horror flick classic mea...
49999  would like start say hope maker movie sister f...

[50000 rows x 1 columns]
Words ngrams: 1
Window size: 2
[0.8939 0.893  0.8927 0.8925 0.8894]
Average Score: 0.8923
                                           SentimentText
0      bit disappoint film i'd say: acting stilted so...
1      acting terrible cheesy fake cheap green screen...
2      plenty write mamet "th