In [2]:
import re
import pandas as pd
import numpy as np
import nltk
from skift import FirstColFtClassifier
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords
import time

In [None]:
# Fetch the NLTK "stopwords" corpus; `stopwords.words("english")` is read from it
# further down when the stop-word set is built.
nltk.download('stopwords')

In [3]:
def show_score(_score):
    """Print the given scores, then their arithmetic mean on a second line.

    Args:
        _score: sequence or array of numeric scores (anything `np.mean` accepts).
    """
    average = np.mean(_score)
    print(_score)
    print(f"Average Score: {average}")

In [5]:
# Column names for the results DataFrame. The order matches the rows appended in the
# experiment loops: [index, loss, lr, epoch, mean accuracy].
resultsColumns = ["index", "lossFunction", "learningRate", "NOiter", "accuracy"]

# EXPERIMENT 3

## YELP

In [12]:
# Read the tab-separated Yelp file, count whitespace-separated tokens per review,
# then lowercase the review text in place (lowercasing does not change the token count).
yelpData = pd.read_csv("../data/yelp_labelled.txt", sep="\t", header=0, encoding="utf-8")
row_sizes = yelpData["SentimentText"].str.split().str.len()
yelpData["SentimentText"] = yelpData["SentimentText"].str.lower()
print(f"Words count: {row_sizes.sum()}")
yelpData

Words count: 10894


Unnamed: 0,SentimentText,Sentiment
0,wow... loved this place.,1
1,crust is not good.,0
2,not tasty and the texture was just nasty.,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
...,...,...
995,i think food should have flavor and texture an...,0
996,appetite instantly gone.,0
997,overall i was not impressed and would not go b...,0
998,"the whole experience was underwhelming, and i ...",0


In [6]:
# English stop words from NLTK, kept as a set for membership tests.
stop_words = set(stopwords.words("english"))


def remove_stop_words(text):
    """Drop every whitespace-separated token of `text` that appears in `stop_words`.

    Matching is an exact string comparison against the set. The surviving tokens
    are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

def remove_punctuation(text):
    """Return `text` with every character that is neither a word character nor whitespace removed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string; word characters (`\\w`) and whitespace (`\\s`) are kept.
    """
    # The flag must be passed by keyword: the fourth positional parameter of re.sub
    # is `count`, so passing re.UNICODE (== 32) positionally capped the number of
    # replacements at 32 and never applied the flag.
    return re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)

In [7]:
def preprocess_train(data, functions, word_ngrams=1, _iterations=5, _lr=0.1, _loss="softmax"):
    """Preprocess the review text and cross-validate a fastText classifier on it.

    Args:
        data: frame with a 'SentimentText' column (input text) and a 'Sentiment'
            column (labels). It is not modified; the text is transformed on a
            separate frame.
        functions: callables applied, in order, to every text value.
        word_ngrams: forwarded to FirstColFtClassifier as `wordNgrams`.
        _iterations: forwarded to FirstColFtClassifier as `epoch`.
        _lr: forwarded to FirstColFtClassifier as `lr`.
        _loss: forwarded to FirstColFtClassifier as `loss`.

    Returns:
        The per-fold accuracy scores from `cross_val_score` (cv=5).
    """
    features = pd.DataFrame(data['SentimentText'])
    for transform in functions:
        features['SentimentText'] = features['SentimentText'].apply(transform)

    classifier = FirstColFtClassifier(
        wordNgrams=word_ngrams,
        thread=1,
        epoch=_iterations,
        lr=_lr,
        loss=_loss,
    )
    fold_scores = cross_val_score(
        classifier,
        features[['SentimentText']],
        data['Sentiment'],
        cv=5,
        scoring='accuracy',
    )
    print(f"Words ngrams: {word_ngrams}")
    return fold_scores

In [13]:
# Yelp grid search: every loss function x epoch count x learning rate, with stop
# words and punctuation removed. One result row per combination is collected in `data`.
index = 0
data = []
for loss in ("softmax", "ns", "hs"):
    for epoch in (5, 10, 50, 100):
        for lr in (0.05, 0.1, 0.2, 0.5, 1):
            start = time.time()
            scores = preprocess_train(
                yelpData,
                [remove_stop_words, remove_punctuation],
                _iterations=epoch,
                _lr=lr,
                _loss=loss,
            )
            elapsed = time.time() - start
            print(f"Loss: {loss}, epoch: {epoch}, lr: {lr}")
            print(f"Time: {elapsed}")
            show_score(scores)
            mean_accuracy = np.mean(scores)
            data.append([index, loss, lr, epoch, mean_accuracy])
            index += 1

Words ngrams: 1
Loss: softmax, epoch: 5, lr: 0.05
Time: 1.1592442989349365
[0.585 0.63  0.52  0.575 0.515]
Average Score: 0.565
Words ngrams: 1
Loss: softmax, epoch: 5, lr: 0.1
Time: 1.1494300365447998
[0.71  0.765 0.64  0.665 0.655]
Average Score: 0.687
Words ngrams: 1
Loss: softmax, epoch: 5, lr: 0.2
Time: 1.1184124946594238
[0.73  0.775 0.765 0.79  0.77 ]
Average Score: 0.766
Words ngrams: 1
Loss: softmax, epoch: 5, lr: 0.5
Time: 1.148317575454712
[0.71  0.76  0.755 0.795 0.75 ]
Average Score: 0.754
Words ngrams: 1
Loss: softmax, epoch: 5, lr: 1
Time: 1.1528754234313965
[0.71  0.745 0.76  0.79  0.73 ]
Average Score: 0.747
Words ngrams: 1
Loss: softmax, epoch: 10, lr: 0.05
Time: 1.2479867935180664
[0.72  0.765 0.625 0.67  0.665]
Average Score: 0.689
Words ngrams: 1
Loss: softmax, epoch: 10, lr: 0.1
Time: 1.1389987468719482
[0.74  0.78  0.76  0.805 0.76 ]
Average Score: 0.7690000000000001
Words ngrams: 1
Loss: softmax, epoch: 10, lr: 0.2
Time: 1.1365158557891846
[0.735 0.76  0.79  0.8

In [18]:
# Yelp, epoch 200 only: extends the Yelp grid search with a 200-epoch run for every
# loss function and learning rate. It appends to the `data` list and continues the
# `index` counter that the Yelp grid-search cell initialises, so that cell must run first.
for loss in ["softmax", "ns", "hs"]:
    for epoch in [200]:
        for lr in [0.05, 0.1, 0.2, 0.5, 1]:
            start = time.time()
            scores = preprocess_train(yelpData, [remove_stop_words, remove_punctuation], _iterations=epoch, _lr=lr, _loss=loss)
            end = time.time()
            print(f"Loss: {loss}, epoch: {epoch}, lr: {lr}")
            print(f"Time: {end - start}")
            show_score(scores)
            data.append([index, loss, lr, epoch, np.mean(scores)])
            # Advance the row counter; without this every epoch-200 row was stored
            # with the same `index` value.
            index += 1

Words ngrams: 1
Loss: softmax, epoch: 200, lr: 0.05
Time: 4.032491445541382
[0.745 0.73  0.78  0.785 0.735]
Average Score: 0.755
Words ngrams: 1
Loss: softmax, epoch: 200, lr: 0.1
Time: 3.8946728706359863
[0.725 0.72  0.755 0.785 0.75 ]
Average Score: 0.747
Words ngrams: 1
Loss: softmax, epoch: 200, lr: 0.2
Time: 3.765923261642456
[0.71  0.7   0.745 0.79  0.735]
Average Score: 0.736
Words ngrams: 1
Loss: softmax, epoch: 200, lr: 0.5
Time: 3.6832127571105957
[0.715 0.7   0.74  0.79  0.725]
Average Score: 0.7340000000000001
Words ngrams: 1
Loss: softmax, epoch: 200, lr: 1
Time: 3.7989680767059326
[0.715 0.71  0.75  0.785 0.74 ]
Average Score: 0.74
Words ngrams: 1
Loss: ns, epoch: 200, lr: 0.05
Time: 6.860926151275635
[0.725 0.73  0.76  0.785 0.745]
Average Score: 0.749
Words ngrams: 1
Loss: ns, epoch: 200, lr: 0.1
Time: 6.745258331298828
[0.69 0.71 0.76 0.78 0.73]
Average Score: 0.7340000000000001
Words ngrams: 1
Loss: ns, epoch: 200, lr: 0.2
Time: 6.827725172042847
[0.685 0.695 0.765 0.

In [None]:
# Assemble the collected Yelp result rows into a DataFrame and write them to CSV
# (header row included, DataFrame index omitted).
results_df = pd.DataFrame(data, columns=resultsColumns)
# NOTE(review): this path is relative to the notebook's working directory, whereas the
# IMDB report is written under "../data/Ex3FastText/" — confirm which location is intended.
results_df.to_csv("Ex3FastText/Ex3ReportYelp.csv", index=False, header=True)

## IMDB

In [8]:
# Read Imdb50KLemmatized.tsv (tab-separated, backslash escapes, no quote doubling),
# discard the `id` column, and report the total whitespace-separated token count.
imdbDataLem = pd.read_csv(
    "../data/Imdb50KLemmatized.tsv",
    sep="\t",
    header=0,
    encoding="utf-8",
    doublequote=False,
    escapechar="\\",
).drop(columns=["id"])
row_sizes = imdbDataLem["SentimentText"].str.split().str.len()
print(f"Words count: {row_sizes.sum()}")
imdbDataLem

Words count: 11680609


Unnamed: 0,Sentiment,SentimentText
0,0,a bit of a disappoint film i'd say: the acting...
1,0,the acting be terrible the cheesy fake cheap g...
2,1,"plenty have be write about mamet ""the house of..."
3,1,"""journey to the far side of the sun"" aka ""dopp..."
4,1,i live in that area hoboken and jersey city fo...
...,...,...
49995,1,a a big dostoyevsky fan i have always be disap...
49996,0,i do+not watch this show that much when i be l...
49997,1,for people who be first timer in film making i...
49998,0,pumpkinhead be in itself a decent 80 horror fl...


In [15]:
# IMDB grid search: every loss function x epoch count x learning rate, with stop
# words removed and word bigrams enabled. Result rows are collected in `data`.
index = 0
data = []
for loss in ("softmax", "ns", "hs"):
    for epoch in (5, 10, 50, 100):
        for lr in (0.05, 0.1, 0.2, 0.5, 1):
            start = time.time()
            scores = preprocess_train(
                imdbDataLem,
                [remove_stop_words],
                _iterations=epoch,
                _lr=lr,
                _loss=loss,
                word_ngrams=2,
            )
            elapsed = time.time() - start
            print(f"Loss: {loss}, epoch: {epoch}, lr: {lr}")
            print(f"Time: {elapsed}")
            show_score(scores)
            mean_accuracy = np.mean(scores)
            data.append([index, loss, lr, epoch, mean_accuracy])
            index += 1

Words ngrams: 2
Loss: softmax, epoch: 5, lr: 0.05
Time: 184.29815816879272
[0.8646 0.872  0.8727 0.865  0.87  ]
Average Score: 0.8688600000000001
Words ngrams: 2
Loss: softmax, epoch: 5, lr: 0.1
Time: 181.4628188610077
[0.8979 0.8914 0.8978 0.8938 0.893 ]
Average Score: 0.8947800000000001
Words ngrams: 2
Loss: softmax, epoch: 5, lr: 0.2
Time: 184.90914583206177
[0.9062 0.8985 0.9051 0.8994 0.9022]
Average Score: 0.90228
Words ngrams: 2
Loss: softmax, epoch: 5, lr: 0.5
Time: 176.4304177761078
[0.9068 0.9013 0.9057 0.9019 0.9032]
Average Score: 0.90378
Words ngrams: 2
Loss: softmax, epoch: 5, lr: 1
Time: 166.16732001304626
[0.9072 0.902  0.9041 0.9006 0.9014]
Average Score: 0.90306
Words ngrams: 2
Loss: softmax, epoch: 10, lr: 0.05
Time: 293.83214044570923
[0.8981 0.8939 0.898  0.8946 0.8928]
Average Score: 0.89548
Words ngrams: 2
Loss: softmax, epoch: 10, lr: 0.1
Time: 328.14668583869934
[0.9069 0.9    0.906  0.9002 0.9042]
Average Score: 0.9034600000000002
Words ngrams: 2
Loss: softmax

In [17]:
# epoch 200 only
for loss in ["softmax", "ns", "hs"]:  # "softmax", "ns", "hs"
    for epoch in [200]:
        for lr in [0.05, 0.1, 0.2, 0.5, 1]:
            start = time.time()
            scores = preprocess_train(imdbDataLem, [remove_stop_words], _iterations=epoch, _lr=lr, _loss=loss, word_ngrams=2)
            end = time.time()
            print(f"Loss: {loss}, epoch: {epoch}, lr: {lr}")
            print(f"Time: {end - start}")
            show_score(scores)
            data.append([index, loss, lr, epoch, np.mean(scores)])
            index += 1

Words ngrams: 2
Loss: softmax, epoch: 200, lr: 0.05
Time: 4371.943948030472
[0.9071 0.9049 0.9095 0.9033 0.9051]
Average Score: 0.9059799999999999
Words ngrams: 2
Loss: softmax, epoch: 200, lr: 0.1
Time: 4313.324980020523
[0.9068 0.9046 0.9091 0.9029 0.9052]
Average Score: 0.90572
Words ngrams: 2
Loss: softmax, epoch: 200, lr: 0.2
Time: 4589.946355104446
[0.9065 0.9046 0.9089 0.9029 0.9048]
Average Score: 0.9055399999999999
Words ngrams: 2
Loss: softmax, epoch: 200, lr: 0.5
Time: 5762.751471996307
[0.9067 0.9041 0.9074 0.9023 0.9051]
Average Score: 0.9051199999999999
Words ngrams: 2
Loss: softmax, epoch: 200, lr: 1
Time: 5625.8195769786835
[0.9039 0.9018 0.9046 0.8994 0.9017]
Average Score: 0.90228
Words ngrams: 2
Loss: ns, epoch: 200, lr: 0.05
Time: 6980.918215513229
[0.9067 0.9046 0.909  0.9031 0.9052]
Average Score: 0.90572
Words ngrams: 2
Loss: ns, epoch: 200, lr: 0.1
Time: 4303.546540975571
[0.9064 0.9042 0.9084 0.9025 0.9045]
Average Score: 0.9052
Words ngrams: 2
Loss: ns, epoch:

In [None]:
# Assemble the collected IMDB result rows into a DataFrame and write them to CSV
# (header row included, DataFrame index omitted).
results_df = pd.DataFrame(data, columns=resultsColumns)
results_df.to_csv("../data/Ex3FastText/Ex3ReportImdb.csv", index=False, header=True)