In [1]:
import re
import pandas as pd
import nltk
from skift import FirstColFtClassifier
from nltk.corpus import stopwords

In [2]:
# Fetch NLTK's 'stopwords' corpus, which stopwords.words("english") reads later in
# this notebook. The captured log shows the call reports "already up-to-date" when
# the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# EXPERIMENT 2

## Amazon

In [3]:
# Amazon training split: read the TSV, then lower-case the review text.
data_train = pd.read_csv(
    '../data/AmazonTrainSet1M.tsv', sep='\t', header=0, encoding="utf-8"
).assign(SentimentText=lambda frame: frame['SentimentText'].str.lower())
# Whitespace-delimited token count of every review; the printed figure is their total.
row_sizes = data_train['SentimentText'].str.split().str.len()
print(f"Words count: {row_sizes.sum()}")
data_train


Words count: 78450202


Unnamed: 0,Sentiment,SentimentText
0,0,defective: i was really excited to get the fis...
1,1,m-audio 2496 sound card: excellent sound card ...
2,0,missing links: it's a shame the quality of thi...
3,0,tribute album: this is a tribute album...i did...
4,1,pretty good: it does taste pretty good and is ...
...,...,...
999995,1,tlc...... need i say more: tlc is the best gro...
999996,1,alternative ending: an excellent book no doubt...
999997,1,p-town series: i read these out of order becau...
999998,0,pretty sad....: this book would play out bette...


In [4]:
# Amazon test split: read the TSV, then lower-case the review text.
data_test = pd.read_csv(
    '../data/AmazonTestSet400k2.tsv', sep='\t', header=0, encoding="utf-8"
).assign(SentimentText=lambda frame: frame['SentimentText'].str.lower())
# Whitespace-delimited token count of every review; the printed figure is their total.
row_sizes = data_test['SentimentText'].str.split().str.len()
print(f"Words count: {row_sizes.sum()}")
data_test

Words count: 31369658


Unnamed: 0,Sentiment,SentimentText
0,1,this is a great book: i must preface this by s...
1,0,"huge disappointment.: as a big time, long term..."
2,1,wayne is tight but cant hang with turk.: this ...
3,1,excellent: i read this book when i was in elem...
4,0,not about anusara: although this book is toute...
...,...,...
399995,0,you can fool all the people some of the time b...
399996,0,it was good but not good: it was an 'okay' boo...
399997,0,unwatchable: the product arrived promptly and ...
399998,0,not worth the money or the time to read.: the ...


## Remove stop words

In [5]:
# Materialise NLTK's English stop-word list as a set; remove_stop_words tests every
# token for membership against it.
stop_words = set(stopwords.words("english"))

def remove_stop_words(text):
    """Drop stop words from a whitespace-tokenised string.

    The text is split with ``str.split()`` and each token is compared verbatim
    (case-sensitively) against the module-level ``stop_words`` set. A token
    that carries punctuation, such as ``"the,"``, therefore does not match
    ``"the"`` and is kept.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives.
    """
    return " ".join(word for word in text.split() if word not in stop_words)

In [5]:
def preprocess_train(df_train, df_test, functions, word_ngrams=1):
    """Preprocess the review text, fit a classifier on it and score the test split.

    Every callable in ``functions`` is applied element-wise, in list order, to
    the ``'SentimentText'`` column of both frames. The caller's frames are not
    modified: the transformed text lives in new Series/DataFrames.

    Side effects: prints the total whitespace-token count of the transformed
    training text, the transformed training frame, and the n-gram setting.

    Args:
        df_train: Frame with ``'SentimentText'`` and ``'Sentiment'`` columns.
        df_test: Frame with the same two columns, used only for scoring.
        functions: Iterable of ``str -> str`` callables; may be empty.
        word_ngrams: Value forwarded as ``wordNgrams`` to the classifier.

    Returns:
        Whatever ``FirstColFtClassifier.score`` returns for the test split
        (presumably mean accuracy, per scikit-learn convention -- confirm
        against skift).
    """
    train_text = df_train['SentimentText']
    test_text = df_test['SentimentText']
    # Same steps, same order, on both splits so train and test stay comparable.
    for function in functions:
        train_text = train_text.apply(function)
        test_text = test_text.apply(function)

    # The classifier is fed single-column frames.
    features_train = train_text.to_frame()
    features_test = test_text.to_frame()

    print(f"Words count: {train_text.str.split().str.len().sum()}")
    print(features_train)

    classifier = FirstColFtClassifier(wordNgrams=word_ngrams, thread=1)  # lr=0.3, epoch=10
    classifier.fit(features_train, df_train['Sentiment'])
    accuracy = classifier.score(features_test, df_test['Sentiment'])
    print(f"Words ngrams: {word_ngrams}")
    return accuracy

In [9]:
# Stop-word removal only, default word n-gram setting.
score = preprocess_train(
    df_train=data_train,
    df_test=data_test,
    functions=[remove_stop_words],
)
score

Words count: 41817156
                                            SentimentText
0       defective: really excited get fisher-price ama...
1       m-audio 2496 sound card: excellent sound card ...
2       missing links: shame quality video poor. movie...
3       tribute album: tribute album...i pay close eno...
4       pretty good: taste pretty good filling staying...
...                                                   ...
999995  tlc...... need say more: tlc best group ever w...
999996  alternative ending: excellent book doubt. go r...
999997  p-town series: read order know series. loved r...
999998  pretty sad....: book would play better movie s...
999999  awesome funky jazz band, definetly check out: ...

[1000000 rows x 1 columns]
Words ngrams: 1


0.89961

## Remove punctuation

In [7]:
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (``\\w``: letters, digits, underscore) and whitespace are
    kept as they are. Nothing is inserted in place of a removed character, so
    ``"m-audio"`` becomes ``"maudio"``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the removed characters.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [8]:
# Punctuation removal only, default word n-gram setting.
score = preprocess_train(
    df_train=data_train,
    df_test=data_test,
    functions=[remove_punctuation],
)
score

Words count: 78154408
                                            SentimentText
0       defective i was really excited to get the fish...
1       maudio 2496 sound card excellent sound card fo...
2       missing links its a shame the quality of this ...
3       tribute album this is a tribute albumi didnt p...
4       pretty good it does taste pretty good and is f...
...                                                   ...
999995  tlc need i say more tlc is the best group ther...
999996  alternative ending an excellent book no doubt ...
999997  ptown series i read these out of order because...
999998  pretty sad this book would play out better on ...
999999  awesome funky jazz band definetly check them o...

[1000000 rows x 1 columns]
Words ngrams: 1


0.9068575

## Lemmatization

In [5]:
# Lemmatized Amazon training split, read from its own TSV (no lower-casing step here).
data_train_lem = pd.read_csv(
    '../data/AmazonTrainLemmatized.tsv',
    sep='\t',
    header=0,
    encoding="utf-8",
)
# Whitespace-delimited token count of every review; the printed figure is their total.
row_sizes = data_train_lem['SentimentText'].str.split().str.len()
print(f"Words count: {row_sizes.sum()}")
data_train_lem

Words count: 79272968


Unnamed: 0,Sentiment,SentimentText
0,0,defective: i be really excite to get the fishe...
1,1,m-audio 2496 sound card: excellent sound card ...
2,0,miss links: it a shame the quality of this vid...
3,0,tribute album: this be a tribute album i do+no...
4,1,pretty good: it do taste pretty good and be fi...
...,...,...
999995,1,tlc need i say more: tlc be the best group the...
999996,1,alternative ending: a excellent book no doubt ...
999997,1,p-town series: i read these out of order becau...
999998,0,pretty sad : this book would play out better o...


In [6]:
# Lemmatized Amazon test split, read from its own TSV (no lower-casing step here).
data_test_lem = pd.read_csv(
    '../data/AmazonTestLemmatized.tsv',
    sep='\t',
    header=0,
    encoding="utf-8",
)
# Whitespace-delimited token count of every review; the printed figure is their total.
row_sizes = data_test_lem['SentimentText'].str.split().str.len()
print(f"Words count: {row_sizes.sum()}")
data_test_lem

Words count: 31698635


Unnamed: 0,Sentiment,SentimentText
0,1,this be a great book: i must preface this by s...
1,0,huge disappointment : as a big time long term ...
2,1,wayne be tight but cant hang with turk : this ...
3,1,excellent: i read this book when i be in eleme...
4,0,not about anusara: although this book be tout ...
...,...,...
399995,0,you can fool all the people some of the time b...
399996,0,it be good but not good: it be a 'okay' book i...
399997,0,unwatchable: the product arrive promptly and b...
399998,0,not worth the money or the time to read : the ...


In [7]:
# Lemmatized text as loaded, with no further preprocessing steps.
score = preprocess_train(
    df_train=data_train_lem,
    df_test=data_test_lem,
    functions=[],
)
score

Words count: 79272968
                                            SentimentText
0       defective: i be really excite to get the fishe...
1       m-audio 2496 sound card: excellent sound card ...
2       miss links: it a shame the quality of this vid...
3       tribute album: this be a tribute album i do+no...
4       pretty good: it do taste pretty good and be fi...
...                                                   ...
999995  tlc need i say more: tlc be the best group the...
999996  alternative ending: a excellent book no doubt ...
999997  p-town series: i read these out of order becau...
999998  pretty sad : this book would play out better o...
999999  awesome funky jazz band definetly check them o...

[1000000 rows x 1 columns]
Words ngrams: 1


0.9030475

## Remove stop words AND remove punctuation

In [9]:
# Stop words are removed first, then punctuation; the list order is the application order.
score = preprocess_train(
    df_train=data_train,
    df_test=data_test,
    functions=[remove_stop_words, remove_punctuation],
)
score

Words count: 41521362
                                            SentimentText
0       defective really excited get fisherprice amazi...
1       maudio 2496 sound card excellent sound card co...
2       missing links shame quality video poor movie f...
3       tribute album tribute albumi pay close enough ...
4       pretty good taste pretty good filling staying ...
...                                                   ...
999995  tlc need say more tlc best group ever waz ever...
999996  alternative ending excellent book doubt go rob...
999997  ptown series read order know series loved rere...
999998  pretty sad book would play better movie screen...
999999  awesome funky jazz band definetly check out tr...

[1000000 rows x 1 columns]
Words ngrams: 1


0.89808

## Remove stop words AND Lemmatization

In [9]:
# Stop-word removal applied on top of the lemmatized text.
score = preprocess_train(
    df_train=data_train_lem,
    df_test=data_test_lem,
    functions=[remove_stop_words],
)
score

Words count: 41408983
                                            SentimentText
0       defective: really excite get fisher-price amaz...
1       m-audio 2496 sound card: excellent sound card ...
2       miss links: shame quality video poor movie fas...
3       tribute album: tribute album do+not pay close ...
4       pretty good: taste pretty good fill stay away ...
...                                                   ...
999995  tlc need say more: tlc best group ever waz eve...
999996  alternative ending: excellent book doubt go ro...
999997  p-town series: read order do+not know series l...
999998  pretty sad : book would play better movie scre...
999999  awesome funky jazz band definetly check out: t...

[1000000 rows x 1 columns]
Words ngrams: 1


0.89564

## N-GRAMS

In [6]:
# Sweep the classifier's word n-gram setting over 1, 2 and 3 on the unprocessed text.
for i in (1, 2, 3):  # word_ngrams
    score = preprocess_train(
        df_train=data_train,
        df_test=data_test,
        functions=[],
        word_ngrams=i,
    )
    print(score)

Words count: 78450202
                                            SentimentText
0       defective: i was really excited to get the fis...
1       m-audio 2496 sound card: excellent sound card ...
2       missing links: it's a shame the quality of thi...
3       tribute album: this is a tribute album...i did...
4       pretty good: it does taste pretty good and is ...
...                                                   ...
999995  tlc...... need i say more: tlc is the best gro...
999996  alternative ending: an excellent book no doubt...
999997  p-town series: i read these out of order becau...
999998  pretty sad....: this book would play out bette...
999999  awesome funky jazz band, definetly check them ...

[1000000 rows x 1 columns]
Words ngrams: 1
0.9085275
Words count: 78450202
                                            SentimentText
0       defective: i was really excited to get the fis...
1       m-audio 2496 sound card: excellent sound card ...
2       missing links: it's a sh