In [19]:
# installing fasttext
pip install fasttext



In [37]:
#importing packages
import fasttext
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import csv
import bz2
from bz2 import decompress
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras import models, layers, optimizers
import tensorflow

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Data Preparations**

In [0]:
# unarchiving train & test data
!bunzip2 test.ft.txt.bz2
!bunzip2 train.ft.txt.bz2

In [0]:
# reading and preparing train & test data

#train
train_file = open('train.ft.txt')
train_data = train_file.readlines()

# test
test_file = open('test.ft.txt')
test_data = test_file.readlines()

In [0]:
# check & look at train data
train_data[1:5]

["__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you\'ve played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time\'s Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Tri

In [0]:
# check & look at test data
test_data[1:5]

["__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of songs in my other video game soundtracks. I must admit that one of the songs (Life-A Distant Promise) has brought tears to my eyes on many occasions.My one complaint about this soundtrack is that they use guitar fretting effects in many of the songs, which I find distracting. But even if those weren't included I would still consider the collection worth it.\n",
 '__label__1 Batteries died within a year ...: I bought this charger in Jul 2003 and it worked OK for a while. The design is nice and convenient. However, after about

In [0]:
# data preprocessing
data = pd.DataFrame(train_data)
data.head()

Unnamed: 0,0
0,__label__2 Stuning even for the non-gamer: Thi...
1,__label__2 The best soundtrack ever to anythin...
2,__label__2 Amazing!: This soundtrack is my fav...
3,__label__2 Excellent Soundtrack: I truly like ...
4,"__label__2 Remember, Pull Your Jaw Off The Flo..."


In [0]:
# data preprocessing
data.to_csv("train.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [0]:
# removing  __label__1 / __label__2 from the test set
test_clean = [w.replace('__label__2 ', '') for w in test_data]
test_clean = [w.replace('__label__1 ', '') for w in test_clean]

In [0]:
# check
test_clean[1:5]

["One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of songs in my other video game soundtracks. I must admit that one of the songs (Life-A Distant Promise) has brought tears to my eyes on many occasions.My one complaint about this soundtrack is that they use guitar fretting effects in many of the songs, which I find distracting. But even if those weren't included I would still consider the collection worth it.\n",
 'Batteries died within a year ...: I bought this charger in Jul 2003 and it worked OK for a while. The design is nice and convenient. However, after about a year, the batteries

In [0]:
# removing  __label__1 / __label__2 from the train set
train_clean = [w.replace('__label__2 ', '') for w in train_data]
train_clean = [w.replace('__label__1 ', '') for w in train_clean]

**Target Metric**


For the target metric I choose F1 score as it consideres both precision and recall and gives a more holistic understanding of model's performance

**Fasttext Modelling**

In [0]:
# fasttext model
model = fasttext.train_supervised('train.txt',label_prefix='__label__', thread=4, epoch = 15)

In [0]:
# fasttext model predicting 
pred = model.predict(test_clean)

In [0]:
# check the predictions
print(pred[0][0], ': predicted')
print(pred[0][1], ': probability score')

['__label__2'] : predicted
['__label__2'] : probability score


In [0]:
# recoding to 0/1 system 
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_data]
pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]

In [0]:
# run the accuracy measure. 
print(f1_score(labels, pred_labels))

0.9168198517918282


**Hard baseline** 

*Count Vectorizer + SVM*

In [0]:
# creating train DF
data = pd.DataFrame(train_clean)

In [0]:
# converting 1 = positive, 0 = negative
# creating DF
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_data]
text_lbl = pd.DataFrame(train_labels)

In [7]:
# final train DF
final_train_df = pd.concat([text_lbl, data],axis=1)
final_train_df.columns=['label','text']
final_train_df.head()

Unnamed: 0,label,text
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...
3,1,Excellent Soundtrack: I truly like this soundt...
4,1,"Remember, Pull Your Jaw Off The Floor After He..."


In [0]:
# preprocessing test DF
data_test = pd.DataFrame(test_clean)

In [0]:
# preprocessing test DF
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_data]
text_lbl_test = pd.DataFrame(test_labels)

In [10]:
# creating test DF
final_test_df = pd.concat([text_lbl_test,data_test],axis=1)
final_test_df.columns=['label','text']
final_test_df.head()

Unnamed: 0,label,text
0,1,Great CD: My lovely Pat has one of the GREAT v...
1,1,One of the best game music soundtracks - for a...
2,0,Batteries died within a year ...: I bought thi...
3,1,"works fine, but Maha Energy is better: Check o..."
4,1,Great for the non-audiophile: Reviewed quite a...


In [0]:
# defining function for text cleaning
def process_text(data): 
    data_low = data.str.lower()
    data_punct = data_low.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation)))
    data_nostop=[word for word in data_punct if word not in stopwords.words('english')]
    return data_nostop

In [0]:
# applying function for text cleaning for train set
final_train_df['processed_text'] = process_text(final_train_df['text'])

In [13]:
final_train_df.head()

Unnamed: 0,label,text,processed_text
0,1,Stuning even for the non-gamer: This sound tra...,stuning even for the nongamer this sound track...
1,1,The best soundtrack ever to anything.: I'm rea...,the best soundtrack ever to anything im readin...
2,1,Amazing!: This soundtrack is my favorite music...,amazing this soundtrack is my favorite music o...
3,1,Excellent Soundtrack: I truly like this soundt...,excellent soundtrack i truly like this soundtr...
4,1,"Remember, Pull Your Jaw Off The Floor After He...",remember pull your jaw off the floor after hea...


In [0]:
#applying function for text cleaning for test set
final_test_df['processed_text'] = process_text(final_test_df['text'])

In [15]:
final_test_df.head()

Unnamed: 0,label,text,processed_text
0,1,Great CD: My lovely Pat has one of the GREAT v...,great cd my lovely pat has one of the great vo...
1,1,One of the best game music soundtracks - for a...,one of the best game music soundtracks for a ...
2,0,Batteries died within a year ...: I bought thi...,batteries died within a year i bought this ch...
3,1,"works fine, but Maha Energy is better: Check o...",works fine but maha energy is better check out...
4,1,Great for the non-audiophile: Reviewed quite a...,great for the nonaudiophile reviewed quite a b...


*Due to RAM limitations I had to extremely minimize the number of lines in order to get just anything*

In [0]:
# fighting RAM limitation
x_train = final_train_df['processed_text'].iloc[0:100]
x_test = final_test_df['processed_text'].iloc[0:100]
y_train = final_train_df['label'].iloc[0:100]
y_test = final_test_df['label'].iloc[0:100]

In [0]:
# building a classifier
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel='rbf'))])

In [37]:
# fitting a model
text_clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,

In [0]:
# predicting
predicted_svm = text_clf.predict(x_test)

In [40]:
# evaluating F1 score
print(f1_score(y_test, predicted_svm))

0.7123287671232875


*Though it seems much worse than fasttext, comparison is incorrect due to severe data limitation that I forced on the model. 
On the other hand I can note that fasttext didn't need that much RAM and I was able to process whole dataset really fast*

**ANN**

*Once again I couldn't fight against RAM, sorry*

In [0]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    x_train, y_train, random_state=0, test_size=0.2)

In [0]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)

In [0]:
test_labels = y_test
test_texts = x_test

In [0]:
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [0]:
maxlength = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=maxlength)
val_texts = pad_sequences(val_texts, maxlen=maxlength)
test_texts = pad_sequences(test_texts, maxlen=maxlength)

In [0]:
sequence = layers.Input(shape=(maxlength,))
embedded = layers.Embedding(20000, 64)(sequence)
newlayer = layers.Conv1D(32, 3, activation='relu')(embedded)
newlayer = layers.BatchNormalization()(newlayer)
newlayer = layers.MaxPool1D(3)(newlayer)
newlayer = layers.Conv1D(64, 5, activation='relu')(newlayer)
newlayer = layers.GlobalMaxPool1D()(newlayer)
newlayer = layers.Flatten()(newlayer)
output_layer = layers.Dense(1, activation='sigmoid')(newlayer)

In [0]:
model = models.Model(sequence, output_layer)
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
    )

In [53]:
model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=5,
    validation_data=(val_texts, val_labels), )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f372f983208>

In [0]:
predictions = model.predict(test_texts)

In [0]:
predictions1 = np.argmax(predictions, axis=1)

In [56]:
print(f1_score(test_labels, predictions1 , average="macro"))

0.31972789115646255


*Well, this is extremely poor! first, once again I used not the whole data but the results are worse than CountVectorizer+SVM* 

*I need to experiment more with the structure of the network itself*


**Overall comment**


*I can't exactly compare the scores as only Fasttext model was applied to the whole dataset.* 

*But I should say I would choose **fasttext.** After endless times of my computer showing warnig signs, restarting, me moving to collab, reading collab messages about RAM limitation & automatic restart... After 10th time it's not even funny, it's really really sad. Fasttext was indeed fast and the result of 91,6 is not that bad in relation to the small amount of time that was used to build it*