# Text classification with fasttext using sklearn wrapper

In [None]:
!pip install skift

### Prepare Train IMDB Sentiment Data

In [2]:
import pandas as pd

In [23]:
df = pd.read_csv('/home/jupyter/data/IMDB Dataset.csv')
df = df[:5000]
df.loc[df['sentiment'] == 'positive', 'sentiment'] = 1
df.loc[df['sentiment'] == 'negative', 'sentiment'] = 0
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [24]:
import string
import re

import spacy


nlp = spacy.load('en_core_web_sm', disable=['ner', 'tagger'])


def normalize(text):
    text = text.lower().strip().replace('\n', ' ')
    my_tokens = nlp(text)
    
    # Lemmatizing each token and converting each token into lowercase
    my_tokens = [word.lemma_.lower() if word.lemma_ != '-PRON-' else word.lower_ 
                 for word in my_tokens]
    
    return ' '.join([word for word in my_tokens if word not in string.punctuation])
    return ' '.join(my_tokens)

### Normalize review with spaCy tokenization, lemmization, and punctuation removal

In [25]:
df['normalized_review'] = df['review'].apply(normalize)

In [26]:
df.to_csv('/home/jupyter/data/IMDB_Dataset_5000_normalized.csv', index=False)

### Training

In [27]:
from sklearn.model_selection import train_test_split

X = df['normalized_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [48]:
from sklearn.pipeline import Pipeline
from skift import FirstColFtClassifier

classifier = FirstColFtClassifier(lr=0.2, epoch=100, wordNgrams=2, bucket=200000, dim=50, loss='hs')

from time import time
print('Fitting model...')
t0 = time()
classifier.fit(pd.DataFrame({'review': X_train.values}), list(y_train.values))
print('Time taken: ', (time() - t0))

Fitting model...
Time taken:  5.3542749881744385


### Testing

In [69]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from time import time
predicted = classifier.predict(pd.DataFrame({'review': X_test.values}))
print(confusion_matrix(list(y_test.values), predicted))
print(classification_report(list(y_test.values), predicted))
print(accuracy_score(list(y_test.values), predicted))
print('Time taken: ', (time() - t0) / 60)

[[451  55]
 [ 67 427]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       506
           1       0.89      0.86      0.88       494

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000

0.878
Time taken:  3.9945039709409076


### Testing with random text

In [70]:
test_list = [
    ('The movie is good in the first part but I find very boring onwards', 0),
    ('Although this is a budget movie and actors are not famous, I really enjoy watching it.', 1),
    ('This movie is so sick! One of the best!', 1),
]
test_df = pd.DataFrame(test_list, columns=['review', 'sentiment'])
test_df['review'] = test_df['review'].apply(normalize)

In [74]:
from time import time
predicted = classifier.predict(test_df[['review']])
print(confusion_matrix(test_df['sentiment'], list(predicted), labels=[1, 0]))
print(classification_report(test_df['sentiment'], list(predicted), labels=[1, 0]))

[[2 0]
 [0 1]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           0       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

