In [34]:
import re
from collections import Counter

import fasttext
import numpy as np
import pandas as pd
from pyvi import ViTokenizer
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [35]:
# Read the pre-processed review dataset (UTF-8 encoded CSV) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='data_processed.csv',
    encoding='utf-8',
)

In [36]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(13872, 6)

In [37]:
# Class balance of the `ground` label column.
# `Counter` is imported together with the other dependencies in the first cell.
Counter(df['ground'])

Counter({1: 10888, 0: 2984})

In [38]:
# Convert text to lowercase
df['content'] = df['content'].str.lower()
# Remove numbers and words with numbers.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would silently make this a no-op.
# Raw strings avoid invalid-escape warnings for `\w` / `\d` / `\s`.
df['content'] = df['content'].str.replace(r'\w*\d\w*', ' ', regex=True)
# Remove punctuation (anything that is neither a word character nor whitespace)
df['content'] = df['content'].str.replace(r'[^\w\s]', ' ', regex=True)
# Collapse runs of whitespace and strip leading/trailing whitespace.
# `.str.join` propagates missing values instead of raising on them.
df['content'] = df['content'].str.split().str.join(' ')
# Tokenize
# df['content'] = df['content'].apply(lambda x : ViTokenizer.tokenize(x))

In [None]:
# Preview only the first few cleaned reviews instead of displaying the whole column.
df['content'].head(10)

In [11]:
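# Optional export of the cleaned frame (disabled). Uncomment to write a
# tab-separated, UTF-16 encoded copy for manual checking:
# df.to_csv('checked/data_processed.csv', sep='\t', encoding='utf-16')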

In [40]:
# Split the reviews by their ground-truth label.
dfP = df.loc[df['ground'] == 1]
dfN = df.loc[df['ground'] == 0]
print(dfP.shape, dfN.shape)
# Build the working set: every ground == 0 row plus only the first 3254
# ground == 1 rows, so the two classes end up roughly the same size.
dff = pd.concat([dfN, dfP.iloc[:3254]])
dff.shape

(10888, 6) (2984, 6)


(6238, 6)

In [41]:
# Shuffle the rows so the two classes are interleaved.
# A fixed `random_state` makes the row order reproducible across kernel restarts.
dff = shuffle(dff, random_state=42)
Counter(dff['ground'])

Counter({1: 3254, 0: 2984})

In [42]:
# Hold out 20% of the balanced data for the final evaluation.
# A fixed `random_state` keeps the split (and therefore the reported scores) reproducible.
train, test = train_test_split(dff, test_size=0.2, random_state=42)

In [43]:
# Inspect the last few rows of the training split.
train.tail()

Unnamed: 0,content,predicted,ground,star,item,brand
1451,loa nghe rõ lớn pin tốt được khoảng hơn ngày n...,1,1,4,nokia-105,nokia
6359,lúc đầu thì rất êm rất mượt nhưng chừng một th...,0,0,3,samsung-galaxy-j1,samsung
4179,xài gần tháng cảm thấy máy bị đơ vân tay chạm ...,0,0,2,vivo-v7,vivo
13508,nói chung tầm giá này thì k đòi hỏi gì nhiều c...,0,0,4,xiaomi-redmi-7a,xiaomi
967,máy có thiết kế trẻ trung sành điệu theo xu hư...,1,1,5,coolpad-n5,coolpad


In [44]:
# Plain Python lists of the training reviews and their ground-truth labels.
texts, labels = (train[column].tolist() for column in ('content', 'ground'))

In [45]:
# Convert file to FastText format
def store_file_fasttext_format(outfile_dir, texts, labels):
    """
    Store texts and labels to fasttext format file

    One line is written per example, in the form ``__label__<label> <text>``.

    Parameters
    ----------
    outfile_dir : str
        Path of the file to write; an existing file is overwritten.
    texts : list
        Example texts. Each text is expected to be a single line, because a
        newline inside a text would start a new (unlabelled) line in the file.
    labels : list
        Labels aligned with `texts` (same length, same order).

    Raises
    ------
    ValueError
        If `texts` and `labels` do not have the same length.
    """
    # Validate before opening so a bad call never truncates an existing file.
    if len(texts) != len(labels):
        raise ValueError(
            'texts and labels must have the same length (got %d and %d)'
            % (len(texts), len(labels)))
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII (e.g. Vietnamese) text.
    with open(outfile_dir, 'w', encoding='utf-8') as f:
        for label, text in zip(labels, texts):
            f.write('__label__%s %s\n' % (label, text))

In [46]:
class FastTextEstimator(BaseEstimator):
    """Estimator wrapper that trains and queries a supervised fastText model.

    Exposes `fit`, `predict` and `score`, and is used with
    `cross_val_score` elsewhere in this notebook.
    """

    def __init__(self, model_dir, epoch_val=5, lr_val=0.1):
        """Store the path prefix and the training hyperparameters.

        Parameters
        ----------
        model_dir : str
            Prefix that is concatenated with 'train.txt' to build the path of
            the training file (no separator is inserted).
        epoch_val : int
            Passed to fastText as `epoch`.
        lr_val : float
            Passed to fastText as `lr`.
        """
        self.model_dir = model_dir
        self.epoch_val = epoch_val
        self.lr_val = lr_val
        # Set by `fit`; remains None until the estimator has been trained.
        self.model = None

    def fit(self, texts, labels):
        """Train the fastText model on `texts` and `labels`.

        The data is first written to `<model_dir>train.txt` in fastText
        format, then that file is used for supervised training.

        Parameters
        ----------
        texts : list
        labels : list

        Returns: this estimator (self)
        """
        train_path = self.model_dir+'train.txt'
        # Dump the training examples where fastText can read them.
        store_file_fasttext_format(train_path, texts, labels)
        # Train with the configured hyperparameters and a fixed dimension of 300.
        self.model = fasttext.train_supervised(
            train_path,
            dim=300,
#             pretrainedVectors=self.model_dir+'cc.vi.300.vec',
            epoch=self.epoch_val,
            lr=self.lr_val)
        return self

    def predict(self, texts):
        """Predict an integer class label for each of the provided texts.

        Parameters
        ----------
        texts : list

        Returns: numpy array holding 0 where the first predicted label is
        '__label__0' and 1 otherwise
        """
        # Element 0 of the model's return value holds the predicted labels
        # (presumably one label sequence per input text).
        label_groups = self.model.predict(texts)[0]
        # Map the textual fastText labels to integers so they can be compared
        # with the integer ground-truth labels by the metric functions.
        return np.array(
            [int(group[0] != '__label__0') for group in label_groups])

    def score(self, texts, labels):
        """Compute the accuracy of the predictions for `texts` against `labels`.

        Parameters
        ----------
        texts : list
        labels : list

        Return: accuracy score
        """
        return accuracy_score(labels, self.predict(texts))

In [47]:
# Path prefix for the fastText training file ('' = current working directory).
MODEL_DIR = ''
# Pass the hyperparameters to the constructor instead of re-invoking
# `__init__` on an already constructed estimator.
clf = FastTextEstimator(model_dir=MODEL_DIR, epoch_val=29, lr_val=0.1)
# 10-fold stratified cross-validation on the training split, scored by accuracy.
scores = cross_val_score(
    clf,
    texts,
    labels,
    cv=StratifiedKFold(n_splits=10),
    scoring='accuracy')
print(scores)
scores.mean()

[0.872      0.884      0.88577154 0.89178357 0.90180361 0.88977956
 0.8757515  0.85571142 0.8935743  0.87951807]


0.8829693571882722

In [48]:
# Train on the full training split with the same hyperparameters as the
# cross-validation run, then report accuracy on the held-out test split.
# The hyperparameters go straight to the constructor rather than through a
# second, explicit `__init__` call.
clf = FastTextEstimator(model_dir=MODEL_DIR, epoch_val=29, lr_val=0.1)
clf.fit(texts, labels)
clf.score(test['content'].tolist(), test['ground'].tolist())

0.8990384615384616