In [105]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import pandas as pd
from pyvi import ViTokenizer
import re
import string
import codecs

In [106]:
from unicodedata import normalize
import re

def accent_normalizer(word):
    word = normalize('NFD', word)
    accent = re.findall(r'[\u0300\u0301\u0303\u0309\u0323]', word)
    if len(accent) == 0:
        return word
    else:
        word = re.sub(accent[0], '', word)
        span = re.match('.*([aƒÉ√¢e√™io√¥∆°u∆∞y])', word.lower()).end(1)
        word = word[:span] + accent[0] + word[span:]
        return normalize('NFC', word)

In [107]:
def normalize_text(text):
    
    text = re.sub(r'(\D)\1+', r'\1', text)
    text = text.lower()
    
#     text = ' '.join([accent_normalizer(t) for t in text.split()])

    text = text.translate(str.maketrans('', '', string.punctuation))

    text = ViTokenizer.tokenize(text)

    text = text.replace(u'"', u' ')
    text = text.replace(u'Ô∏è', u'')
    text = text.replace('üèª','')
    return text

In [102]:
class DataSource(object):

    def _load_raw_data(self, filename, is_train=True):

        a = []
        b = []

        regex = 'train_'
        if not is_train:
            regex = 'test_'

        with open(filename, 'r') as file:
            for line in file:
                if regex in line:
                    b.append(a)
                    a = [line]
                elif line != '\n':
                    a.append(line)
        b.append(a)

        return b[1:]

    def _create_row(self, sample, is_train=True):

        d = {}
        d['id'] = sample[0].replace('\n', '')
        review = ""

        if is_train:
            for clause in sample[1:-1]:
                review += clause.replace('\n', ' ')
                review = review.replace('.', ' ')

            d['label'] = int(sample[-1].replace('\n', ' '))
        else:
            for clause in sample[1:]:
                review += clause.replace('\n', ' ')
                review = review.replace('.', ' ')


        d['review'] = review

        return d

    def load_data(self, filename, is_train=True):

        raw_data = self._load_raw_data(filename, is_train)
        lst = []

        for row in raw_data:
            lst.append(self._create_row(row, is_train))

        return lst

    def transform_to_dataset(self, x_set,y_set):
        X, y = [], []
        for document, topic in zip(list(x_set), list(y_set)):
            document = normalize_text(document)
            X.append(document.strip())
            y.append(topic)
        return X, y

In [111]:
# ds = DataSource()
# train_data = pd.DataFrame(ds.load_data('data/train.crash'))
# test_data = pd.DataFrame(ds.load_data('data/test.crash', is_train=False))


train_data = pd.read_csv('./data/train.csv')
# test_data = pd.read_csv('./data/test.csv')

In [112]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,id,label,review
0,0,train_000000,0,dung dc sp tot cam on shop ƒë√≥ng_g√≥i s·∫£n_ph·∫©m r...
1,1,train_000001,0,ch·∫•t_l∆∞·ª£ng s·∫£n_ph·∫©m tuy·ªát_v·ªùi son m·ªãn nh∆∞ng kh...
2,2,train_000002,0,ch·∫•t_l∆∞·ª£ng s·∫£n_ph·∫©m tuy·ªát_v·ªùi nh∆∞ng k c√≥ h·ªôp k...
3,3,train_000003,1,m√¨nh h∆°i th·∫•t_v·ªçng 1 ch√∫t v√¨ m√¨nh ƒë√£ k·ª≥_v·ªçng c...
4,4,train_000004,1,l·∫ßn tr∆∞·ªõc m√¨nh mua √°o_gi√≥ m√†u h·ªìng r·∫•t ok m√† ƒë...


In [113]:
classifier = LinearSVC(fit_intercept = True, multi_class='crammer_singer', C=0.5)
# from sklearn.svm import SVC
# classifier = SVC(kernel="linear", C=0.1)
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression(C=0.5)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(train_data.review, train_data.label, test_size=0.3,random_state=42)
X_train, y_train = (X_train,y_train)
X_test, y_test = (X_test, y_test)

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

vectorizer = TfidfVectorizer(use_idf=True, max_features=15000, ngram_range=(1, 5), sublinear_tf = True,norm='l2',smooth_idf=True)
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

joblib.dump(vectorizer, './pre-trained/tf_idf.model')

['./pre-trained/tf_idf.model']

In [116]:
classifier.fit(X_train, y_train)

joblib.dump(classifier, './pre-trained/vsentiment.model')

['./pre-trained/vsentiment.model']

In [121]:
len(X_train) == len(y_train)

True

In [124]:
y_pre = classifier.predict(X_test)
# print(len(y_pre), len(y_train))
a = metrics.f1_score(y_pre, y_test)

print(a)

0.9286351471900088


In [125]:
import joblib
vectorizer = joblib.load('./pre-trained/tf_idf.model')
classifier = joblib.load('./pre-trained/vsentiment.model')

In [126]:
# from sklearn.feature_extraction.text import CountVectorizer
# import joblib

# vectorizer = CountVectorizer(use_idf=True, max_features=15000, ngram_range=(1, 5), sublinear_tf = True,norm='l2',smooth_idf=True)
# X_train = vectorizer.fit_transform(X_train).toarray()
# X_test = vectorizer.transform(X_test).toarray()

In [127]:
train_transform = vectorizer.transform(train_data['review'])[:15800]
train_labels = train_data['label'][:15800]

# test_transform = vectorizer.transform(test_data['review'])
# test_labels = classifier.predict(test_transform)

In [98]:
vectorizer.vocabulary_

{'ƒë√≥ng': 14063,
 'g√≥i': 3373,
 's·∫£n': 10610,
 'ph·∫©m': 8896,
 'k√©m': 5245,
 'ƒë·∫ø': 14635,
 'gi√†y': 3095,
 'kh√¥ng': 4790,
 'ch·∫Øc': 1261,
 'ch·∫Øn': 1349,
 'ƒë√°ng': 13861,
 'ti·ªÅn': 11728,
 'b·ªã': 554,
 'm√≥p': 6928,
 'ƒë√≥ng g√≥i': 14064,
 'g√≥i s·∫£n': 3434,
 's·∫£n ph·∫©m': 10613,
 'ph·∫©m k√©m': 8948,
 'ƒë·∫ø gi√†y': 14636,
 'gi√†y kh√¥ng': 3106,
 'kh√¥ng ch·∫Øc': 4825,
 'ch·∫Øc ch·∫Øn': 1263,
 'ch·∫Øn kh√¥ng': 1376,
 'kh√¥ng ƒë√°ng': 4991,
 'ƒë√°ng ti·ªÅn': 13870,
 'ti·ªÅn gi√†y': 11748,
 'gi√†y b·ªã': 3097,
 'b·ªã m√≥p': 598,
 'ƒë√≥ng g√≥i s·∫£n': 14106,
 'g√≥i s·∫£n ph·∫©m': 3435,
 's·∫£n ph·∫©m k√©m': 10660,
 'kh√¥ng ch·∫Øc ch·∫Øn': 4826,
 'ch·∫Øc ch·∫Øn kh√¥ng': 1289,
 'kh√¥ng ƒë√°ng ti·ªÅn': 4992,
 'gi√†y b·ªã m√≥p': 3098,
 'ƒë√≥ng g√≥i s·∫£n ph·∫©m': 14107,
 'g√≥i s·∫£n ph·∫©m k√©m': 3440,
 'ƒë√≥ng g√≥i s·∫£n ph·∫©m k√©m': 14110,
 'r·∫•t': 9472,
 'ƒë·∫πp': 14443,
 'v√†': 12755,
 'shop': 10026,
 'ph·ª•c': 9130,
 'v·ª•': 13355,
 'ch·∫•t': 1112,
 'l∆∞·ª£ng': 5823,

In [82]:
import scipy

# data_transform = scipy.sparse.vstack((train_transform, test_transform))
# print(train_transform.shape, test_transform.shape, data_transform.shape)

In [70]:
len(list(test_data.review))

10981

In [132]:
data = []

for text, label in zip(list(train_data['review'])[:15800], list(train_labels)[:15800]):
    data.append((label, text.strip()))

In [133]:
joblib.dump((train_transform, data), './pre-trained/data.db')

['./pre-trained/data.db']

In [53]:
# joblib.dump(classifier, './pre-trained/vsentiment.model')

In [11]:
# text = "s·∫£n ph·∫©m n√†y x·∫•u qu√°"
# text = normalize_text(text)
# x = vectorizer.transform([text])
# y_pred = classifier.predict_proba(x)

# print(y_pred)

In [1]:
import re
import string
from unicodedata import normalize
def accent_normalizer(word):
    word = normalize('NFD', word)
    accent = re.findall(r'[\u0300\u0301\u0303\u0309\u0323]', word)
    if len(accent) == 0:
        return word
    else:
        word = re.sub(accent[0], '', word)
        span = re.match('.*([aƒÉ√¢e√™io√¥∆°u∆∞y])', word.lower()).end(1)
        word = word[:span] + accent[0] + word[span:]
        return normalize('NFC', word)

In [6]:
accent_normalizer('Uy·ªÉn')

'Uy·∫ªÃÇn'

In [None]:
span = re.match('.*([aƒÉ√¢e√™io√¥∆°u∆∞y])', '').end(1)