In [3]:
#import library
import pandas as pd
import numpy as np
import requests
import os
from gensim.models import Word2Vec
from sklearn import cluster
from sklearn import metrics
import TextProcess.text_process as text_process


In [4]:
def read_input(file_name):
    """
    read input file which is tsv file
    
    """
    df =  pd.read_csv(os.path.join('./data',file_name), sep="\t")
    return df

In [5]:
#Import dataset
train=read_input('train.tsv')
train.head(3)

Unnamed: 0.1,Unnamed: 0,product_id,description2,title,label
0,0,100191,JACK OF CLUBS PSA 5 9 OF SPADES PSA 5 7 OF HEA...,(4) 1889 N220 KINNEY HARLEQUIN PSA 5 EX TOBACC...,0
1,1,100217,Powered by Frooition About us Newsletter Feedb...,12 Jars Of Dalfour Beauty Gold Seal EXCEL Beau...,0
2,2,100272,StrawberryNET Check out our eBay store >> Cate...,Perricone MD Vitamin C Ester Eye Serum 15ml/0.5oz,0


In [6]:
test=read_input('dev.tsv')
test.head(3)

Unnamed: 0.1,Unnamed: 0,product_id,description2,title,label
0,0,100394,TOP Gold Light 100's 100MM - 1 Box - 200 Tubes...,TOP Gold Light 100's 100MM - 1 Box - 200 Tubes...,0
1,1,100692,15oceaneshop Add to my favorite sellers Mon. t...,Portable Clear Mini Acrylic Water Pipes Smokin...,0
2,2,100857,StrawberryNET Check out our eBay store >> Cate...,Clarins Daily Energizer Cleansing Gel 75ml/2.5oz,0


In [7]:
def preprocess_text(lists):
    """
    Remove noise, normalize and tokenize text
    """
    preprocess_text_list=[]
    for i in range(len(lists)):
        if lists[i]=='None':
            preprocess_text_list.append('None')
        else:
            try:
                preprocess_text_list.append(text_process.text_processing(lists[i]))
            except:
                preprocess_text_list.append('None')
    return (preprocess_text_list)

In [8]:
#Tokenization
train_description_tokenize=preprocess_text(train['description2']) #Tokenize description of train set
train_title_tokenize=preprocess_text(train['title'])  #Tokenize title of train set
test_description_tokenize=preprocess_text(test['description2'])  #Tokenize description of test set
test_title_tokenize=preprocess_text(test['title']) #Tokenize title of test set

In [9]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, wordtwovec):
        self.wordtwovec = wordtwovec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(wordtwovec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.wordtwovec[w] for w in words if w in self.wordtwovec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [10]:
def Word2Vec_Processing(train_set,test_set,n):
    # let X be a list of tokenized texts (i.e. list of lists of tokens)
    train_w2v=Word2Vec(train_set, size=n)
    test_w2v=Word2Vec(test_set, size=n)
    train_w2v_dict = dict(zip(train_w2v.wv.index2word, train_w2v.wv.syn0))
    test_w2v_dict = dict(zip(test_w2v.wv.index2word, test_w2v.wv.syn0))
    # get vector data
    train_vector = MeanEmbeddingVectorizer(train_w2v_dict).transform(train_set)
    test_vector=MeanEmbeddingVectorizer(test_w2v_dict).transform(test_set)
    return train_vector, test_vector


In [11]:
train_set,test_set=Word2Vec_Processing(train_title_tokenize,test_title_tokenize,200) #Use only title

  "C extension not loaded, training will be slow. "
  """
  


In [12]:
train_set_des,test_set_des=Word2Vec_Processing(train_description_tokenize,test_description_tokenize,200)#Use only description

  """
  


In [13]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
my_tags=['Non-violation','Violation']


In [14]:
#Train with title
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_set, train['label'])
y_pred = logreg.predict(test_set)
print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))



accuracy 0.8252063015753939
               precision    recall  f1-score   support

Non-violation       0.83      1.00      0.90      1101
    Violation       0.00      0.00      0.00       232

    micro avg       0.83      0.83      0.83      1333
    macro avg       0.41      0.50      0.45      1333
 weighted avg       0.68      0.83      0.75      1333



The results show a good precision and recall on non-violation group. However, precision and recall are extremly bad in case of violation group. It seems like the model only capture a samll amount of actual viola

In [15]:
#Train with description only
my_tags=['Non-violation','Violation']
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_set_des, train['label'])
y_pred = logreg.predict(test_set_des)
print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))



accuracy 0.7959489872468117
               precision    recall  f1-score   support

Non-violation       0.84      0.94      0.88      1101
    Violation       0.30      0.13      0.19       232

    micro avg       0.80      0.80      0.80      1333
    macro avg       0.57      0.53      0.53      1333
 weighted avg       0.74      0.80      0.76      1333



In [16]:
#USING OTHER METHOD
#Linear Support Vector Machine

from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
sgd.fit(train_set, train['label']) #Use title
y_pred = sgd.predict(test_set)

print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))

accuracy 0.8259564891222806
               precision    recall  f1-score   support

Non-violation       0.83      1.00      0.90      1101
    Violation       0.00      0.00      0.00       232

    micro avg       0.83      0.83      0.83      1333
    macro avg       0.41      0.50      0.45      1333
 weighted avg       0.68      0.83      0.75      1333



  'precision', 'predicted', average, warn_for)


In [17]:
sgd.fit(train_set_des, train['label']) #Use Description
y_pred = sgd.predict(test_set_des)

print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))


accuracy 0.8042010502625656
               precision    recall  f1-score   support

Non-violation       0.83      0.96      0.89      1101
    Violation       0.28      0.08      0.13       232

    micro avg       0.80      0.80      0.80      1333
    macro avg       0.56      0.52      0.51      1333
 weighted avg       0.74      0.80      0.76      1333





In [22]:
#USING SMOTE TO DEAL WITH IMBALANCE DATASET
#FIT SMOTE

from imblearn.over_sampling import SMOTE
train['label'].value_counts()
smt = SMOTE()
X_train, y_train = smt.fit_sample(train_set_des, train['label'])

In [23]:
#LOGISTIC REGRESSION
logreg = logreg.fit(X_train, y_train)
y_pred = logreg.predict(test_set_des)
print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))



accuracy 0.7749437359339835
               precision    recall  f1-score   support

Non-violation       0.86      0.86      0.86      1101
    Violation       0.35      0.35      0.35       232

    micro avg       0.77      0.77      0.77      1333
    macro avg       0.61      0.61      0.61      1333
 weighted avg       0.77      0.77      0.77      1333



In [27]:
#SUPPORT VECTOR MACHINE
sgd.fit(X_train, y_train) #Use Description
y_pred = sgd.predict(test_set_des)

print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))


accuracy 0.7719429857464366
               precision    recall  f1-score   support

Non-violation       0.83      0.91      0.87      1101
    Violation       0.22      0.12      0.16       232

    micro avg       0.77      0.77      0.77      1333
    macro avg       0.52      0.51      0.51      1333
 weighted avg       0.72      0.77      0.74      1333





In [28]:
#USE NEARMISS TO RESAMPLING
from imblearn.under_sampling import NearMiss
nr = NearMiss()
X_train_nr, y_train_nr = nr.fit_sample(train_set_des, train['label'])



In [29]:
#LOGISTIC REGRESSION
logreg = logreg.fit(X_train_nr, y_train_nr)
y_pred = logreg.predict(test_set_des)
print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))



accuracy 0.668417104276069
               precision    recall  f1-score   support

Non-violation       0.86      0.72      0.78      1101
    Violation       0.25      0.44      0.31       232

    micro avg       0.67      0.67      0.67      1333
    macro avg       0.55      0.58      0.55      1333
 weighted avg       0.75      0.67      0.70      1333



In [30]:
#SUPPORT VECTOR MACHINE
sgd.fit(X_train_nr, y_train_nr) #Use Description
y_pred = sgd.predict(test_set_des)

print('accuracy %s' % accuracy_score(y_pred, test.label))
print(classification_report(test.label, y_pred,target_names=my_tags))

accuracy 0.4891222805701425
               precision    recall  f1-score   support

Non-violation       0.86      0.46      0.60      1101
    Violation       0.20      0.64      0.30       232

    micro avg       0.49      0.49      0.49      1333
    macro avg       0.53      0.55      0.45      1333
 weighted avg       0.74      0.49      0.55      1333



