In [34]:
## Data mining research project

In [51]:
## depencies
from sklearn.pipeline import Pipeline
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import spacy, en_core_web_sm

## import dependencies for training model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics
from sklearn.svm import SVC

## depencies for manipulating data
import nltk
import pandas as pd
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
import string
import re

In [52]:
## import given training data set
## use dataset_1: laptop review for training model

dataset = pd.read_csv('data-2_train.csv', na_filter=True)
dataset.columns=['example_id', 'text', 'aspect_term', 'term_location', 'class']
dataset['text']=dataset['text'].replace("\[comma]", "", regex=True)

## data preprocessing phase

In data preprocessing phase, I removed all tags, special character, twitter handler, newline character, punctuation and non-ascii characters. Note that sometimes removing punctuation and stopwords cause the accuracy get decreased. 

In [53]:
def preprocessing_txt(dataset):
    stop_words = set(stopwords.words('english'))
    corpus=[]
    for elm in range(0, len(dataset.index)):
        res=' '.join([i for i in dataset['text'][elm].lower().split() if i not in stop_words])
        res=re.sub("</?.*?>"," <> ",dataset['text'][elm])    # remove tags
        res=re.sub("(\\d|\\W)+"," ",dataset['text'][elm])        # remove special characte
        res=re.sub(r'@([A-Za-z0-9_]+)', "",dataset['text'][elm])  # remove twitter handler
        res=re.sub('(\r)+', "", dataset['text'][elm])            # remove newline character
        res=re.sub('[^\x00-\x7F]+', "", dataset['text'][elm])    # remove non-ascii characters
        res=''.join(x for x in dataset['text'][elm] if x not in set(string.punctuation))   ## remove punctuation
        corpus.append(res)
    return corpus

corpus=preprocessing_txt(dataset)

## Build up features with TF-IDF scheme by using bigram phrase

In [54]:
nlp=en_core_web_sm.load()

def bigphrase_tfidf_feats(corpus):
    lemmetized_sent=[]
    for each_sent in nlp.pipe(corpus, batch_size=50, n_threads=-1):
        if each_sent.is_parsed:
            res=[tok.lemma_ for tok in each_sent if not tok.is_punct or tok.is_space or tok.is_stop or tok.like_num]
            lemmetized_sent.append(res)
        else:
            lemmetized_sent.append(None)
    bigram=Phraser(Phrases(lemmetized_sent))
    bigram_lem=list(bigram[lemmetized_sent])
    parsed=[]
    for k in range(0, len(bigram_lem)):
        joined=' '.join(bigram_lem[k])
        parsed.append(joined)
    return parsed

In [55]:
## bigram phrase
bigram=bigphrase_tfidf_feats(corpus)

# Split test and train data using bigram phrase
targetClass=dataset['class'].values
x_train, x_test, y_train, y_test = train_test_split(bigram, targetClass, test_size = 0.2, stratify = targetClass)
text_pipe_nb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
text_pipe_nb.fit(x_train,y_train)
predicted = text_pipe_nb.predict(x_test)
print("accuracy metrics for training naive bayes classifier:\n",metrics.classification_report(y_test, predicted, target_names = ['1','0','-1']))

## use bigram phrase to train logistic regression
LR = LogisticRegression()
text_pipe_LR = Pipeline([('tfidf', TfidfVectorizer()), ('LR', LR)])
text_pipe_LR.fit(x_train,y_train)
predicted = text_pipe_LR.predict(x_test)
print("accuracy metrics for logistic regression classifier:\n",metrics.classification_report(y_test, predicted, target_names = ['1','0','-1']))

##
sgd_clf = linear_model.SGDClassifier(max_iter=1000)
text_pipe_sgd = Pipeline([('tfidf', TfidfVectorizer()), ('sgd', sgd_clf)])
text_pipe_sgd.fit(x_train,y_train)
predicted = text_pipe_sgd.predict(x_test)
print("accuracy metrics for SGD classifier:\n",metrics.classification_report(y_test, predicted, target_names = ['1','0','-1']))

accuracy metrics for training naive bayes classifier:
              precision    recall  f1-score   support

          1       0.77      0.25      0.38       161
          0       0.76      0.10      0.18       127
         -1       0.66      0.99      0.79       433

avg / total       0.70      0.67      0.59       721

accuracy metrics for logistic regression classifier:
              precision    recall  f1-score   support

          1       0.72      0.40      0.51       161
          0       0.75      0.30      0.43       127
         -1       0.72      0.97      0.83       433

avg / total       0.73      0.73      0.69       721

accuracy metrics for SGD classifier:
              precision    recall  f1-score   support

          1       0.68      0.61      0.65       161
          0       0.58      0.51      0.54       127
         -1       0.82      0.88      0.85       433

avg / total       0.75      0.76      0.75       721



In [56]:
## test data part
Data_1_test = pd.read_csv('Data-2_test.csv', na_filter=True)
Data_1_test.columns=['example_id', 'text', 'aspect_term', 'term_location']
Data_1_test['text']=Data_1_test['text'].replace("\[comma]", "", regex=True)

## 
Data_1_test_corpus=preprocessing_txt(Data_1_test)

In [57]:
## build up features from test data
test_bigram=bigphrase_tfidf_feats(Data_1_test_corpus)


In [58]:
## use trained naive-bayes classifier on new test data
test_predicted_nb=text_pipe_nb.predict(test_bigram)
with open('Data_1_test_nb.txt', 'w') as file:
        for index,i in enumerate(test_predicted_nb):
            file.write(str(Data_1_test['example_id'][index]) + ';;' + str(i) + '\n')
        file.close()
        
##
test_predicted_LR=text_pipe_LR.predict(test_bigram)
with open('Data_1_test_LR.txt', 'w') as file:
        for index,i in enumerate(test_predicted_LR):
            file.write(str(Data_1_test['example_id'][index]) + ';;' + str(i) + '\n')
        file.close()
        
##
test_predicted_sgd=text_pipe_sgd.predict(test_bigram)
with open('Data_1_test_sgd.txt', 'w') as file:
        for index,i in enumerate(test_predicted_sgd):
            file.write(str(Data_1_test['example_id'][index]) + ';;' + str(i) + '\n')
        file.close()