In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from gensim import models
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import pickle

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# ! pip install stopwords
# ! pip install stop_words
# ! pip install gensim
# ! pip install python-Levenshtein
# ! pip install pickle-mixin

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def analysis(labels, predictions):
    print("Report Classification: \n", classification_report(labels, predictions))
    print("Matrix Confusion: \n", confusion_matrix(labels, predictions))
    print("Accuracy: \n", accuracy_score(labels,predictions))

### read_data

In [5]:
data_set2 = pd.read_csv('dataset2.csv')

X2 = data_set2['comment']
Y2 = data_set2['sentiment']


# label binarization
label_binarizer = LabelBinarizer()
Y2 = label_binarizer.fit_transform(Y2)
Y2 = np.ravel(Y2)

### pre-processing functions

In [6]:
X_train2, X_val2, Y_train2, Y_val2 = train_test_split(X2, Y2, test_size=0.1, random_state=5)

In [7]:
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)

def lemmatize(text):
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer()
    
    tagged_words = pos_tag(text)
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0:
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1
    return cleaned_str

def clean(text):
    text = str(text).lower() #lowercase
    text = re.sub(r'\bid\b', 'i would', text) #start abbreviation
    text = re.sub(r'\bive\b', 'i have', text)
    text = re.sub(r'\bim\b', 'i am', text)
    text = re.sub(r'\bcant\b', 'can not', text)
    text = re.sub(r'\bdont\b', 'do not', text)
    text = re.sub(r'\bwont\b', 'will not', text)
    text = re.sub(r'\bthats\b', 'that is', text) #end abbreviation
    text = re.sub('[0-9]+', '', text) # delete numbers
    text = re.sub(r'[^\x00-\x7F]+',' ', text) #remove non-ascii
    text = re.sub('[<>{}=~.,،:\\!?\\-()\\[\\]#/@"]+|[_x000D_]+|\u200c+|[\r\n]', ' ', text) #remove punctuations
    word_list = nltk.word_tokenize(text)
    text = lemmatize(word_list)
    word_list = text.split()
    word_list = list(filter(lambda word: word not in stop_words, word_list)) # delete stopwords
    word_list = [w for w in word_list if len(w)>1] # delete len = 1
    return word_list


# W2V

In [8]:
corpus_text_train2 = '\n'.join(X_train2) # just for X_train fits
sentences_train2 = corpus_text_train2.split('\n')
sentences_train2 = [clean(line) for line in sentences_train2]

model = Word2Vec(sentences_train2, window=5, min_count=3, workers=4)
vectors = model.wv

w2v_sentences_train2 = []
for index in range(len(sentences_train2)):
    temp = []
    for word in sentences_train2[index]:
        try:
            temp.append(vectors[word])
        except:
            pass
    w2v_sentences_train2.append(np.mean(temp, axis=0))
    
    
corpus_text_val2 = '\n'.join(X_val2) # just apply for X_test
sentences_val2 = corpus_text_val2.split('\n')
sentences_val2 = [clean(line) for line in sentences_val2]

w2v_sentences_val2 = []
for index in range(len(sentences_val2)):
    temp = []
    for word in sentences_val2[index]:
        try:
            temp.append(vectors[word])
        except:
            pass
    w2v_sentences_val2.append(np.mean(temp, axis=0))

## MLP: W2V

In [9]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50),(10,30,10),(20,), (50,), (100,), (150,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05, 0.1],
    'learning_rate': ['constant','adaptive'],
}

best_param = {
    'hidden_layer_sizes': parameter_space['hidden_layer_sizes'][-1],
    'activation': parameter_space['activation'][-1],
    'solver': parameter_space['solver'][-1],
    'alpha': parameter_space['alpha'][-1],
    'learning_rate': parameter_space['learning_rate'][-1],
}

best_score = 0

In [10]:
for hls in parameter_space['hidden_layer_sizes']:
    for ac in parameter_space['activation']:
        for so in parameter_space['solver']:
            for al in parameter_space['alpha']:
                for lr in parameter_space['learning_rate']:
                    clf = MLPClassifier(hidden_layer_sizes=hls,learning_rate=lr,alpha=al,solver=so,activation=ac,max_iter=100)
                    clf.fit(w2v_sentences_train2, Y_train2) 
                    score = accuracy_score(Y_val2, clf.predict(w2v_sentences_val2))
                    
                    if score > best_score:
                        best_score = score
                        best_param['hidden_layer_sizes'] = hls
                        best_param['activation'] = ac
                        best_param['solver'] = so
                        best_param['alpha'] = al
                        best_param['learning_rate'] = lr
print(best_score)
print(best_param)

0.6
{'hidden_layer_sizes': (20,), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.1, 'learning_rate': 'adaptive'}


In [11]:
analysis(Y_val2, clf.predict(w2v_sentences_val2))

Report Classification: 
               precision    recall  f1-score   support

           0       0.44      0.72      0.55        25
           1       0.22      0.08      0.12        25

    accuracy                           0.40        50
   macro avg       0.33      0.40      0.33        50
weighted avg       0.33      0.40      0.33        50

Matrix Confusion: 
 [[18  7]
 [23  2]]
Accuracy: 
 0.4


## Fine Tune on pretuned MLP

### W2V from GoogleNews-vectors to make same size inputs

In [12]:
corpus_text_train2 = '\n'.join(X_train2) # just for X_train fits
sentences_train2 = corpus_text_train2.split('\n')
sentences_train2 = [clean(line) for line in sentences_train2]

vectors = models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

w2v_sentences_train2 = []
for index in range(len(sentences_train2)):
    temp = []
    for word in sentences_train2[index]:
        try:
            temp.append(vectors[word])
        except:
            pass
    w2v_sentences_train2.append(np.mean(temp, axis=0))
    
    
corpus_text_val2 = '\n'.join(X_val2) # just apply for X_test
sentences_val2 = corpus_text_val2.split('\n')
sentences_val2 = [clean(line) for line in sentences_val2]

w2v_sentences_val2 = []
for index in range(len(sentences_val2)):
    temp = []
    for word in sentences_val2[index]:
        try:
            temp.append(vectors[word])
        except:
            pass
    w2v_sentences_val2.append(np.mean(temp, axis=0))

In [13]:
filename = 'best.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

In [14]:
loaded_model = loaded_model.partial_fit(w2v_sentences_train2, Y_train2)
loaded_model = loaded_model.partial_fit(w2v_sentences_train2, Y_train2)
loaded_model = loaded_model.partial_fit(w2v_sentences_train2, Y_train2)

In [15]:
analysis(Y_val2, loaded_model.predict(w2v_sentences_val2))

Report Classification: 
               precision    recall  f1-score   support

           0       0.95      0.72      0.82        25
           1       0.77      0.96      0.86        25

    accuracy                           0.84        50
   macro avg       0.86      0.84      0.84        50
weighted avg       0.86      0.84      0.84        50

Matrix Confusion: 
 [[18  7]
 [ 1 24]]
Accuracy: 
 0.84
