In [1]:
import itertools
import re

import numpy as np
import pandas as pd

In [2]:
from gensim import models
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
import pickle

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# ! pip install stopwords
# ! pip install stop_words
# ! pip install gensim
# ! pip install python-Levenshtein
# ! pip install pickle-mixin

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def analysis(labels, predictions, target_names=('positive', 'negative')):
    """Print a classification report, the confusion matrix and the accuracy.

    Args:
        labels: True class labels of the evaluated samples.
        predictions: Predicted class labels, aligned with ``labels``.
        target_names: Display names for the classes in the report, matched to
            the class labels in sorted order (the first name is shown for the
            smallest label). The default keeps the original naming.
            NOTE(review): with labels produced by LabelBinarizer, 0 is
            presumably the alphabetically first class - confirm that class 0
            really is 'positive', otherwise the two names are swapped.

    Returns:
        None. Everything is written to stdout.
    """
    observed = set(np.ravel(labels).tolist()) | set(np.ravel(predictions).tolist())
    # For binarized 0/1 data always report both classes. Without an explicit
    # label list a split that happens to contain a single class (easy with a
    # tiny validation set) makes classification_report raise, because the
    # number of inferred classes no longer matches len(target_names).
    # Any other label set keeps scikit-learn's own inference (labels=None).
    class_labels = [0, 1] if observed <= {0, 1} else None

    print("Report Classification: \n",
          classification_report(labels, predictions, labels=class_labels,
                                target_names=list(target_names)))
    print("Matrix Confusion: \n", confusion_matrix(labels, predictions, labels=class_labels))
    print("Accuracy: \n", accuracy_score(labels, predictions))

### read_data

In [5]:
# Read the labelled comments; only the first 10 rows of the file are kept.
data_set = pd.read_csv('dataset2.csv').head(10)

# Inputs are the raw comment texts, targets the sentiment labels.
X, Y = data_set['comment'], data_set['sentiment']

# Encode the sentiment labels with a LabelBinarizer and flatten the result to 1-D.
label_binarizer = LabelBinarizer()
Y = np.ravel(label_binarizer.fit_transform(Y))

### pre-processing functions

In [6]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=5,
)

In [7]:
# English stop words from two sources: the stop_words package entries come
# first, followed by the NLTK list (entries present in both are kept twice).
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]

def lemmatize(text):
    """Lemmatize a sequence of tokens and return them as one string.

    Each token is POS-tagged; a token whose tag contains the letter 'v'
    (case-insensitive) is lemmatized as a verb, every other token as a noun.

    Args:
        text: Sequence of word tokens (it is passed straight to ``pos_tag``).

    Returns:
        The lemmas joined by single spaces; an empty string for no tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, pos='v' if 'v' in tag.lower() else 'n')
        for token, tag in pos_tag(text)
    ]
    return ' '.join(lemmas)

def clean(text):
    """Normalise one raw comment and return its list of content tokens.

    Pipeline: lowercase, drop the ``_x000d_`` export artefact, expand a few
    apostrophe-less contractions, remove digits, replace non-ASCII characters
    and punctuation with spaces, tokenize, lemmatize, then drop stop words and
    one-character tokens.

    Args:
        text: Raw comment; any value is accepted and coerced with ``str()``.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    text = str(text).lower()

    # "_x000D_" is presumably the spreadsheet/OOXML escape for a carriage
    # return. It has to be removed as a whole token and *before* digits are
    # stripped (afterwards only "_xd_" would be left). The previous pattern
    # used the character class [_x000D_], which matched single characters and
    # therefore deleted every "x" in the text ("excellent" -> "e cellent").
    text = text.replace('_x000d_', ' ')

    # Contractions written without an apostrophe.
    contractions = (
        (r'\bid\b', 'i would'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bthats\b', 'that is'),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    text = re.sub('[0-9]+', '', text)  # delete numbers
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ascii
    # Punctuation and line breaks become spaces. "_" stays in the class so
    # underscores are still treated as separators, as they were before. The
    # non-ASCII characters the old pattern listed are already gone by now.
    text = re.sub(r'[<>{}=~.,:!?\-()\[\]#/@"_]+|[\r\n]', ' ', text)

    word_list = lemmatize(nltk.word_tokenize(text)).split()
    # Keep only non-stop-word tokens longer than one character.
    return [w for w in word_list if w not in stop_words and len(w) > 1]


# W2V

In [8]:
def _mean_word_vectors(tokenized_sentences, keyed_vectors):
    """Return one feature vector per sentence: the mean of its word vectors.

    Tokens missing from the embedding vocabulary are skipped. A sentence with
    no known token gets an all-zero vector of the embedding size, so every row
    has the same length and no NaN reaches the classifier.
    """
    features = []
    for tokens in tokenized_sentences:
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if known:
            features.append(np.mean(known, axis=0))
        else:
            features.append(np.zeros(keyed_vectors.vector_size, dtype=np.float32))
    return features


# Clean every comment as ONE document. Joining the comments with '\n' and
# splitting again (as before) yields extra "sentences" whenever a comment
# itself contains a line break, which shifts the features against the labels.
corpus_text_train = '\n'.join(map(str, X_train))  # kept only as a plain-text view of the training corpus
sentences_train = [clean(comment) for comment in X_train]

# The embedding is fitted on the training comments only.
model = Word2Vec(sentences_train, window=5, min_count=3, workers=4)
vectors = model.wv

w2v_sentences_train = _mean_word_vectors(sentences_train, vectors)
assert len(w2v_sentences_train) == len(Y_train), "features and labels are misaligned"

# Validation comments are only transformed with the already fitted embedding.
corpus_text_val = '\n'.join(map(str, X_val))
sentences_val = [clean(comment) for comment in X_val]

w2v_sentences_val = _mean_word_vectors(sentences_val, vectors)
assert len(w2v_sentences_val) == len(Y_val), "features and labels are misaligned"

## MLP: W2V

In [9]:
# Candidate values for every MLP hyper-parameter explored by the sweep.
parameter_space = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (10, 30, 10), (20,), (50,), (100,), (150,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05, 0.1],
    learning_rate=['constant', 'adaptive'],
)

# Initial "best" configuration: the last candidate of each hyper-parameter.
best_param = {name: candidates[-1] for name, candidates in parameter_space.items()}

# Best validation accuracy found so far.
best_score = 0

In [10]:
# Exhaustive sweep over every combination in parameter_space. The combinations
# are visited in the same order as nested loops over the dict's keys (first key
# outermost), and only a strictly better score replaces the current best.
# NOTE(review): configurations are ranked on the validation split itself, so
# best_score is an optimistic estimate; a separate test split would be needed
# for an unbiased figure.
for combo in itertools.product(*parameter_space.values()):
    candidate = dict(zip(parameter_space, combo))
    # Fixed seed (same value as the train/validation split) so the classifier's
    # random initialisation is not a source of run-to-run variation.
    clf = MLPClassifier(max_iter=100, random_state=5, **candidate)
    clf.fit(w2v_sentences_train, Y_train)
    score = accuracy_score(Y_val, clf.predict(w2v_sentences_val))

    if score > best_score:
        best_score = score
        best_param.update(candidate)
print(best_score)
print(best_param)

1.0
{'hidden_layer_sizes': (50, 50, 50), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0001, 'learning_rate': 'constant'}


## Fine Tune on previous model
- In scikit-learn, an SVM can only be fine-tuned when it uses a linear kernel.
- We therefore chose Logistic Regression and, to see how much fine-tuning helps, apply it to the Word2Vec features. The `warm_start` parameter was enabled on the pre-tuned model, which is loaded here and trained further.

In [11]:
# Load the previously trained ("pre-tuned") estimator from disk.
# NOTE: unpickling can execute arbitrary code stored in the file - only load
# pickles that were produced by a trusted run of this project.
filename = 'pretune_LR.pkl'
with open(filename, 'rb') as model_file:  # the handle is closed as soon as loading finishes
    loaded_clf = pickle.load(model_file)

In [12]:
# Fit the estimator restored from the pickle on the Word2Vec training features.
# NOTE(review): the recorded output of this cell shows a GridSearchCV wrapping
# SVC, not a logistic regression - confirm that the intended file was loaded.
# Presumably fit() then re-runs the search from scratch instead of continuing
# from previously learned weights - TODO confirm.
loaded_clf.fit(w2v_sentences_train, Y_train)

GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10], 'kernel': ['poly', 'rbf']}],
             scoring='accuracy')

In [None]:
# Print the classification report, confusion matrix and accuracy of the
# re-fitted estimator on the validation split.
analysis(Y_val, loaded_clf.predict(w2v_sentences_val))