In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from gensim import models
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
import pickle

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import pos_tag

# ! pip install stopwords
# ! pip install stop_words
# ! pip install gensim
# ! pip install python-Levenshtein
# ! pip install pickle-mixin

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('wordnet')
#! wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
#! gzip -d GoogleNews-vectors-negative300.bin.gz

In [3]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides numerical and convergence warnings raised by later cells.
warnings.filterwarnings(action='ignore')

In [4]:
def analysis(labels, predictions, target_names=('positive', 'negative')):
    """Print a classification report, confusion matrix and accuracy score.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    predictions : array-like
        Predicted class labels, aligned with ``labels``.
    target_names : sequence of str, optional
        Display names forwarded to ``classification_report``. Defaults to the
        previously hard-coded ``('positive', 'negative')`` so existing calls
        print exactly what they did before.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # NOTE(review): classification_report matches target_names to the label
    # values in sorted order. If the raw sentiment column holds the strings
    # 'negative'/'positive', LabelBinarizer encodes 'negative' as 0 and the
    # default order here would be swapped -- confirm against
    # label_binarizer.classes_ and pass target_names explicitly if so.
    print("Report Classification: \n", classification_report(labels, predictions, target_names=list(target_names)))
    print("Matrix Confusion: \n", confusion_matrix(labels, predictions))
    print("Accuracy: \n", accuracy_score(labels,predictions))

In [5]:

# Load the pretrained word2vec vectors from the binary file in the working
# directory; `w` is used later as the word -> vector lookup.
w = models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)

# Part 1

In [6]:
# Read the labelled comments from disk.
data_set = pd.read_csv('dataset.csv')

# Features: the raw comment text.
X = data_set['comment']

# Targets: binarize the sentiment column and flatten the result to 1-D.
label_binarizer = LabelBinarizer()
Y = np.ravel(label_binarizer.fit_transform(data_set['sentiment']))

# Part 2, 3

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

In [8]:
# Combined English stopword list: the `stop_words` package entries first,
# followed by NLTK's list (duplicates are kept, order is preserved).
nltk_words = list(stopwords.words('english'))
stop_words = [*get_stop_words('en'), *nltk_words]

def lemmatize(text):
    """Lemmatize a sequence of tokens and return them as one string.

    Every token is POS-tagged with ``pos_tag``. Tokens whose tag contains the
    letter "v" (case-insensitive) are lemmatized as verbs, all others as
    nouns. The lemmas are joined with single spaces; an empty token sequence
    produces an empty string.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [
        wordnet.lemmatize(token, pos='v' if 'v' in tag.lower() else 'n')
        for token, tag in pos_tag(text)
    ]
    return ' '.join(lemmas)

def clean(text):
    """Normalize one raw comment into a list of lemmatized content tokens.

    Pipeline: lowercase -> drop ``_x000d_`` markers -> expand a handful of
    apostrophe-less contractions -> strip digits, non-ASCII characters and
    punctuation -> tokenize -> lemmatize -> remove stopwords and
    single-character tokens.

    Parameters
    ----------
    text : object
        The raw comment; it is converted with ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    text = str(text).lower()  # lowercase

    # Remove the literal "_x000d_" marker (presumably the spreadsheet escape
    # for a carriage return). The previous pattern `[_x000D_]+` was a
    # character class, so it deleted every "x" and "_" in the text instead of
    # the marker. This must run before digits are stripped, otherwise the
    # marker has already turned into "_xd_".
    text = text.replace('_x000d_', ' ')

    # Expand common contractions that were typed without an apostrophe.
    text = re.sub(r'\bid\b', 'i would', text)
    text = re.sub(r'\bive\b', 'i have', text)
    text = re.sub(r'\bim\b', 'i am', text)
    text = re.sub(r'\bcant\b', 'can not', text)
    text = re.sub(r'\bdont\b', 'do not', text)
    text = re.sub(r'\bwont\b', 'will not', text)
    text = re.sub(r'\bthats\b', 'that is', text)

    text = re.sub('[0-9]+', '', text)  # delete numbers
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ascii
    # Remove punctuation. "_" stays in the class because the old pattern also
    # stripped underscores; only the accidental removal of "x" is dropped.
    text = re.sub('[<>{}=~.,،:\\!?\\-()\\[\\]#/@"_]+|\u200c+|[\r\n]', ' ', text)

    tokens = lemmatize(nltk.word_tokenize(text)).split()
    # Drop stopwords and single-character leftovers.
    return [
        token for token in tokens
        if token not in stop_words and len(token) > 1
    ]


### W2V

In [None]:
def _mean_word_vectors(documents, vectors):
    """Clean each document and average the embeddings of its known tokens.

    Parameters
    ----------
    documents : iterable
        Raw comments; each one is passed to ``clean`` individually, so exactly
        one output row is produced per input row.
    vectors : gensim KeyedVectors
        Word -> vector lookup supporting ``in``, ``[]`` and ``vector_size``.

    Returns
    -------
    tuple (list of list of str, list of numpy.ndarray)
        The cleaned token lists and, aligned with them, one mean vector per
        document. Documents without any in-vocabulary token get an all-zero
        vector instead of NaN so the feature matrix stays rectangular.
    """
    tokenized = [clean(document) for document in documents]
    averaged = []
    for tokens in tokenized:
        # Membership test instead of a bare `except: pass`, so only
        # out-of-vocabulary words are skipped and real errors still surface.
        known = [vectors[token] for token in tokens if token in vectors]
        if known:
            averaged.append(np.mean(known, axis=0))
        else:
            # float32 is gensim's default dtype for loaded word2vec vectors.
            averaged.append(np.zeros(vectors.vector_size, dtype=np.float32))
    return tokenized, averaged


vectors = w

# Iterate over the rows directly rather than joining on '\n' and splitting
# again: a comment containing a newline would otherwise become several rows
# and no longer line up with Y_train / Y_test.
sentences_train, w2v_sentences_train = _mean_word_vectors(X_train, vectors)
sentences_test, w2v_sentences_test = _mean_word_vectors(X_test, vectors)

### MLP: W2V

In [None]:
# Fixed seed (same value as the train/test split) so weight initialisation
# is repeatable and the grid search gives the same result on every run.
mlp_gs = MLPClassifier(max_iter=100, random_state=5)

# Hyper-parameters searched for every solver.
shared_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50),(10,30,10),(20,), (50,), (100,), (150,)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.05, 0.1],
}
# MLPClassifier only uses `learning_rate` when solver='sgd', so it is searched
# for sgd alone; listing it for adam as well only refits identical models.
parameter_space = [
    {**shared_grid, 'solver': ['sgd'], 'learning_rate': ['constant', 'adaptive']},
    {**shared_grid, 'solver': ['adam']},
]

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, scoring='accuracy')
clf.fit(w2v_sentences_train, Y_train)

print("-------MLP-------:")
print("Best parameters set found on development set:")
print(clf.best_params_)
print(clf.best_estimator_)

# Evaluate the refitted best estimator on the held-out test split.
Y_test_pred = clf.predict(w2v_sentences_test)
analysis(Y_test, Y_test_pred)

In [None]:
filename = 'best.pkl'
# Persist the fitted grid-search object. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)