In [None]:
import pandas as pd
import json
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pprint
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
# import xgboost
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, StratifiedKFold
import nltk
from sklearn.metrics import classification_report, accuracy_score
from ipynb.fs.defs.helper import report_to_csv, export_model

In [None]:
# Prepare candidate models
# One seed shared by every estimator that accepts `random_state`, so repeated
# runs of the notebook produce the same scores.
RANDOM_STATE = 42

# Classifiers that are each fitted and scored on every text representation.
models = [
    GaussianNB(),
    BernoulliNB(),
    LinearSVC(random_state=RANDOM_STATE),
    LogisticRegression(solver='liblinear', random_state=RANDOM_STATE, max_iter=100000),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=RANDOM_STATE),
    ExtraTreesClassifier(n_estimators=200, random_state=RANDOM_STATE),
    # Majority-vote ensemble over the simple base learners listed above.
    VotingClassifier(estimators=[
        ('lr', LogisticRegression(solver='liblinear', random_state=RANDOM_STATE, max_iter=100000)),
        ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE)),
        ('lsvc', LinearSVC(random_state=RANDOM_STATE)),
        ('knn', KNeighborsClassifier()),
        ('nb', GaussianNB()),
        ('bnb', BernoulliNB()),
    ], voting='hard'),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(learning_rate=0.01, random_state=RANDOM_STATE),
]

def _review_spec(file_stem, label):
    """Return the file names and class labels describing one review category.

    file_stem -- common part of the four JSON file names of the category.
    label     -- name of the positive class; the negative class is 'Not <label>'.
    """
    return {
        'data_train': file_stem + '_Data_Train.json',
        'not_data_train': 'Not_' + file_stem + '_Data_Train.json',
        'data_test': file_stem + '_Data_Test.json',
        'not_data_test': 'Not_' + file_stem + '_Data_Test.json',
        'label': label,
        'not_label': 'Not ' + label,
    }


# Review categories keyed by short name. The file stems are spelled exactly as
# the data files are named, so they must not be "corrected" here.
user_reviews = {
    'bug': _review_spec('Bug_Report', 'Bug'),
    'feature': _review_spec('Feature_OR_Improvment_Request', 'Feature'),
    'ux': _review_spec('UserExperience', 'UserExperience'),
    'rating': _review_spec('Rating', 'Rating'),
}

In [None]:
# Review category used by every following cell (data files, class labels and
# the prefix of the exported result names are all taken from this record).
selected_review_type = user_reviews['bug'] # one of: bug, feature, ux, rating

In [None]:
# Import data
# Directory (relative to the notebook) that holds the JSON review files.
DATA_DIR = '../RE2015_data/json_data/'


def _load_review_json(file_name):
    """Decode one JSON file located in DATA_DIR and return the parsed object.

    The file is opened with an explicit UTF-8 encoding so that parsing does not
    depend on the platform's default text encoding.
    """
    with open(DATA_DIR + file_name, encoding='utf-8') as data_file:
        return json.load(data_file)


# Positive / negative examples of the selected category, train and test parts.
data_train = _load_review_json(selected_review_type['data_train'])
not_data_train = _load_review_json(selected_review_type['not_data_train'])
data_test = _load_review_json(selected_review_type['data_test'])
not_data_test = _load_review_json(selected_review_type['not_data_test'])

In [None]:
# Prepare data frame
def _labelled_frame(records, label):
    """Turn the decoded JSON records into a DataFrame with a constant 'label' column."""
    frame = pd.DataFrame.from_dict(records, orient='columns')
    frame['label'] = label
    return frame


data_train = _labelled_frame(data_train, selected_review_type['label'])
data_test = _labelled_frame(data_test, selected_review_type['label'])
not_data_train = _labelled_frame(not_data_train, selected_review_type['not_label'])
not_data_test = _labelled_frame(not_data_test, selected_review_type['not_label'])

# DataFrame.append no longer exists in pandas 2.x; pd.concat with
# ignore_index=True yields the same rows in the same order with a fresh index.
df_train = pd.concat([data_train, not_data_train], ignore_index=True)
df_test = pd.concat([data_test, not_data_test], ignore_index=True)

# Full corpus: the provided train part followed by the provided test part.
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Load the pre-trained word vectors from the binary word2vec file; the path is
# relative, so the archive is looked up in the current working directory.
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
# NOTE(review): presumably this pre-computes the unit-length vectors that
# word_averaging later reads from `wv.vectors_norm`, replacing the raw vectors
# to save memory — confirm against the installed gensim version.
wv.init_sims(replace=True)

In [None]:
from itertools import islice
# Sanity check: show the 20 items obtained by iterating `wv.vocab` at
# positions 13030 through 13049 (the end index of islice is exclusive).
list(islice(wv.vocab, 13030, 13050))

In [None]:
def word_averaging(wv, words):
    """Return one fixed-size float32 vector representing a tokenised text.

    Parameters
    ----------
    wv : object exposing `vocab`, `vectors_norm` and `vector_size`
        The loaded word-vector model.
        NOTE(review): these attribute names look like the gensim 3.x
        KeyedVectors API — confirm the pinned gensim version.
    words : iterable
        Tokens of one text. A token that is already a numpy array is used
        as-is; a token found in `wv.vocab` is replaced by its row of
        `wv.vectors_norm`; every other token is ignored.

    Returns
    -------
    numpy.ndarray of shape (wv.vector_size,), dtype float32
        The mean of the collected vectors passed through
        `gensim.matutils.unitvec`, or an all-zero vector when no token
        contributed a vector.
    """
    vectors = []

    for word in words:
        # The ndarray test comes first: arrays are unhashable, so they must not
        # reach the `in wv.vocab` lookup.
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.vectors_norm[wv.vocab[word].index])

    if not vectors:
        # Same dtype as the regular path, so stacking many results never
        # changes dtype depending on whether some text had no known token.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Embed every tokenised text of `text_list` and stack the results row-wise.

    Each element of `text_list` is handed to `word_averaging`; the returned
    vectors become the rows of one 2-D array, in input order.
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

In [None]:
def w2v_tokenize_text(text):
    """Split `text` into word tokens, sentence by sentence, using NLTK.

    Tokens shorter than two characters are dropped. The result is a flat list
    of the remaining tokens in reading order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [None]:
# Disabled manual smoke test: one review in each of the four text variants
# (raw, stop words removed, lemmatised, both) and an embedding call on the last.
# comment1 = "This app serves its purpose for me perfectly except for the mobile deposit won't work. It keeps saying can't find endorsement. After calling PNC multiple times about this still no fix."
# stopwords_removal1 = "this app serves purpose for perfectly except for mobile deposit wont work keeps saying cant find endorsement after calling pnc multiple times about this still no fix"
# lemmatized_comment1 = "this app serve it purpose for me perfectly except for the mobile deposit wont work it keep say cant find endorsement after call pnc multiple time about this still no fix"
# stopwords_removal_lemmatization1 = "this app serve purpose for perfectly except for mobile deposit wont work keep say cant find endorsement after call pnc multiple time about this still no fix"

# NOTE: word_averaging_list expects a list of token lists (one per text). The
# call below passes a single flat token list, so every token would be treated
# as a separate text and iterated character by character. Wrap the tokens in a
# list, or call word_averaging directly, before re-enabling this.
# word2vec_vector = word_averaging_list(wv, w2v_tokenize_text(stopwords_removal_lemmatization1))
# print(word2vec_vector)

In [None]:
# Columns of `df` that are embedded and evaluated one after another.
text_features = ['comment', 'lemmatized_comment', 'stopwords_removal', 'stopwords_removal_lemmatization']

# Seeded 70/30 hold-out split of the combined data frame.
train, test = train_test_split(df, random_state=42, test_size=0.3)

In [None]:
# Collects (accuracy, description) tuples appended by the evaluation cells
# below; re-running this cell discards everything gathered so far.
accuracy_rank = []

In [None]:
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Hold-out evaluation: embed each text variant once, then fit and score every model on it.
for text in text_features:
    train_tokenized = train.apply(lambda row: w2v_tokenize_text(row[text]), axis=1).values
    test_tokenized = test.apply(lambda row: w2v_tokenize_text(row[text]), axis=1).values

    X_train_word_average = word_averaging_list(wv, train_tokenized)
    X_test_word_average = word_averaging_list(wv, test_tokenized)

    for model in models:
        model_name = model.__class__.__name__
        print(model_name)
        print(text)

        model.fit(X_train_word_average, train['label'])
        y_pred = model.predict(X_test_word_average)

        accuracy = accuracy_score(test['label'], y_pred)
        print('accuracy %s\n' % accuracy)

        # The description doubles as ranking entry and as name for the exports.
        description = '%s/word2vec + %s + %s' % (selected_review_type['label'], text, model_name)
        accuracy_rank.append((accuracy, description))

        # Export model
        export_model(model, file_name=description)

        # Per-class metrics as a dict; the 'accuracy' entry is set to a
        # one-key dict before the report is handed to the CSV helper.
        report = classification_report(test['label'], y_pred, output_dict=True)
        report['accuracy'] = {' ': accuracy}
        report_to_csv(report, description)

In [None]:
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Word2Vec with cross validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = df[text_features]
# 1-D label vector (a Series), which is what fit() and accuracy_score expect.
y = df['label']

# Materialise the folds once so every model / text combination is scored on
# exactly the same partitions.
folds = list(kfold.split(X, y))

# Tokenising and embedding depend only on the text column, not on the model or
# the fold, so each column is embedded once and sliced per fold afterwards.
cv_embeddings = {
    column: word_averaging_list(wv, X[column].apply(w2v_tokenize_text).values)
    for column in text_features
}

for model in models:
    for text in text_features:
        features = cv_embeddings[text]
        fold_accuracies = []

        # Index names deliberately differ from `train` / `test`, so the
        # hold-out data frames of the earlier cell are not overwritten.
        for train_idx, test_idx in folds:
            model.fit(features[train_idx], y.iloc[train_idx])
            prediction = model.predict(features[test_idx])
            fold_accuracies.append(accuracy_score(y.iloc[test_idx], prediction))

        avg_acc = np.mean(fold_accuracies)
        description = '%s/word2vec + CV + %s + %s' % (selected_review_type['label'], text, model.__class__.__name__)
        accuracy_rank.append((avg_acc, description))

        print(model.__class__.__name__)
        print(text)
        print(avg_acc)
        print()

In [None]:
# Write sorted results to text file.
# Best accuracy first; entries are (accuracy, description) tuples.
sorted_accuracy_rank = sorted(accuracy_rank, key=lambda entry: entry[0], reverse=True)

rank_path = 'results/Rank/' + selected_review_type['label'] + '_word2vec_rank_output.txt'

# The context manager closes the file even if formatting or writing fails.
with open(rank_path, 'w') as text_file:
    for item in sorted_accuracy_rank:
        line = '%f , %s' % (item[0], item[1])
        print(line)
        text_file.write(line + '\n')