In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
import pandas as pd
import json
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
# import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from ipynb.fs.defs.helper import report_to_csv, export_model

In [None]:
# Classifiers to benchmark. Each entry is fitted and scored once per text feature.

# Base estimators combined by the VotingClassifier below (voting='hard').
voting_members = [
    ('lr', LogisticRegression(solver='liblinear', random_state=42, max_iter=100000)),
    ('dt', DecisionTreeClassifier()),
    ('lsvc', LinearSVC()),
    ('knn', KNeighborsClassifier()),
    ('nb', GaussianNB()),
    ('bnb', BernoulliNB()),
]

models = [
    GaussianNB(),
    BernoulliNB(),
    LinearSVC(),
    LogisticRegression(solver='liblinear', random_state=42, max_iter=100000),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    # xgboost.XGBClassifier(),  # disabled: the xgboost import is commented out as well
    ExtraTreesClassifier(n_estimators=200),
    VotingClassifier(estimators=voting_members, voting='hard'),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(learning_rate=0.01, random_state=42),
]

# Text columns of the review data frame; each one is embedded and evaluated separately.
text_features = [
    'comment',
    'lemmatized_comment',
    'stopwords_removal',
    'stopwords_removal_lemmatization',
]

# Per review type: the JSON file names of the positive / negative train and test
# sets, plus the label strings assigned to positive and negative rows.
user_reviews = {
    'bug': {
        'data_train': 'Bug_Report_Data_Train.json',
        'not_data_train': 'Not_Bug_Report_Data_Train.json',
        'data_test': 'Bug_Report_Data_Test.json',
        'not_data_test': 'Not_Bug_Report_Data_Test.json',
        'label': 'Bug',
        'not_label': 'Not Bug',
    },
    'feature': {
        'data_train': 'Feature_OR_Improvment_Request_Data_Train.json',
        'not_data_train': 'Not_Feature_OR_Improvment_Request_Data_Train.json',
        'data_test': 'Feature_OR_Improvment_Request_Data_Test.json',
        'not_data_test': 'Not_Feature_OR_Improvment_Request_Data_Test.json',
        'label': 'Feature',
        'not_label': 'Not Feature',
    },
    'ux': {
        'data_train': 'UserExperience_Data_Train.json',
        'not_data_train': 'Not_UserExperience_Data_Train.json',
        'data_test': 'UserExperience_Data_Test.json',
        'not_data_test': 'Not_UserExperience_Data_Test.json',
        'label': 'UserExperience',
        'not_label': 'Not UserExperience',
    },
    'rating': {
        'data_train': 'Rating_Data_Train.json',
        'not_data_train': 'Not_Rating_Data_Train.json',
        'data_test': 'Rating_Data_Test.json',
        'not_data_test': 'Not_Rating_Data_Test.json',
        'label': 'Rating',
        'not_label': 'Not Rating',
    },
}

In [None]:
# Review type used for this run; switch the key to 'bug', 'feature', 'ux' or 'rating'.
selected_review_type = user_reviews['bug'] # bug, feature, ux, rating

In [None]:
# Import data
def _read_json(file_name):
    """Parse one JSON file located in ../RE2015_data/json_data/ and return its content."""
    with open('../RE2015_data/json_data/' + file_name) as data_file:
        return json.load(data_file)


# Positive and negative examples of the selected review type, train and test parts.
data_train = _read_json(selected_review_type['data_train'])
not_data_train = _read_json(selected_review_type['not_data_train'])
data_test = _read_json(selected_review_type['data_test'])
not_data_test = _read_json(selected_review_type['not_data_test'])

In [None]:
# Prepare data frame
# Turn each loaded JSON payload into a DataFrame and attach its class label:
# rows from the "data_*" files get the positive label, rows from the
# "not_data_*" files get the negative label.
data_train = pd.DataFrame.from_dict(data_train, orient='columns')
data_train['label'] = selected_review_type['label']

data_test = pd.DataFrame.from_dict(data_test, orient='columns')
data_test['label'] = selected_review_type['label']

not_data_train = pd.DataFrame.from_dict(not_data_train, orient='columns')
not_data_train['label'] = selected_review_type['not_label']

not_data_test = pd.DataFrame.from_dict(not_data_test, orient='columns')
not_data_test['label'] = selected_review_type['not_label']

# pd.concat is used instead of DataFrame.append: append was deprecated in
# pandas 1.4 and removed in pandas 2.0. ignore_index=True renumbers the rows
# 0..n-1 exactly as append(..., ignore_index=True) did.
df_train = pd.concat([data_train, not_data_train], ignore_index=True)
df_test = pd.concat([data_test, not_data_test], ignore_index=True)

# Train and test parts are merged into a single frame.
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
def label_sentences(corpus, label_type):
    """
    Wrap every text of ``corpus`` in a gensim ``TaggedDocument``, as required by Doc2Vec.

    Each text is split on whitespace with ``str.split()`` and receives exactly one tag of
    the form ``"<label_type>_<i>"``, where ``i`` is the zero-based position of the text in
    ``corpus`` (e.g. ``"Train_0"``, ``"Train_1"``, ... for ``label_type='Train'``).

    :param corpus: iterable of strings (one document per element)
    :param label_type: prefix used to build the tags
    :return: list of TaggedDocument objects, in the order of ``corpus``
    """
    return [
        TaggedDocument(sentence.split(), [label_type + '_' + str(position)])
        for position, sentence in enumerate(corpus)
    ]

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect the document vectors tagged ``"<vectors_type>_0"`` .. ``"<vectors_type>_<corpus_size - 1>"``.

    :param model: object whose ``docvecs`` attribute maps a tag to its vector
                  (here: a trained Doc2Vec model)
    :param corpus_size: number of documents to look up
    :param vectors_size: length of every embedding vector
    :param vectors_type: tag prefix, i.e. the ``label_type`` the documents were tagged with
    :return: numpy array of shape (corpus_size, vectors_size); row ``i`` holds the vector
             of tag ``"<vectors_type>_<i>"``
    """
    tags = [vectors_type + '_' + str(index) for index in range(corpus_size)]
    matrix = np.zeros((corpus_size, vectors_size))
    for row, tag in enumerate(tags):
        matrix[row] = model.docvecs[tag]
    return matrix

In [None]:
# comment1 = "This app serves its purpose for me perfectly except for the mobile deposit won't work. It keeps saying can't find endorsement. After calling PNC multiple times about this still no fix."
# stopwords_removal1 = "this app serves purpose for perfectly except for mobile deposit wont work keeps saying cant find endorsement after calling pnc multiple times about this still no fix"
# lemmatized_comment1 = "this app serve it purpose for me perfectly except for the mobile deposit wont work it keep say cant find endorsement after call pnc multiple time about this still no fix"
# stopwords_removal_lemmatization1 = "this app serve purpose for perfectly except for mobile deposit wont work keep say cant find endorsement after call pnc multiple time about this still no fix"

# train = label_sentences([stopwords_removal_lemmatization1], 'Train')

# model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
# model_dbow.build_vocab([x for x in tqdm(train)])
    
# for epoch in range(30):
#     model_dbow.train(utils.shuffle([x for x in tqdm(train)]), total_examples=len(train), epochs=1)
#     model_dbow.alpha -= 0.002
#     model_dbow.min_alpha = model_dbow.alpha

# train_vectors_dbow = get_vectors(model_dbow, len(train), 300, 'Train')

# print(train_vectors_dbow)

In [None]:
# Accumulates one (accuracy, description) tuple per evaluated text-feature / classifier pair.
accuracy_rank = []

In [None]:
# For every text column: learn Doc2Vec document vectors, then fit and score each classifier on them.
for text in text_features:
    # Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        df[text], df.label, test_size=0.3, random_state=42
    )
    X_train = label_sentences(X_train, 'Train')
    X_test = label_sentences(X_test, 'Test')

    # The embedding model is trained on the train AND test documents together, so
    # each document's vector can be looked up by its tag afterwards.
    all_data = X_train + X_test

    # dm=0 selects gensim's distributed bag-of-words (DBOW) training mode.
    model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
    model_dbow.build_vocab(list(tqdm(all_data)))

    # 30 single-epoch passes over a freshly shuffled corpus. alpha and min_alpha are
    # kept equal and lowered by hand (0.002 per pass) instead of using gensim's own decay.
    for epoch in range(30):
        shuffled_data = utils.shuffle(list(tqdm(all_data)))
        model_dbow.train(shuffled_data, total_examples=len(all_data), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    # Row i of each matrix is the vector tagged 'Train_i' / 'Test_i'.
    train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
    test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

    for model in models:
        model_name = model.__class__.__name__
        print(model_name)
        print(text)

        model.fit(train_vectors_dbow, y_train)
        y_pred = model.predict(test_vectors_dbow)

        accuracy = accuracy_score(y_test, y_pred)
        print('accuracy %s\n' % accuracy)

        description = '%s/doc2vec + %s + %s' % (selected_review_type['label'], text, model_name)
        accuracy_rank.append((accuracy, description))

        # Export model (export_model comes from the helper notebook; the description is its file name)
        export_model(model, file_name=description)

        report = classification_report(y_test, y_pred, output_dict=True)
        # The 'accuracy' entry is set to a one-item dict before export
        # (presumably the shape report_to_csv expects -- TODO confirm in helper).
        report['accuracy'] = {' ': accuracy}
        report_to_csv(report, description)

In [None]:
# Write sorted results to text file.
# Entries are (accuracy, description) tuples; highest accuracy comes first.
sorted_accuracy_rank = sorted(accuracy_rank, key=lambda entry: entry[0], reverse=True)

# NOTE: open(..., 'w') does not create directories, so results/Rank/ must already exist.
rank_path = 'results/Rank/' + selected_review_type['label'] + '_doc2vec_rank_output.txt'

# The context manager guarantees the file is flushed and closed even if a
# print or write call raises part-way through the loop.
with open(rank_path, 'w') as text_file:
    for item in sorted_accuracy_rank:
        text = '%f , %s' % (item[0], item[1])
        print(text)
        text_file.write(text + '\n')