In [None]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pprint
import numpy as np
from sklearn.impute import SimpleImputer
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost
from gensim.models.word2vec import Word2Vec
from ipynb.fs.defs.helper import fit_predict_evaluate, model_cv, TextSelector, NumberSelector, DenseTransformer, check_pos_tag, MeanEmbeddingVectorizer, TfidfEmbeddingVectorizer

In [None]:
# Candidate classifiers evaluated against every feature set.
# Every estimator that accepts `random_state` is seeded with the same value so
# that repeated runs of the notebook produce comparable rankings; previously only
# LogisticRegression, AdaBoost and GradientBoosting were seeded.
SEED = 42

models = [
            MultinomialNB(),
            GaussianNB(),
            BernoulliNB(),
            LinearSVC(random_state=SEED),
            LogisticRegression(solver='liblinear', random_state=SEED, max_iter=100000),
            DecisionTreeClassifier(random_state=SEED),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=SEED),
            xgboost.XGBClassifier(random_state=SEED),
            ExtraTreesClassifier(n_estimators=200, random_state=SEED),
            # Majority vote over fresh instances of the simple classifiers above.
            VotingClassifier(estimators=[
                ('lr', LogisticRegression(solver='liblinear', random_state=SEED, max_iter=100000)),
                ('dt', DecisionTreeClassifier(random_state=SEED)),
                ('lsvc', LinearSVC(random_state=SEED)),
                ('knn', KNeighborsClassifier()),
                ('mnb', MultinomialNB()),
                ('nb', GaussianNB()),
                ('bnb', BernoulliNB()),
            ], voting='hard'),
            AdaBoostClassifier(random_state=SEED),
            GradientBoostingClassifier(learning_rate=0.01, random_state=SEED),
        ]

# Data files and class labels for each review category.
# The "*_train"/"*_test" entries are JSON file names; "label"/"not_label" are the
# positive/negative class names assigned to rows loaded from those files.
user_reviews = dict(
    bug=dict(
        data_train='Bug_Report_Data_Train.json',
        not_data_train='Not_Bug_Report_Data_Train.json',
        data_test='Bug_Report_Data_Test.json',
        not_data_test='Not_Bug_Report_Data_Test.json',
        label='Bug',
        not_label='Not Bug',
    ),
    feature=dict(
        data_train='Feature_OR_Improvment_Request_Data_Train.json',
        not_data_train='Not_Feature_OR_Improvment_Request_Data_Train.json',
        data_test='Feature_OR_Improvment_Request_Data_Test.json',
        not_data_test='Not_Feature_OR_Improvment_Request_Data_Test.json',
        label='Feature',
        not_label='Not Feature',
    ),
    ux=dict(
        data_train='UserExperience_Data_Train.json',
        not_data_train='Not_UserExperience_Data_Train.json',
        data_test='UserExperience_Data_Test.json',
        not_data_test='Not_UserExperience_Data_Test.json',
        label='UserExperience',
        not_label='Not UserExperience',
    ),
    rating=dict(
        data_train='Rating_Data_Train.json',
        not_data_train='Not_Rating_Data_Train.json',
        data_test='Rating_Data_Test.json',
        not_data_test='Not_Rating_Data_Test.json',
        label='Rating',
        not_label='Not Rating',
    ),
)

In [None]:
# Review category classified in this run: one of 'bug', 'feature', 'ux', 'rating'.
review_type_key = 'bug'
selected_review_type = user_reviews[review_type_key]
pprint.pprint(selected_review_type)

In [None]:
# Import data
DATA_DIR = '../RE2015_data/json_data/'


def load_json(file_name):
    """Parse one JSON file located in DATA_DIR and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(DATA_DIR + file_name, encoding='utf-8') as data_file:
        return json.load(data_file)


data_train = load_json(selected_review_type['data_train'])
not_data_train = load_json(selected_review_type['not_data_train'])
data_test = load_json(selected_review_type['data_test'])
not_data_test = load_json(selected_review_type['not_data_test'])

In [None]:
# Prepare data frame
def to_labelled_frame(loaded, label):
    """Turn loaded JSON data into a DataFrame whose 'label' column is `label` on every row."""
    frame = pd.DataFrame.from_dict(loaded, orient='columns')
    frame['label'] = label
    return frame


data_train = to_labelled_frame(data_train, selected_review_type['label'])
data_test = to_labelled_frame(data_test, selected_review_type['label'])
not_data_train = to_labelled_frame(not_data_train, selected_review_type['not_label'])
not_data_test = to_labelled_frame(not_data_test, selected_review_type['not_label'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames and, with ignore_index=True, yields the same fresh 0..n-1 index.
df_train = pd.concat([data_train, not_data_train], ignore_index=True)
df_test = pd.concat([data_test, not_data_test], ignore_index=True)

# Train and test rows combined (train first).
df = pd.concat([df_train, df_test], ignore_index=True)

# info() writes its report itself and returns None, so wrapping it in print()
# only added a stray "None" line.
df.info()

In [None]:
# Bar chart of non-null counts per column, grouped by label.
# The axes are created explicitly and handed to pandas; otherwise pandas opens
# its own figure and the sized one stays empty.
fig, ax = plt.subplots(figsize=(8, 6))
df.groupby('label').count().plot.bar(ax=ax, ylim=0)
plt.show()

# Number of rows per label (Series.value_counts replaces the deprecated pd.value_counts).
print(df['label'].value_counts())

In [None]:
# Non-null counts per column for each source frame and each combined frame.
frames_to_report = [
    ('data_train:\n', data_train),
    ('not_data_train:\n', not_data_train),
    ('data_test:\n', data_test),
    ('not_data_test:\n', not_data_test),
    ('df_train:\n', df_train),
    ('df_test:\n', df_test),
    ('df:\n', df),
]
for heading, frame in frames_to_report:
    print(heading, frame.count(), '\n')

In [None]:
# Prepare feature list
# Column groups reused by several feature sets. Each f* below is a new list
# built with `+`, so no two feature sets share a list object.
tense_columns = ['present_simple', 'present_con', 'past', 'future']
pos_columns = ['noun', 'pron', 'verb', 'adj', 'adv']

# One text column on its own.
f1 = ['comment']
f2 = ['lemmatized_comment']
f3 = ['stopwords_removal']
f4 = ['stopwords_removal_lemmatization']

# Numeric columns only.
f5 = ['rating', 'length_words']
f6 = f5 + tense_columns
f7 = f6 + ['sentiScore']

# A text column combined with numeric columns.
f8 = ['comment', 'rating', 'sentiScore']
f9 = f8 + tense_columns
f10 = ['stopwords_removal_lemmatization', 'rating', 'sentiScore'] + tense_columns

# f11-f20: the sets above, in the order listed here, with the POS columns appended.
f11, f12, f13, f14, f15, f16, f17, f18, f19, f20 = (
    base + pos_columns
    for base in (f1, f2, f3, f4, f8, f9, f10, f5, f6, f7)
)

features = [
    f1, f2, f3, f4, f5,
    f6, f7, f8, f9, f10,
    f11, f12, f13, f14, f15,
    f16, f17, f18, f19, f20,
]

In [None]:
# Prepare list of feature union
features_unions = []

# TF-IDF settings used by every text branch.
TFIDF_PARAMS = {'min_df': 3, 'ngram_range': (1, 2)}

# Kept so that any cell referring to `tfidf` directly still finds it. The text
# pipelines below each build their own vectorizer instead of sharing this one:
# a Pipeline fits its steps in place, so a shared instance would be refitted by
# every feature set and always hold the vocabulary of whichever ran last.
tfidf = TfidfVectorizer(**TFIDF_PARAMS)

# Part-of-speech count columns, in the order their branches are appended.
POS_TAGS = ('noun', 'pron', 'verb', 'adj', 'adv')


def compute_pos_text(data_frame, tag, text):
    """Fill column `tag` with `check_pos_tag(value, tag)` for every value in column `text`.

    The frame is modified in place and the same object is returned, so an
    existing `tag` column is overwritten.
    """
    data_frame[tag] = data_frame[text].apply(lambda value: check_pos_tag(value, tag))
    return data_frame


def _text_branch(column):
    """Select a text column, TF-IDF vectorize it with a fresh vectorizer, then densify."""
    return Pipeline([
        ('selector', TextSelector(key=column)),
        ('tfidf', TfidfVectorizer(**TFIDF_PARAMS)),
        ('dense', DenseTransformer()),
    ])


def _rating_branch(column):
    """Select a numeric column and mean-impute missing values."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('imp', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ])


def _scaled_branch(column):
    """Select a numeric column and apply StandardScaler followed by MinMaxScaler."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('scaler1', StandardScaler()),
        ('scaler2', MinMaxScaler()),
    ])


def _length_branch(column):
    """Select a numeric column, median-impute it, then standardize and min-max scale."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('imp', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('scaler', StandardScaler()),
        ('scaler2', MinMaxScaler()),
    ])


def _sentiment_branch(column):
    """Select a numeric column and min-max scale it."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('scaler', MinMaxScaler()),
    ])


# (column that must appear in the feature list, branch name in the union, builder).
# The order here is the order of the branches inside each FeatureUnion.
FEATURE_BRANCHES = [
    ('comment', 'comments', _text_branch),
    ('lemmatized_comment', 'lemmatized_comments', _text_branch),
    ('stopwords_removal', 'stopwords_removals', _text_branch),
    ('stopwords_removal_lemmatization', 'stopwords_removal_lemmatizations', _text_branch),
    ('rating', 'ratings', _rating_branch),
    ('present_simple', 'present_simples', _scaled_branch),
    ('present_con', 'present_cons', _scaled_branch),
    ('past', 'pasts', _scaled_branch),
    ('future', 'futures', _scaled_branch),
    ('length_words', 'length_words', _length_branch),
    ('sentiScore', 'sentiments', _sentiment_branch),
]

for feature in features:
    # `column in feature` is list membership, so only exact column names match.
    pipeline_tuples = [
        (branch_name, build_branch(column))
        for column, branch_name, build_branch in FEATURE_BRANCHES
        if column in feature
    ]

    # Part of Speech Tags (feature[0] must be text)
    # NOTE(review): these columns are rewritten for every feature set that uses
    # them, so once this loop ends they hold the values derived from the last
    # such set's feature[0]. Some feature sets start with 'rating', not a text
    # column; confirm check_pos_tag handles that as intended.
    for tag in POS_TAGS:
        if tag in feature:
            df_train = compute_pos_text(df_train, tag, feature[0])
            df_test = compute_pos_text(df_test, tag, feature[0])
            df = compute_pos_text(df, tag, feature[0])

            pipeline_tuples.append((tag, _scaled_branch(tag)))

    features_unions.append((FeatureUnion(pipeline_tuples), feature))

In [None]:
# describe() output for the combined frame, printed as plain text.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Prepare targets: the 'label' column of the train, test and combined frames.
y_train, y_test, y = (frame['label'] for frame in (df_train, df_test, df))

In [None]:
# Accumulates (accuracy, description) tuples, one per evaluation.
accuracy_rank = list()

In [None]:
# Train and evaluate models
pos_tag_columns = ('noun', 'pron', 'verb', 'adj', 'adv')

for feats, feature_list in features_unions:
    # The POS count columns are derived from the column named first in a feature
    # list, and they were overwritten for every feature set while the unions were
    # built. At this point they therefore reflect only the last feature set.
    # Recompute them here so each feature set is trained and evaluated on counts
    # derived from its own first column.
    # NOTE(review): f18-f20 start with 'rating' rather than a text column, so
    # their POS counts are computed from 'rating'; confirm that is intended.
    for tag in pos_tag_columns:
        if tag in feature_list:
            df_train = compute_pos_text(df_train, tag, feature_list[0])
            df_test = compute_pos_text(df_test, tag, feature_list[0])
            df = compute_pos_text(df, tag, feature_list[0])

    # Selecting with a list of columns yields new frames, so later recomputation
    # of the POS columns does not affect these.
    X_train = df_train[feature_list]
    X_test = df_test[feature_list]
    X = df[feature_list]

    for model in models:
        pipeline = Pipeline([
                            ('features', feats),
                            ('classifier', model),
                            ])

        # Hold-out evaluation: fit on the train split, score on the test split.
        desc = '%s/%s + %s' % (selected_review_type['label'], str(feature_list), model.__class__.__name__)
        acc = fit_predict_evaluate(pipeline, X_train, X_test, y_train, y_test, desc)
        accuracy_rank.append((acc, desc))

        # Evaluation via model_cv on the combined train+test data (first argument 5).
        desc_cv = '%s/%s + CV + %s' % (selected_review_type['label'], str(feature_list), model.__class__.__name__)
        acc_cv = model_cv(5, X, y, pipeline, description=desc_cv, category=selected_review_type['label'])
        accuracy_rank.append((acc_cv, desc_cv))

In [None]:
# Write sorted results to text file.
rank_path = 'results/Rank/' + selected_review_type['label'] + '_common_rank_output.txt'

# Highest accuracy first; the first element of each entry is the accuracy.
sorted_accuracy_rank = sorted(accuracy_rank, key=lambda entry: entry[0], reverse=True)

# The context manager closes the file even if formatting or writing fails part-way.
with open(rank_path, 'w') as text_file:
    for item in sorted_accuracy_rank:
        text = '%f , %s' % (item[0], item[1])
        print(text)
        text_file.write(text + '\n')