In [1]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy
from scipy import sparse
import _pickle as cPickle
from datetime import datetime
import time
from itertools import product 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegressionCV
from scipy.stats import uniform
from datetime import datetime

def get_features(mode, file_postfix):
    """Build the train and test feature matrices for one feature representation.

    Relies on module-level state: ``df`` (with a ``test_tag`` column, 0 for
    train rows and 1 for test rows), ``common_columns`` and ``ft_columns``.

    Parameters
    ----------
    mode : str
        ``"d2v"`` or ``"tfidf"``: load the pickled title and body vectors from
        ``data/vectors/{mode}_{file_postfix}_{section}_{column}`` and join the
        title and body parts column-wise (``np.append(..., axis=1)`` for
        ``"d2v"``, ``sparse.hstack`` for ``"tfidf"``).
        ``"ft2stage"``: select ``common_columns + ft_columns`` from ``df``.
    file_postfix : str
        Middle part of the vector file names; not used for ``"ft2stage"``.

    Returns
    -------
    tuple
        ``(x_train, x_test)``. Both are empty lists when ``mode`` is not one
        of the recognised values.
    """
    x_train = []
    x_test = []

    if mode in ("d2v", "tfidf"):
        # NOTE(review): the previous version also sliced `common_columns` out
        # of `df` here but never used the result; those dead locals were
        # dropped. Confirm whether they were meant to be stacked onto the
        # text vectors.
        vectors = {}
        for section, column in product(["train", "test"], ["title", "body"]):
            # SECURITY: unpickling can execute arbitrary code; only load vector
            # files produced by this project.
            with open(f"data/vectors/{mode}_{file_postfix}_{section}_{column}", 'rb') as f:
                vectors[f"{section}_{column}"] = cPickle.load(f)

        if mode == "d2v":
            x_train = np.append(vectors["train_title"], vectors["train_body"], axis=1)
            x_test = np.append(vectors["test_title"], vectors["test_body"], axis=1)
        else:  # mode == "tfidf"
            x_train = sparse.hstack((vectors["train_title"], vectors["train_body"]))
            x_test = sparse.hstack((vectors["test_title"], vectors["test_body"]))

    elif mode == "ft2stage":
        # Fixed: this branch used to test the global `feature_mode` rather than
        # the `mode` argument, so the result depended on hidden notebook state.
        x_train = df[df.test_tag == 0][common_columns + ft_columns]
        x_test = df[df.test_tag == 1][common_columns + ft_columns]

    return x_train, x_test



# Settings shared by every estimator created in this notebook.
random_state = 42
n_jobs = 1

# Registry of candidate classifiers: short key -> {"clf": estimator instance}.
# classify() looks up the stacking final estimator here by key.
classifiers = {
    key: {"clf": estimator}
    for key, estimator in [
        ("mnb", MultinomialNB()),
        ("gnb", GaussianNB()),
        ("lr", LogisticRegression(random_state=random_state)),
        ("sgd", SGDClassifier(random_state=random_state)),
        ("svc", SVC(random_state=random_state)),
        ("rf", RandomForestClassifier(n_jobs=n_jobs, random_state=random_state)),
        ("knn", KNeighborsClassifier(n_jobs=n_jobs)),
    ]
}

# Non-text feature columns selected from `df`, listed group by group.
# The strings are used as DataFrame column keys, so their spelling
# (including "sentistrenght") is kept exactly as-is.
common_columns = (
    # issue-level columns
    [
        'comments',
        'is_pull_request',
        'has_milestone',
        'num_of_assignees',
        'reaction_total_count',
        'numeric_association',
    ]
    # num_of_* columns
    + [
        'num_of_sharps',
        'num_of_at',
        'num_of_codesnippets',
        'num_of_functions',
        'num_of_issues',
        'num_of_paths',
        'num_of_dates',
        'num_of_times',
        'num_of_urls',
        'num_of_emails',
        'num_of_obligations',
        'num_of_qmark',
    ]
    # title/body length and ratio columns
    + [
        'title_lem_len',
        'title_lem_words_num',
        'body_lem_len',
        'body_lem_words_num',
        'title_alphabet_ratio',
        'body_alphabet_ratio',
    ]
    # sentiment / subjectivity / polarity columns
    + [
        'title_sentistrenght_p',
        'body_sentistrenght_p',
        'title_subjectivity',
        'body_subjectivity',
        'positive_body_sentistrenght_n',
        'positive_title_sentistrenght_n',
        'positive_title_polarity',
        'positive_body_polarity',
    ]
)

# Extra columns added on top of common_columns in the "ft2stage" feature mode.
ft_columns = ['ft_bug', 'ft_feature']

# Feature mode -> file-name postfix handed to get_features().
file_postfix = {
    "tfidf": "processed",
    "d2v": "500-500_proc-lem",
    "ft2stage": "",
}

In [None]:
# Choose the dataset and the feature representation for this run.
dataset_name = "normdf_nontext_columns"
feature_mode = "tfidf"

# Load the table; `test_tag` marks the split (0 = train rows, 1 = test rows).
df = pd.read_csv(f"data/{dataset_name}.csv")

# Targets come from the `label_cat` column of each split.
y_train = df.loc[df.test_tag == 0, "label_cat"]
y_test = df.loc[df.test_tag == 1, "label_cat"]

# Feature matrices for the chosen mode, using that mode's file postfix.
x_train, x_test = get_features(
    feature_mode,
    file_postfix=file_postfix[feature_mode],
)

In [3]:
# Base learners, as (name, estimator) pairs, consumed by both the voting and
# the stacking ensembles built in classify().
estimators = [
    (
        'sgd',
        SGDClassifier(
            loss='modified_huber',
            penalty='l2',
            l1_ratio=0.0,
            class_weight=None,
            random_state=random_state,
            n_jobs=n_jobs,
        ),
    ),
    (
        'lr',
        LogisticRegression(
            solver='sag',
            penalty='l2',
            C=1.0,
            class_weight=None,
            random_state=random_state,
            n_jobs=n_jobs,
        ),
    ),
    (
        'svc',
        SVC(
            kernel='rbf',
            gamma='scale',
            C=1.0,
            class_weight=None,
            probability=True,
            random_state=random_state,
        ),
    ),
]

In [4]:
def classify(ensemble_method, voting_strategy = "soft", final_estimator = "lr"):
    """Train an ensemble, evaluate it on the test split and save a text report.

    Relies on module-level state: ``estimators``, ``classifiers``, ``n_jobs``
    and the ``x_train`` / ``y_train`` / ``x_test`` / ``y_test`` splits.

    Parameters
    ----------
    ensemble_method : str
        ``"voting"`` builds a ``VotingClassifier``; ``"stacking"`` builds a
        ``StackingClassifier``.
    voting_strategy : str, default "soft"
        Passed as ``voting=`` to ``VotingClassifier``; only used for
        ``"voting"``.
    final_estimator : str, default "lr"
        Key into ``classifiers`` selecting the stacking meta-learner; only
        used for ``"stacking"``.

    Raises
    ------
    ValueError
        If ``ensemble_method`` is neither ``"voting"`` nor ``"stacking"``.
    KeyError
        If ``final_estimator`` is not a key of ``classifiers`` (stacking only).

    Side effects
    ------------
    Prints the report and writes it to ``results/{title}_fs2.txt``.
    """
    import os  # local import: only needed to create the output directory

    # Fail fast: previously an unknown method surfaced as an unbound `model`
    # further down.
    if ensemble_method not in ("voting", "stacking"):
        raise ValueError(
            f"unknown ensemble_method {ensemble_method!r}; expected 'voting' or 'stacking'"
        )

    start_time = datetime.now()

    title = f"{ensemble_method} {voting_strategy}" if ensemble_method == "voting" else f"{ensemble_method} {final_estimator}"
    report = title.strip() + ":\n"

    if ensemble_method == "voting":
        model = VotingClassifier(estimators, voting=voting_strategy, n_jobs=n_jobs)
    else:
        model = StackingClassifier(estimators=estimators, final_estimator=classifiers[final_estimator]["clf"], n_jobs=n_jobs)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    report += classification_report(y_test, y_pred)

    # Fixed: score() needs the evaluation data; calling it without arguments
    # raised TypeError only after the (expensive) fit had finished.
    report += "\nscore:\n" + str(model.score(x_test, y_test)) + '\n'

    # scikit-learn metric signature is (y_true, y_pred).
    accuracyScore = accuracy_score(y_test, y_pred)
    report += "\naccuracy score:" + str(accuracyScore) + '\n'

    report += "\n\nduration: " + str(datetime.now() - start_time)

    print(report)

    # Make sure the output directory exists so a long run is not lost at the
    # final write.
    os.makedirs("results", exist_ok=True)
    with open(f"results/{title}_fs2.txt", "w") as f:
        f.write(report)

    print("duration: " + str(datetime.now() - start_time))

In [None]:
# Train and evaluate the soft-voting ensemble, then print a completion banner.
classify(ensemble_method="voting", voting_strategy="soft")
print("******************done******************")