In [None]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy
from scipy import sparse
import _pickle as cPickle
from datetime import datetime
import time
from itertools import product 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegressionCV
from scipy.stats import uniform
from datetime import datetime

In [None]:
def get_features(mode, file_postfix):
    """Build the train and test feature matrices for one feature mode.

    Reads the notebook-level globals ``df``, ``common_columns`` and (for
    ``"ft2stage"``) ``ft_columns``. Rows with ``test_tag == 0`` go to the
    train split, rows with ``test_tag == 1`` to the test split.

    Parameters
    ----------
    mode : str
        ``"d2v"`` or ``"tfidf"``: load the pickled title/body vectors from
        ``data/vectors/{mode}_{file_postfix}_{section}_{column}`` and join
        them column-wise with the ``common_columns`` of ``df``.
        ``"ft2stage"``: use ``common_columns + ft_columns`` of ``df`` only.
    file_postfix : str
        Middle part of the vector file names (unused for ``"ft2stage"``).

    Returns
    -------
    tuple
        ``(x_train, x_test)``. For an unrecognised ``mode`` both are empty
        lists, as before.
    """
    x_train = []
    x_test = []

    if mode in ("d2v", "tfidf"):
        train_other_features = df[df.test_tag == 0][common_columns]
        test_other_features = df[df.test_tag == 1][common_columns]

        vectors = {}
        for section, column in product(["train", "test"], ["title", "body"]):
            # NOTE(security): unpickling can execute arbitrary code; only
            # load vector files that were produced by a trusted pipeline.
            with open(f"data/vectors/{mode}_{file_postfix}_{section}_{column}", 'rb') as f:
                vectors[f"{section}_{column}"] = cPickle.load(f)

        if mode == "d2v":
            # Single column-wise concatenation: title | body | other features.
            x_train = np.concatenate(
                (vectors["train_title"], vectors["train_body"], train_other_features), axis=1)
            x_test = np.concatenate(
                (vectors["test_title"], vectors["test_body"], test_other_features), axis=1)
        else:  # mode == "tfidf"
            x_train = sparse.hstack((vectors["train_title"], vectors["train_body"], train_other_features))
            x_test = sparse.hstack((vectors["test_title"], vectors["test_body"], test_other_features))

    # Dispatch on the `mode` argument; the original tested the global
    # `feature_mode` here, so the result depended on notebook state.
    elif mode == "ft2stage":
        x_train = df[df.test_tag == 0][common_columns + ft_columns]
        x_test = df[df.test_tag == 1][common_columns + ft_columns]

    return x_train, x_test

def classify(algorithm, param_mode):
    """Fit one classifier, evaluate it on the test split and save a report.

    Reads the notebook-level globals ``classifiers``, ``feature_mode``,
    ``file_postfix``, ``n_jobs``, ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. The report (classification report, accuracy, duration and,
    for ``"tuned"``, the best parameters) is printed and written to
    ``results/{title}.txt``.

    Parameters
    ----------
    algorithm : str
        Key into ``classifiers``.
    param_mode : str
        ``"default"``   - use ``classifiers[algorithm]["clf"]``.
        ``"specified"`` - use ``classifiers[algorithm]["clf_with_params"]``.
        ``"tuned"``     - wrap ``"clf"`` in a ``RandomizedSearchCV`` over
        ``classifiers[algorithm]["random_grid"]``.

    Raises
    ------
    ValueError
        If ``param_mode`` is not one of the three modes above. Previously a
        misspelled mode silently started the randomized search.
    KeyError
        If the ``classifiers`` entry lacks the key the chosen mode needs.
    """
    import os  # local import: only needed to create the results directory

    valid_param_modes = ("default", "specified", "tuned")
    if param_mode not in valid_param_modes:
        raise ValueError(
            f"unknown param_mode {param_mode!r}; expected one of {valid_param_modes}")

    start_time = datetime.now()

    title = f"{param_mode} {algorithm} + {feature_mode} {file_postfix[feature_mode]}"
    report = title.strip() + ":\n"

    if param_mode == "default":
        model = classifiers[algorithm]["clf"]
    elif param_mode == "specified":
        model = classifiers[algorithm]["clf_with_params"]
    else:  # "tuned"
        model = RandomizedSearchCV(estimator=classifiers[algorithm]["clf"],
                                   param_distributions=classifiers[algorithm]["random_grid"],
                                   n_iter=100, verbose=2, cv=3, random_state=42, n_jobs=n_jobs)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    report += classification_report(y_test, y_pred)

    if param_mode == "tuned":
        report += "\nbestparameters:\n" + str(model.best_params_) + '\n'

    # Documented argument order is (y_true, y_pred); the score is the same.
    accuracyScore = accuracy_score(y_test, y_pred)
    report += "\naccuracy score:" + str(accuracyScore) + '\n'

    report += "\n\nduration: " + str(datetime.now() - start_time)

    print(report)

    # Make sure the output directory exists so the report is not lost after
    # a long fit.
    os.makedirs("results", exist_ok=True)
    with open(f"results/{title}.txt", "w") as f:
        f.write(report)

    print("duration: " + str(datetime.now() - start_time))

In [None]:
# Shared settings for every estimator and search space in this notebook.
# class_weight = ['balanced', None]
class_weight = [None]

n_jobs = 1
random_state = 42

# Search spaces sampled by RandomizedSearchCV in classify() when
# param_mode == "tuned".
# 'sqrt' replaces the former 'auto': for classifiers the two are the same
# setting, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where candidates using it would fail to fit.
rf_random_grid = {'bootstrap': [True, False],
                  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                  'max_features': ['sqrt', 'log2', None],
                  'min_samples_leaf': [1, 2, 4],
                  'min_samples_split': [2, 5, 10],
                  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
                  'class_weight': class_weight+["balanced_subsample"]}

svc_random_grid = {'C': np.logspace(-3, 2, 6), 
                   'gamma': ['auto', 'scale'],
                   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                   'class_weight' : class_weight}

# NOTE(review): the "log" loss was renamed "log_loss" in scikit-learn 1.1 and
# the old name removed in 1.3 - update this entry to match the installed version.
sgd_random_grid = {"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
                   "penalty": ["l1", "l2", "elasticnet"],
                   "l1_ratio": 0.2*np.arange(0,6),
                   'class_weight' : class_weight}

knn_random_grid = {"leaf_size" : list(range(1,50)),
                   "n_neighbors" : list(range(1,35)),
                   "p": [1,2]}

# NOTE(review): the string 'none' was replaced by None in scikit-learn 1.2 and
# removed in 1.4 - update this entry to match the installed version.
lr_random_grid = {'C' : np.logspace(-3, 2, 6),
                  'penalty' : ['l2', 'none'],
                  'solver' : ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'class_weight' : class_weight}

# Registry consumed by classify():
#   "clf"             - estimator for param_mode "default" (and the base
#                       estimator of the randomized search),
#   "random_grid"     - search space for param_mode "tuned",
#   "clf_with_params" - estimator for param_mode "specified" (currently
#                       constructed with the same arguments as "clf").
# "mnb" and "gnb" define only "clf", so they can only be run with "default".
classifiers = {
    "mnb" : {"clf" : MultinomialNB()},
    "gnb" : {"clf" : GaussianNB()},
    "lr" : {"clf" : LogisticRegression(n_jobs=n_jobs, random_state=random_state), "random_grid" : lr_random_grid, "clf_with_params" : LogisticRegression(n_jobs=n_jobs, random_state=random_state)},
    "sgd" : {"clf" : SGDClassifier(n_jobs=n_jobs, random_state=random_state), "random_grid" : sgd_random_grid, "clf_with_params" : SGDClassifier(n_jobs=n_jobs, random_state=random_state)},
    "svc" : {"clf" : SVC(random_state=random_state), "random_grid" : svc_random_grid, "clf_with_params" : SVC(random_state=random_state)},    
    "rf" : {"clf" : RandomForestClassifier(n_jobs=n_jobs, random_state=random_state), "random_grid" : rf_random_grid, "clf_with_params" : RandomForestClassifier(n_jobs=n_jobs, random_state=random_state)},
    "knn" : {"clf" : KNeighborsClassifier(n_jobs=n_jobs), "random_grid" : knn_random_grid, "clf_with_params" : KNeighborsClassifier(n_jobs=n_jobs)}
}

In [None]:
# Columns of `df` that get_features() uses in every feature mode. The list is
# assembled from name-based groups; the resulting order is significant because
# it fixes the column order of the feature matrices.
common_columns = (
    # general per-row columns
    [
        "comments",
        "is_pull_request",
        "has_milestone",
        "num_of_assignees",
        "reaction_total_count",
        "numeric_association",
    ]
    # num_of_* counters
    + [
        "num_of_sharps",
        "num_of_at",
        "num_of_codesnippets",
        "num_of_functions",
        "num_of_issues",
        "num_of_paths",
        "num_of_dates",
        "num_of_times",
        "num_of_urls",
        "num_of_emails",
        "num_of_obligations",
        "num_of_qmark",
    ]
    # title/body length, word-count and alphabet-ratio columns
    + [
        "title_lem_len",
        "title_lem_words_num",
        "body_lem_len",
        "body_lem_words_num",
        "title_alphabet_ratio",
        "body_alphabet_ratio",
    ]
    # title/body sentistrenght, subjectivity and polarity columns
    # (spelling matches the column names as they are written in this notebook)
    + [
        "title_sentistrenght_p",
        "body_sentistrenght_p",
        "title_subjectivity",
        "body_subjectivity",
        "positive_body_sentistrenght_n",
        "positive_title_sentistrenght_n",
        "positive_title_polarity",
        "positive_body_polarity",
    ]
)

# Extra columns appended to common_columns by the "ft2stage" feature mode.
ft_columns = ["ft_bug", "ft_feature"]

In [None]:
# Run configuration - edit these values, then run the cells below.

# Stem of the CSV file read from data/.
dataset_name = "normdf_nontext_columns"

# Key into `classifiers`: "mnb", "gnb", "lr", "sgd", "svc", "rf" or "knn".
algorithm_name = "rf"

# One of "default", "tuned", "specified".
param_mode = "tuned"

# One of "d2v", "tfidf", "ft2stage".
feature_mode = "tfidf"

# Per-feature-mode postfix used in the vector file names and in the report title.
file_postfix = {
    "tfidf": "processed",
    "d2v": "500-500_proc-lem",
    "ft2stage": "",
}

In [None]:
# Load the dataset selected in the configuration cell.
df = pd.read_csv("data/" + dataset_name + ".csv")

# Labels: rows with test_tag == 0 form the train split, test_tag == 1 the test split.
y_train = df.loc[df.test_tag == 0, "label_cat"]
y_test = df.loc[df.test_tag == 1, "label_cat"]

# Feature matrices for the configured feature mode.
x_train, x_test = get_features(
    feature_mode,
    file_postfix=file_postfix[feature_mode],
)

# Fit, evaluate and write the report for the configured algorithm.
classify(algorithm_name, param_mode)
print("******************done******************")