In [12]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy
from scipy import sparse
import _pickle as cPickle
from datetime import datetime
import time
from itertools import product 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegressionCV
from scipy.stats import uniform
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# ---------------------------------------------------------------------------
# Shared experiment configuration: estimator settings, the hyper-parameter
# candidates sampled by RandomizedSearchCV, and the registry of estimators
# looked up by short algorithm key.
# ---------------------------------------------------------------------------

# class_weight candidates offered to every estimator that accepts the option.
class_weight = ['balanced', None]
# class_weight = [None]

n_jobs = 1          # passed to every estimator below that takes n_jobs
random_state = 42   # passed to every estimator below that takes random_state

rf_random_grid = {'bootstrap': [True, False],
                  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                  # 'sqrt' is what the legacy 'auto' alias resolved to for
                  # RandomForestClassifier. 'auto' was deprecated in
                  # scikit-learn 1.1 and removed in 1.3, where every candidate
                  # that sampled it would fail to fit.
                  'max_features': ['sqrt', 'log2', None],
                  'min_samples_leaf': [1, 2, 4],
                  'min_samples_split': [2, 5, 10],
                  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
                  'class_weight': class_weight+["balanced_subsample"]}

svc_random_grid = {'C': np.logspace(-3, 2, 6),
                   'gamma': ['auto', 'scale'],
                   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                   'class_weight' : class_weight}

# NOTE(review): the "log" loss was renamed "log_loss" in scikit-learn 1.1 and
# the old spelling removed in 1.3. It is left as-is because "log_loss" is not
# accepted by older releases -- switch once the minimum version is confirmed.
sgd_random_grid = {"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
                   "penalty": ["l1", "l2", "elasticnet"],
                   "l1_ratio": 0.2*np.arange(0,6),
                   'class_weight' : class_weight}

knn_random_grid = {"leaf_size" : list(range(1,50)),
                   "n_neighbors" : list(range(1,35)),
                   "p": [1,2]}

# NOTE(review): penalty='none' (string) was deprecated in scikit-learn 1.2 in
# favour of penalty=None. Left as-is for the same compatibility reason as the
# SGD "log" loss above -- confirm the target version before changing.
lr_random_grid = {'C' : np.logspace(-3, 2, 6),
                  'penalty' : ['l2', 'none'],
                  'solver' : ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'class_weight' : class_weight}

# lr_random_grid_2 = {'C' : np.logspace(-3, 2, 6),
#                   'penalty' : ['l1', 'l2'],
#                   'solver' : ['saga', 'liblinear'],
#                   'class_weight' : class_weight}

# Registry keyed by short algorithm name. Each entry may hold:
#   "clf"             - estimator with default hyper-parameters
#   "random_grid"     - candidate values for RandomizedSearchCV
#   "clf_with_params" - estimator for hand-specified hyper-parameters
#                       (currently constructed identically to "clf")
# "mnb" and "gnb" define only "clf", so they have no search grid and no
# "clf_with_params" entry.
classifiers = {
    "mnb" : {"clf" : MultinomialNB()},
    "gnb" : {"clf" : GaussianNB()},
    "lr" : {
        "clf" : LogisticRegression(n_jobs=n_jobs, random_state=random_state),
        "random_grid" : lr_random_grid,
        "clf_with_params" : LogisticRegression(n_jobs=n_jobs, random_state=random_state),
    },
    "sgd" : {
        "clf" : SGDClassifier(n_jobs=n_jobs, random_state=random_state),
        "random_grid" : sgd_random_grid,
        "clf_with_params" : SGDClassifier(n_jobs=n_jobs, random_state=random_state),
    },
    "svc" : {
        "clf" : SVC(random_state=random_state),
        "random_grid" : svc_random_grid,
        "clf_with_params" : SVC(random_state=random_state),
    },
    "rf" : {
        "clf" : RandomForestClassifier(n_jobs=n_jobs, random_state=random_state),
        "random_grid" : rf_random_grid,
        "clf_with_params" : RandomForestClassifier(n_jobs=n_jobs, random_state=random_state),
    },
    "knn" : {
        "clf" : KNeighborsClassifier(n_jobs=n_jobs),
        "random_grid" : knn_random_grid,
        "clf_with_params" : KNeighborsClassifier(n_jobs=n_jobs),
    },
}

In [27]:
# Column names read from each dataset as model inputs. The order matters:
# it is the column order handed to the estimators and the label order used
# on the feature-importance plot.
issue_features = [
    # --- first group -------------------------------------------------------
    'num_comments',
    'num_events',
    'commits_count',
    'is_pull_request',
    'num_of_assignees',
    'has_milestone',
    # --- cm_* / discussion columns -----------------------------------------
    'cm_mean_len',
    'time_to_discuss',
    'cm_developers_ratio',
    # --- title / body size columns -----------------------------------------
    'body_processed_len',
    'title_processed_len',
    'title_processed_words_num',
    'body_processed_words_num',
    'title_alphabet_ratio',
    'body_alphabet_ratio',
    # --- num_of_* columns --------------------------------------------------
    'num_of_codesnippets',
    'num_of_functions',
    'num_of_issues',
    'num_of_paths',
    'num_of_urls',
    # --- ft_* columns ------------------------------------------------------
    'ft_bug',
    'ft_feature',
    # --- sentiment / subjectivity / polarity columns -----------------------
    'body_sentistrenght_p',
    'title_subjectivity',
    'body_subjectivity',
    'positive_body_sentistrenght_n',
    'positive_title_polarity',
    'positive_body_polarity',
]

# Columns about the issue author (all but the last carry the author_ prefix).
user_features = [
    'author_followers',
    'author_following',
    'author_public_repos',
    'author_public_gists',
    'author_issue_counts',
    'author_github_cntrb',
    'author_repo_cntrb',
    'author_account_age',
    'numeric_association',
]

# Full predictor set: issue columns first, then author columns.
all_features = [*issue_features, *user_features]

In [25]:
def classify(dataset, algorithm="rf", param_mode="default", target_column="priority", save_importances=True):
    """Fit one classifier on one dataset, evaluate it, and save a text report.

    The rows of ``dataframes[dataset]`` with ``test_tag == 0`` are used for
    training and those with ``test_tag == 1`` for evaluation; the predictors
    are the ``all_features`` columns.

    Parameters
    ----------
    dataset : str
        Key into the module-level ``dataframes`` dict.
    algorithm : str
        Key into the module-level ``classifiers`` registry.
    param_mode : {"default", "specified", "tuned"}
        "default" uses the registry's "clf" estimator, "specified" uses
        "clf_with_params", and "tuned" wraps "clf" in a RandomizedSearchCV
        over the registry's "random_grid".
    target_column : str
        Name of the label column in the dataset.
    save_importances : bool
        When True and ``algorithm == "rf"``, also save a horizontal bar chart
        of the feature importances under ``results/images/``.

    Returns
    -------
    None
        The report is printed and written to ``results/<title>.txt``.

    Raises
    ------
    ValueError
        If ``param_mode`` is not one of the three supported values.
    KeyError
        If ``dataset`` or ``algorithm`` is unknown, or the chosen algorithm
        has no registry entry for the requested ``param_mode``.
    """
    import os  # local import: only used to create the output directories

    # Reject unknown modes before doing any work. Previously a typo fell
    # through the if/elif chain and crashed later with an unbound `model`.
    valid_modes = ("default", "specified", "tuned")
    if param_mode not in valid_modes:
        raise ValueError(f"param_mode must be one of {valid_modes}, got {param_mode!r}")

#     if dataset == "spring-framework":
#         features = issue_features
#     else:
#         features = all_features

    df = dataframes[dataset]

    # Build each row mask once instead of re-evaluating it per slice.
    train_rows = df.test_tag == 0
    test_rows = df.test_tag == 1
    X_train = df[train_rows][all_features]
    X_test = df[test_rows][all_features]
    y_train = df[train_rows][target_column]
    y_test = df[test_rows][target_column]

    title = f"{dataset}_{param_mode}_{algorithm}_{target_column}"
    report = title + ":\n"

    if param_mode == "default":
        model = classifiers[algorithm]["clf"]
    elif param_mode == "specified":
        model = classifiers[algorithm]["clf_with_params"]
    else:  # "tuned" -- guaranteed by the validation above
        model = RandomizedSearchCV(estimator=classifiers[algorithm]["clf"],
                                   param_distributions=classifiers[algorithm]["random_grid"],
                                   n_iter=100, cv=3, random_state=42, n_jobs=-1)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report += classification_report(y_test, y_pred)
    if param_mode == "tuned":
        report += "\nbestparameters:\n" + str(model.best_params_) + '\n'
    # (y_true, y_pred) is the documented argument order; accuracy itself is
    # symmetric, so the value is unchanged.
    accuracy = accuracy_score(y_test, y_pred)
    report += "\naccuracy score:" + str(accuracy) + '\n'

    # Create the output directory on first use so a fresh checkout can run.
    os.makedirs("results", exist_ok=True)
    with open(f"results/{title}.txt", "w") as f:
        f.write(report)
    print(report)

    if algorithm == "rf" and save_importances:
        # The search wrapper does not expose feature_importances_; in tuned
        # mode the fitted forest is the refit best_estimator_.
        forest = model.best_estimator_ if param_mode == "tuned" else model
        nfeatures = X_test.shape[1]
        os.makedirs("results/images", exist_ok=True)
        fig, ax = plt.subplots(dpi=300, figsize=[20, 15])
        ax.barh(range(nfeatures), forest.feature_importances_)
        ax.set_yticks(range(nfeatures))
        ax.set_yticklabels(all_features)
        fig.savefig(f"results/images/{title}")
#         fig.savefig(f"results/images/{title}.pdf")

In [7]:
# Single-repository datasets.
base_repos = [
    'elasticsearch',
    'spring-framework',
    'spring-boot',
    'okhttp',
    'RxJava',
    'guava',
    'retrofit',
]
# Cross-repository datasets (names mirror the CSV file names on disk).
corss_repos = [
    "corss_7",
    "cross_without_sf",
    "cross_without_bot",
    "cross_without_sf_el",
    "cross_without_bot_el",
]

# Read every "<name>_norm.csv" into memory, keyed by dataset name.
dataframes = dict()
for repo in [*base_repos, *corss_repos]:
    dataframes[repo] = pd.read_csv("../data/repos/final/" + f"{repo}_norm.csv")

In [None]:
# Settings shared by every run in this cell.
algorithm = "rf"
param_mode = "default"     # param_modes = ["default", "tuned", "specified"]
save_importances = False

# Each entry pairs a target column with the datasets it is evaluated on
# (columns = ["priority", "priority_per_repo"]); "priority_per_repo" is run
# on the cross-repository datasets only.
runs = [
    ("priority", base_repos + corss_repos),
    ("priority_per_repo", corss_repos),
]

for target_column, repos in runs:
    for repo in repos:
        classify(repo, algorithm, param_mode, target_column, save_importances)

print("********done********")

In [None]:
# #unnorm
# dataset = "cross-project"
# df = pd.read_csv(f"../data/repos/final/{dataset}.csv")
# reaction_time_med = df.reaction_time.median()

# df["priority"] = df.reaction_time.apply(lambda x: 2 if x<=reaction_time_med else 1 if x>reaction_time_med else 0)

# y = df.priority
# x = df[all_features]

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42, shuffle=True)

# title = f"{dataset} + {algorithm}"    
# report = title + ":\n"

# model = RandomForestClassifier(n_jobs=n_jobs, random_state=random_state, n_estimators=1000, min_samples_split=10, min_samples_leaf=2, max_features='auto', max_depth=100, class_weight='balanced', bootstrap=False)

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# report += classification_report(y_test, y_pred)
# accuracy = accuracy_score(y_pred, y_test)
# report += "\naccuracy score:" + str(accuracy) + '\n'
# print(report)

In [52]:
# df = pd.read_csv("../data/repos/final/cross-project.csv")
# df = df[~df.reaction_time.isna()]

# y = df.reaction_time
# x = df[all_features]

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42, shuffle=True)

# # X_train = df[df.test_tag == 0][all_features]
# # X_test = df[df.test_tag == 1][all_features]
# # y_train = df[df.test_tag == 0].reaction_time
# # y_test = df[df.test_tag == 1].reaction_time

# algorithm = "rf"
# RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions = classifiers[algorithm]["random_grid"], 
#                                    n_iter=100, cv=3 , random_state=42, n_jobs=4)
# model.fit(X_train, y_train)
# model.score(X_test, y_test)

In [None]:
# issue_features_total = [    
    
#     'num_comments', 'num_events', 'commits_count', 'is_pull_request', 'num_of_assignees', 'has_milestone',
    
#     'cm_mean_len', 'time_to_discuss', 'cm_developers_ratio',
    
#     'body_processed_len', 'title_processed_len', 'title_processed_words_num', 'body_processed_words_num', 
#     'title_alphabet_ratio', 'body_alphabet_ratio',
    
#     'num_of_qmark',
#     'num_of_codesnippets',
#     'num_of_functions',
#     'num_of_issues',
#     'num_of_paths',
#     'num_of_dates',
#     'num_of_times',
#     'num_of_urls',
#     'num_of_sharps',
#     'num_of_at',
#     'num_of_emails',
#     'num_of_obligations',
    
#     'ft_bug',
#     'ft_feature',   
    
#     'title_sentistrenght_p',
#     'body_sentistrenght_p',
#     'title_subjectivity',
#     'body_subjectivity',
#     'positive_body_sentistrenght_n',
#     'positive_title_sentistrenght_n',
#     'positive_title_polarity',
#     'positive_body_polarity',
# ]

# issue_features_2 = [    
    
#     'num_comments', 'num_events', 'commits_count', 'is_pull_request', 'num_of_assignees', 'has_milestone',
    
#     'cm_mean_len', 'time_to_discuss', 'cm_developers_ratio',
    
#     'body_processed_len', 'title_processed_len', 'title_processed_words_num', 'body_processed_words_num', 
#     'title_alphabet_ratio', 'body_alphabet_ratio',
    
#     'num_of_codesnippets',
#     'num_of_urls',
    
#     'ft_bug',
#     'ft_feature',   
    
#     'body_sentistrenght_p',
#     'body_subjectivity',
#     'positive_body_sentistrenght_n',
#     'positive_body_polarity',
# ]

# user_features_total = [
#     'author_followers', 'closer_followers', 'author_following', 'closer_following', 'author_public_repos', 'closer_public_repos', 
#     'author_public_gists', 'closer_public_gists', 'author_issue_counts',
#     'author_github_cntrb', 'closer_github_cntrb', 'author_repo_cntrb', 'closer_repo_cntrb', 'author_account_age',
#     'closer_account_age', 'numeric_association'
# ]