In [1]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy
from scipy import sparse
import _pickle as cPickle
from datetime import datetime
import time
from itertools import product 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegressionCV
from scipy.stats import uniform
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import _pickle as cPickle
from scipy import sparse
from tqdm import tqdm
from sklearn import utils
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import json

In [3]:
# Postfix embedded in the run title / report file name, keyed by feature mode.
file_postfix = dict(tfidf="processed", d2v="500-500_proc-lem")

def _make_text_vectorizer(max_features, ngram_range):
    """Create an unfitted TfidfVectorizer with the settings shared by title and body text."""
    return TfidfVectorizer(
        stop_words='english',
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{2,}',  # only tokens of two or more word characters
        ngram_range=ngram_range,
        max_features=max_features)


def get_features(ngram_range=(1, 2), title_max_features=10000,
                 body_max_features=20000, column_postfix="processed"):
    """Build the train and test feature matrices from the notebook-level ``df``.

    Reads the globals ``df`` (rows are split on ``test_tag``: 0 = train,
    1 = test) and ``feature_set`` (names of the extra columns to append).
    As a side effect the two text columns of ``df`` are cast to ``str`` in
    place, exactly as before.

    Parameters
    ----------
    ngram_range : tuple of int
        N-gram range passed to both TF-IDF vectorizers.
    title_max_features, body_max_features : int
        Vocabulary caps for the title and body vectorizers.
    column_postfix : str
        The text columns used are ``title_<postfix>`` and ``body_<postfix>``.

    Returns
    -------
    (x_train, x_test)
        Sparse matrices whose columns are, in order: title TF-IDF, body
        TF-IDF, then the ``feature_set`` columns cast to float. Both
        vectorizers are fitted on the training rows only.
    """
    title_col = f"title_{column_postfix}"
    body_col = f"body_{column_postfix}"

    # Cast so the vectorizers only ever see strings
    # (presumably guards against NaN in empty cells -- TODO confirm).
    for text_col in (title_col, body_col):
        df[text_col] = df[text_col].astype(str)

    train = df[df.test_tag == 0]
    test = df[df.test_tag == 1]

    # Fit on train only, then project the test rows into the same vocabulary.
    title_vectorizer = _make_text_vectorizer(title_max_features, ngram_range)
    train_title = title_vectorizer.fit_transform(train[title_col])
    test_title = title_vectorizer.transform(test[title_col])

    body_vectorizer = _make_text_vectorizer(body_max_features, ngram_range)
    train_body = body_vectorizer.fit_transform(train[body_col])
    test_body = body_vectorizer.transform(test[body_col])

    x_train = sparse.hstack((train_title, train_body, train[feature_set].astype(float)))
    x_test = sparse.hstack((test_title, test_body, test[feature_set].astype(float)))

    return x_train, x_test


def classify(algorithm, param_mode, repo=None):
    """Fit one classifier, score it on the test split and save a text report.

    Reads these notebook-level globals: ``classifiers``, ``x_train``,
    ``y_train``, ``x_test``, ``y_test``, ``feature_mode``, ``file_postfix``
    and ``n_jobs``. The report (classification report, accuracy, duration and,
    for tuned runs, the best parameters) is printed and written to
    ``results/<repo>_<title>.txt``.

    Parameters
    ----------
    algorithm : str
        Key into ``classifiers``.
    param_mode : str
        ``"default"`` uses the entry's ``"clf"``; ``"specified"`` uses
        ``"clf_with_params"``; ``"tuned"`` wraps ``"clf"`` in a
        ``RandomizedSearchCV`` over the entry's ``"random_grid"``.
    repo : str, optional
        Prefix of the report file name. When omitted, the global ``repo`` is
        used if it exists (the original behaviour); otherwise the global
        ``repo_name`` bound by the driver loop is used.

    Raises
    ------
    ValueError
        If ``param_mode`` is not one of the three supported modes.
    KeyError
        If the ``classifiers`` entry lacks the key the mode needs.
    NameError
        If no repository name can be resolved.
    """
    import os  # local import: only needed to create the results directory

    start_time = datetime.now()

    # Resolve the file-name prefix up front so a missing name fails before the
    # (potentially very long) fit instead of after it.
    if repo is None:
        scope = globals()
        if "repo" in scope:
            repo = scope["repo"]
        elif "repo_name" in scope:
            repo = scope["repo_name"]
        else:
            raise NameError("classify() needs a repository name: pass repo=... "
                            "or define `repo` / `repo_name` before calling it")

    title = f"{param_mode} {algorithm} + {feature_mode} {file_postfix[feature_mode]}"
    report = title.strip() + ":\n"

    entry = classifiers[algorithm]
    if param_mode == "default":
        model = entry["clf"]
    elif param_mode == "specified":
        model = entry["clf_with_params"]
    elif param_mode == "tuned":
        model = RandomizedSearchCV(estimator=entry["clf"],
                                   param_distributions=entry["random_grid"],
                                   n_iter=100, verbose=2, cv=3,
                                   random_state=42, n_jobs=n_jobs)
    else:
        # Previously this fell through and crashed later on an unbound `model`.
        raise ValueError(f"unknown param_mode {param_mode!r}; "
                         "expected 'default', 'specified' or 'tuned'")

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    report += classification_report(y_test, y_pred)

    if param_mode == "tuned":
        report += "\nbestparameters:\n" + str(model.best_params_) + '\n'

    # (y_true, y_pred) is the documented argument order; the score is the same.
    accuracyScore = accuracy_score(y_test, y_pred)
    report += "\naccuracy score:" + str(accuracyScore) + '\n'

    report += "\n\nduration: " + str(datetime.now() - start_time)

    print(report)

    # Make sure the output directory exists so the run is not lost at the
    # very last step.
    os.makedirs("results", exist_ok=True)
    with open(f"results/{repo}_{title}.txt", "w") as f:
        f.write(report)

    print("duration: " + str(datetime.now() - start_time))

In [4]:
# class_weight options shared by several of the search spaces below.
class_weight = ['balanced', None]

# Parallelism and seed shared by the estimators built from these settings.
n_jobs = 1
random_state = 42

# Random forest search space.
# 'sqrt' replaces the former 'auto': for RandomForestClassifier the two are
# documented as equivalent, but 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it makes every candidate using it fail.
rf_random_grid = {'bootstrap': [True, False],
                  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                  'max_features': ['sqrt', 'log2', None],
                  'min_samples_leaf': [1, 2, 4],
                  'min_samples_split': [2, 5, 10],
                  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
                  'class_weight': class_weight+["balanced_subsample"]}

# SVC search space: C spans 1e-3 .. 1e2 in decades.
svc_random_grid = {'C': np.logspace(-3, 2, 6), 
                   'gamma': ['auto', 'scale'],
                   'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                   'class_weight' : class_weight}

# SGD search space: l1_ratio takes the values 0.0, 0.2, ..., 1.0.
# NOTE(review): loss "log" was renamed "log_loss" in scikit-learn 1.1 and the
# old spelling was later removed -- confirm against the installed version.
sgd_random_grid = {"loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
                   "penalty": ["l1", "l2", "elasticnet"],
                   "l1_ratio": 0.2*np.arange(0,6),
                   'class_weight' : class_weight}

# k-nearest-neighbours search space.
knn_random_grid = {"leaf_size" : list(range(1,50)),
                   "n_neighbors" : list(range(1,35)),
                   "p": [1,2]}

# Logistic regression search space.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' -- confirm against the installed version.
lr_random_grid = {'C' : np.logspace(-3, 2, 6),
                  'penalty' : ['l2', 'none'],
                  'solver' : ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'class_weight' : class_weight}

# Keyword arguments repeated across the estimators below.
_seeded = {"random_state": random_state}
_seeded_parallel = {"n_jobs": n_jobs, "random_state": random_state}

# Registry of estimators keyed by short algorithm name. Each entry holds:
#   "clf"             - estimator with library defaults
#   "random_grid"     - search space for randomized tuning
#   "clf_with_params" - estimator with hand-picked parameters
# The two naive Bayes entries only provide "clf".
classifiers = {
    "mnb": {"clf": MultinomialNB()},
    "gnb": {"clf": GaussianNB()},
    "lr": {
        "clf": LogisticRegression(**_seeded_parallel),
        "random_grid": lr_random_grid,
        "clf_with_params": LogisticRegression(class_weight='balanced', **_seeded_parallel),
    },
    "sgd": {
        "clf": SGDClassifier(**_seeded_parallel),
        "random_grid": sgd_random_grid,
        "clf_with_params": SGDClassifier(**_seeded_parallel),
    },
    "svc": {
        "clf": SVC(**_seeded),
        "random_grid": svc_random_grid,
        "clf_with_params": SVC(kernel='rbf', gamma='scale', class_weight=None, C=1.0, **_seeded),
    },
    "rf": {
        "clf": RandomForestClassifier(**_seeded_parallel),
        "random_grid": rf_random_grid,
        "clf_with_params": RandomForestClassifier(**_seeded_parallel),
    },
    "knn": {
        "clf": KNeighborsClassifier(n_jobs=n_jobs),
        "random_grid": knn_random_grid,
        "clf_with_params": KNeighborsClassifier(n_jobs=n_jobs),
    },
}

In [5]:
# issue_features = [       
#     'is_pull_request'

#     'title_processed_words_num', 'body_processed_words_num', 
    
#     'num_of_codesnippets',
#     'num_of_urls',
        
#     'issue_type',
    
#     'body_sentistrenght_p',
#     'body_subjectivity',
#     'positive_body_sentistrenght_n',
#     'positive_body_polarity'
# ]


# user_features = [
#     'author_followers', 'author_following', 'author_public_repos', 'author_public_gists', 'author_issue_counts', 
#     'author_github_cntrb', 'author_repo_cntrb', 'author_account_age', 'numeric_association'
# ]

# Every column of labels_clusters.csv is treated as a feature name.
labels = pd.read_csv('labels_clusters.csv')
label_features = [column for column in labels.columns]

# Hand-picked feature columns, kept in the same groups (and order) the flat
# list was originally written in.
_SELECTED_FEATURE_GROUPS = (
    ('ft_issue_type', 'num_labels'),
    ('title_processed_words_num', 'body_processed_words_num', 'num_of_urls', 'has_code'),
    ('has_commit', 'has_assignee', 'is_pull_request'),
    ('same_author_closer', 'author_followers', 'author_following',
     'author_public_repos', 'author_public_gists', 'author_issue_counts',
     'author_github_cntrb', 'author_account_age', 'author_repo_cntrb',
     'numeric_association'),
    ('closer_followers', 'closer_following', 'closer_public_repos',
     'closer_public_gists', 'closer_repo_cntrb', 'closer_account_age',
     'closer_github_cntrb'),
    ('cm_developers_ratio', 'cm_mean_len'),
    ('num_events', 'num_comments', 'has_milestone', 'time_to_discuss'),
    ('body_sentistrenght_p', 'positive_body_sentistrenght_n',
     'positive_body_polarity', 'body_subjectivity'),
)

# Flatten the groups into the single list the rest of the notebook uses.
selected_features = [name for group in _SELECTED_FEATURE_GROUPS for name in group]

In [None]:
# ---- Experiment configuration: edit the values below to switch setups ----

# Extra (non-text) columns appended to the TF-IDF vectors.
# Alternative: use selected_features on its own.
feature_set = [*selected_features, *label_features]

# Column holding the class to predict. Alternative: "repo_label_cat".
target_column = "repo_label_2class"

# Key into file_postfix; it becomes part of the report title.
feature_mode = "tfidf"

# How the estimator is configured: "default", "specified" or "tuned".
param_mode = "default"

# Key into the classifiers registry.
algorithm_name = "lr"

# True: oversample the training split with SMOTE. False: leave it as loaded.
smote = True

# True: read data/<repo>_norm.csv. False: read data/<repo>.csv.
norm_data = True

In [None]:
# Names of the repositories (data sets) to run the experiment on.
with open("2class_repo_names.json") as f:
    repo_names = json.load(f)

for repo_name in repo_names:
    # The normalised data lives next to the raw data with a "_norm" suffix.
    csv_suffix = "_norm" if norm_data else ""
    df = pd.read_csv(f"data/{repo_name}{csv_suffix}.csv")

    if repo_name == 'cross_repo':
        # NOTE(review): repo_addresses is not defined in any cell visible in
        # this notebook -- it must be bound before this cell is run.
        # .copy() gives an independent frame: get_features() writes string-cast
        # columns back into df, which must not target a slice of the filtered
        # original.
        df = df[df.repo.isin(repo_addresses)].copy()

    # test_tag marks the split: 0 = train, 1 = test.
    y_train = df.loc[df.test_tag == 0, target_column]
    y_test = df.loc[df.test_tag == 1, target_column]
    x_train, x_test = get_features()

    if smote:
        # Oversample the training split only; the test split stays untouched.
        sm = SMOTE(random_state=42)
        x_train, y_train = sm.fit_resample(x_train, y_train)

    # classify() builds its output file name from the global `repo`; bind it to
    # the current repository so each run is saved under its own name.
    repo = repo_name

    print(f'------------------{repo_name}------------------')
    classify(algorithm_name, param_mode)