# Binary classification script: issues of the selected "first issue(s)" class vs. all other issues

In [1]:
import pandas as pd
import os.path
import numpy as np

# Experiment setup
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn import metrics

# classifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Path of the CSV file holding the issue data set that the notebook loads.
file_name = "qt-dataset.csv"

In [3]:
# Load the data set. `pd.DataFrame.from_csv` was deprecated and later removed
# from pandas; `read_csv` with `index_col=0, parse_dates=True` reproduces its
# defaults (first column becomes the index, index values are parsed as dates
# where possible).
df = pd.read_csv(file_name, index_col=0, parse_dates=True)

In [4]:
# The bag-of-words feature column 'fit' is renamed to 'fit_feature'; per the
# original author it would otherwise be interpreted as a function call.
df = df.rename(columns={'fit': 'fit_feature'})

In [5]:
# Label columns available in the data set; each one defines a separate
# binary classification problem.
classes = [
    'class_first_issue',
    'class_first_five_issues',
    'class_first_ten_issues',
]

# Label column used for this run (change the index to pick another one).
target_class = classes[0]

In [6]:
# Short keys of the supported classifiers:
# rf = random forest, dt = decision tree, nb = naive Bayes, svm = support vector machine.
classifiers = [
    'rf',
    'dt',
    'nb',
    'svm',
]

# Classifier used for this run (change the index to pick another one).
classifier = classifiers[0]

In [7]:
########## SUMMARY ###########
# 1. get data of both classes
#   1.1 get true instances
#   1.2 get false instances
# 2. undersample false instances
# 3. combine the data of the true instances with the undersampled false instances
# 4. test and train sample
# 5. oversample the true sample
##############################


# 1. get data of both classes
#   1.1 get true instances
df_to_oversample = df[df[target_class] == 1]

#   1.2 get false instances
df_to_undersample = df[df[target_class] == 0]


# 2. undersample false instances
# Since the true class is later oversampled by duplicating it once, the false
# class is undersampled to twice the size of the true class. The size is
# capped at the number of available false rows because `sample` raises when
# asked for more rows than exist (sampling is done without replacement).
target_sample_size = min(len(df_to_oversample) * 2, len(df_to_undersample))
df_undersampled = df_to_undersample.sample(n=target_sample_size)


# 3. combine the data of the true instances with the undersampled false instances
# (`DataFrame.append` no longer exists in current pandas; `pd.concat` is the
# replacement.) `sample(frac=1)` shuffles the combined rows.
data_set = pd.concat([df_to_oversample, df_undersampled])
data_set = data_set.sample(frac=1)


# 4. test and train sample (15 % of the rows are held out for testing)
train, test = train_test_split(data_set, test_size=0.15)


# 5. oversample the true sample (IMPORTANT: oversample only the remaining rows,
#    that are not part of the test set!)
# Every true row of the training split is appended once more, i.e. duplicated.
# NOTE(review): the duplicates live inside `train`, so any cross-validation run
# on `train` can see the same row in both the fitting and the validation fold.
df_to_oversample = train[train[target_class] == 1]
train = pd.concat([train, df_to_oversample])

# clear memory
del df_to_oversample, df_to_undersample, df_undersampled

In [8]:
########## SUMMARY ###########
# Separate features from labels for both splits:
# 1. list the columns that must not take part in the classification
# 2. build the training features and labels
# 3. build the test features and labels
##############################


# 1. All label columns plus three further columns are excluded from the features.
columns_to_drop = classes + ['assignee', 'resolution_date', 'issue_description']

# 2. Training split: labels come from the selected target column, features are
#    everything that is left after removing the excluded columns.
train_label = train[target_class]
train_data = train.drop(columns=columns_to_drop)

# 3. Test split, handled the same way.
test_label = test[target_class]
test_data = test.drop(columns=columns_to_drop)

In [9]:
########## SUMMARY ###########
# Classification Benchmark
# 1. define hyper-parameter ranges
# 2. run grid search
# 3. print results
##############################

# 1. define hyper-parameter ranges
# `tuned_parameters` is the grid handed to the hyper-parameter search and
# `rfc` the (unfitted) estimator for the selected classifier key. Every value
# in the grid must be a list of candidates, even when only one value is tried.
if classifier == 'rf':
    # 'auto' is intentionally not listed: for classifiers it was only an alias
    # of 'sqrt' and recent scikit-learn releases reject it.
    tuned_parameters = {'n_estimators': [1, 10, 100, 1000, 3000], 'max_features': ['sqrt', 'log2', None]}
    rfc = RandomForestClassifier()
elif classifier == 'svm':
    tuned_parameters = {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']}
    rfc = SVC()
elif classifier == 'dt':
    # The two fixed settings are wrapped in single-element lists so that the
    # grid is valid.
    tuned_parameters = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'min_samples_split': [2], 'min_samples_leaf': [1]}
    rfc = DecisionTreeClassifier()
elif classifier == 'nb':
    # Nothing to tune: the empty grid evaluates the default configuration only.
    tuned_parameters = {}
    rfc = GaussianNB()
else:
    # Fail here with a clear message instead of a NameError on `rfc` later on.
    raise ValueError('unknown classifier key: %r' % (classifier,))

In [None]:
# 2. run the hyper-parameter search: exhaustive grid search with 10-fold
#    cross-validation, selecting the candidate with the best macro-averaged
#    precision. 38 parallel jobs are requested (presumably sized for the
#    machine the experiment was run on -- adjust as needed).
clf = GridSearchCV(
    estimator=rfc,
    param_grid=tuned_parameters,
    scoring='precision_macro',
    cv=10,
    n_jobs=38,
)
clf.fit(train_data, train_label)

# 3. print results
print('Classification Results:')
print('target class:', target_class)
print(clf.best_params_)

# Evaluate the selected model on the held-out test split; `result` holds
# (precision, recall, f-score, support) for the positive class (label 1).
y_true = test_label
y_pred = clf.predict(test_data)
result = precision_recall_fscore_support(y_true, y_pred, average="binary", pos_label=1)
print(result)

In [None]:
# store the results
# One row per run is added to a CSV file that accumulates all benchmark runs.
file_name_results = 'classification_results.csv'


# Row for this run: estimator class name, target label column and the test
# scores, extended by the best hyper-parameters found by the search
# (updating with an empty dict is a no-op, so no guard is needed).
d = {'___classifier': rfc.__class__.__name__, '__target_class': target_class, '_precision': result[0], '_recall': result[1], '_f1-score': result[2]}
d.update(clf.best_params_)

df_new_row = pd.DataFrame(d, index=[0])

if os.path.exists(file_name_results):
    # `pd.DataFrame.from_csv` and `DataFrame.append` no longer exist in current
    # pandas; `read_csv` / `concat` are their replacements. The first CSV
    # column is the row counter written by `to_csv` below.
    df_result = pd.read_csv(file_name_results, index_col=0)
    df_result = pd.concat([df_result, df_new_row], ignore_index=True)
else:
    df_result = df_new_row

df_result.to_csv(file_name_results)

In [None]:
# Reload all stored runs. `pd.DataFrame.from_csv` no longer exists in current
# pandas; `read_csv` with the first column as index is the replacement.
df_result = pd.read_csv(file_name_results, index_col=0)

In [None]:
# Order the stored runs from highest to lowest precision.
df_result = df_result.sort_values(by='_precision', ascending=False)

In [None]:
# Display the first three rows of the results table.
df_result.head(n=3)