In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, chi2, f_classif, RFECV, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, brier_score_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from warnings import filterwarnings
from sklearn.model_selection import GridSearchCV
from functools import partial
import itertools

* read csv
* encoding, missing value handling, oversampling
* models -> voting classifier for improvements
* evaluation
* visualization
* <3

In [2]:
filterwarnings(action='ignore')

In [3]:
train_df = pd.read_csv('lucene-solr/dataset/training.csv')
test_df = pd.read_csv('lucene-solr/dataset/testing.csv')

In [4]:
print('train: ', len(train_df))
print('test: ', len(test_df))

train:  5329
test:  7117


In [5]:
for df in [train_df, test_df]:
    df.dropna(subset=list(df.columns).remove('Bugged'), inplace=True)
    for col in df.columns:
        df[col] = df[col].apply(lambda x: 1 if x is True else 0)

In [6]:
X_train = train_df.drop(['Bugged'], axis=1)
y_train = train_df['Bugged']

X_test = test_df.drop(['Bugged'], axis=1)
y_test = test_df['Bugged']

In [7]:
print('train: ', len(train_df))
print('test: ', len(test_df))

train:  4105
test:  5643


In [8]:
train_df = train_df.dropna()

In [9]:
models = {
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'LogisticRegression': LogisticRegression(),
        'BernoulliNaiveBayes': BernoulliNB(),
        'K-NearestNeighbor': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SupportVectorMachine': SVC(),
        'MultilayerPerceptron': MLPClassifier()
    }

params = {
        'LinearDiscriminantAnalysis': {},
        'QuadraticDiscriminantAnalysis': {},
        'LogisticRegression': {'C': list(np.logspace(-4, 4, 3))},
        'BernoulliNaiveBayes': {},
        'K-NearestNeighbor': {},
        'DecisionTree': {'criterion': ['gini', 'entropy'], },
        'RandomForest': {'n_estimators': [10, 100]},
        'SupportVectorMachine': {'C': [0.1, 100]},
        'MultilayerPerceptron': {'hidden_layer_sizes': [(17, 8, 17)],
                                 'activation': ['tanh', 'relu']}
    }

selection_methods = {
        'chi2_20p': SelectPercentile(chi2, percentile=20),
        'chi2_50p': SelectPercentile(chi2, percentile=50),
        'mutual_info_classif_20p': SelectPercentile(mutual_info_classif, percentile=20),
        'mutual_info_classif_50p': SelectPercentile(mutual_info_classif, percentile=50),
        'f_classif_20': SelectPercentile(f_classif, percentile=20),
        'f_classif_50': SelectPercentile(f_classif, percentile=50),
        'recursive_elimination': RFECV(RandomForestClassifier(), min_features_to_select=3, step=1, cv=5, scoring='f1')
    }


In [10]:
# def select(X, y):
#     selected_data = pd.DataFrame(columns=list(selection_methods.keys()))
#     selected_features = pd.DataFrame(columns=list(selection_methods.keys()))
#     features = X.columns
#     for method_name, method in selection_methods.items():
#         selected_data[method_name] = method.fit_transform(X, y).tolist()
#         features_mask = method.get_support()
#         selected_features[method_name] = np.array(features)[features_mask].tolist()
#         print(np.array(features)[features_mask].tolist())
# #     selected_data['all'] = X
# #     selected_features['all'] = list(features)
#     return selected_features, selected_data

In [11]:
selected_data = {}
selected_features = {}
features = X_train.columns
for method_name, method in selection_methods.items():
    selected_data[method_name] = method.fit_transform(X_train, y_train)
    features_mask = method.get_support()
    selected_features[method_name] = np.array(features)[features_mask].tolist()
selected_data['all'] = X_train
selected_features['all'] = list(features)

In [12]:
# selected_data = pd.DataFrame(columns=list(selection_methods.keys()))
# selected_features = pd.DataFrame(columns=list(selection_methods.keys()))

# selected_features_train, selected_data_train = select(X_train, y_train)

In [13]:
# selected_data
# selected_features

In [14]:
oversampled_datasets = {method: SMOTE().fit_resample(X_train, y_train) for method, X_train in selected_data.items()}

oversampled_training = oversampled_datasets

# selected_features, selected_dataset = select(X_train, y_train)

In [15]:
def get_selected_testing(test_df, selected_features):
    features = test_df.columns
    test_y = test_df['Bugged']
    selected_testing_datasets = {
    method: (test_df[test_df.columns.intersection(features)].values, test_y)
    for method, features in selected_features.items()
    }
    return selected_testing_datasets

In [16]:
selected_testing_datasets = get_selected_testing(test_df, selected_features)
selected_testing = selected_testing_datasets

In [17]:
# selected_testing_datasets

In [18]:
grid_searches = {}

In [19]:
def fit(X, y, cv=5, n_jobs=1, verbose=1, scoring=None, refit=False):
    for key in models.keys():
        model = models[key]
        param = params[key]
        gs = GridSearchCV(model, param, cv=cv, n_jobs=n_jobs, verbose=verbose,
                          scoring=scoring, refit=refit, return_train_score=True)
        gs.fit(X, y)
        grid_searches[key] = gs

In [20]:
def score_summary(sort_by='mean_score'):

    def extract_rows(key: str):
        def get_cv_results(cv, params):
            key = "split{}_test_score".format(cv)
            return grid_search.cv_results_[key]

        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': np.min(scores),
                'max_score': np.max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores)
            }
            return pd.Series({**params, **d})
    
        grid_search = grid_searches[key]
        params = grid_search.cv_results_['params']
        get_cv_results_with_params = partial(get_cv_results, params=params)
        scores = np.hstack(list(map(get_cv_results_with_params, range(grid_search.cv))))
        summary = list(map(lambda values:
                           row(key, values[1], values[0]),
                           list(zip(params, scores))))
        return summary
    
    rows = list(itertools.chain.from_iterable(map(extract_rows, grid_searches.keys())))
    df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
    columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
    columns = columns + [c for c in df.columns if c not in columns]
    return df[columns]


In [21]:
def get_summary(X, y):
    fit(X, y)
    return score_summary()

In [22]:
print(X_train, y_train)
print()
print(selected_data)
# for method, data in selected_data.items():
#     print(data[0], data[1])

      ImperativeAbstraction  MultifacetedAbstraction  UnnecessaryAbstraction  \
0                         0                        0                       0   
1                         0                        0                       0   
2                         0                        0                       0   
3                         0                        0                       0   
4                         0                        0                       0   
...                     ...                      ...                     ...   
4944                      0                        0                       0   
4945                      0                        0                       0   
4946                      0                        0                       0   
4947                      0                        0                       0   
4948                      0                        0                       0   

      UnutilizedAbstraction  DeficientE

In [23]:
for method, data in selected_data.items():
    try:
        print(data[0], data[1])
    except:
        pass

[0 1 0 0] [0 0 0 0]
[0 0 0 0 1 0 0 0] [0 0 0 0 0 0 0 0]
[0 0 0 0] [1 1 0 0]
[0 0 0 0 0 0 0 0] [0 0 0 1 1 0 0 0]
[0 1 0 0] [0 0 0 0]
[0 0 0 0 1 0 0 0] [0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0] [0 0 1 1 0 0 0 0 0 0 0 0 0]


In [24]:
summaries = {method: get_summary(data[0], data[1])
                     for method, data in oversampled_datasets.items()}

# top_summaries = {method: summary[:n] for method, summary in summaries.items()}

# configurations = {method: list(map(lambda x: x[1].to_dict(),
#                                            top_summary.drop(EstimatorSelectionHelper.get_scores_info(),
#                                                             axis=1)
#                                            .where(pd.notnull(top_summary), None).iterrows()))
#                           for method, top_summary in top_summaries.items()}

# method_names = configurations.keys()
#         scores_dicts = list(map(lambda method_name:
#                                 list(map(lambda configuration:
#                                          calculate_score(method_name,
#                                                          oversampled_training[method_name],
#                                                          selected_testing[method_name],
#                                                          configuration),
#                                          configurations[method_name])), method_names))
#         scores_df = [pd.DataFrame(score) for score in scores_dicts]
#         scores = pd.concat(scores_df)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totall

In [25]:
# summaries

In [26]:
top_summaries = {method: summary[:10] for method, summary in summaries.items()}

In [27]:
def get_scores_info():
    return ['min_score',
            'max_score',
            'mean_score',
            'std_score']

In [28]:
def calculate_scores(configurations, oversampled_training, selected_testing):
    def calculate_score(method_name, training, testing, configuration):
        estimator = models[configuration['estimator']]
        params = {key: val for key, val in configuration.items() if not (val is None or key == 'estimator')}
        estimator.set_params(**params)
        training_X, training_y = training
        estimator.fit(training_X, training_y)
        testing_X, testing_y = testing
        prediction_y = estimator.predict(testing_X)
        scores_dict = {
            'estimator': configuration['estimator'],
            'configuration': str(params),
            'feature_selection': method_name,
            'precision': precision_score(testing_y, prediction_y),
            'recall': recall_score(testing_y, prediction_y),
            'f1-measure': f1_score(testing_y, prediction_y),
            'auc-roc': roc_auc_score(testing_y, prediction_y),
            'brier score': brier_score_loss(testing_y, prediction_y)
        }
        return scores_dict

    method_names = configurations.keys()
    scores_dicts = list(map(lambda method_name:
                            list(map(lambda configuration:
                                     calculate_score(method_name,
                                                     oversampled_training[method_name],
                                                     selected_testing[method_name],
                                                     configuration),
                                     configurations[method_name])), method_names))
    scores_df = [pd.DataFrame(score) for score in scores_dicts]
    scores = pd.concat(scores_df)
    return scores

In [29]:
configurations = {method: list(map(lambda x: x[1].to_dict(),
                                           top_summary.drop(get_scores_info(),
                                                            axis=1)
                                           .where(pd.notnull(top_summary), None).iterrows()))
                          for method, top_summary in top_summaries.items()}

calculate_scores(configurations, oversampled_training, selected_testing)


Unnamed: 0,estimator,configuration,feature_selection,precision,recall,f1-measure,auc-roc,brier score
0,LinearDiscriminantAnalysis,{},chi2_20p,0.380762,0.159798,0.225118,0.545211,0.231792
1,QuadraticDiscriminantAnalysis,{},chi2_20p,0.380762,0.159798,0.225118,0.545211,0.231792
2,LogisticRegression,{'C': 1.0},chi2_20p,0.380762,0.159798,0.225118,0.545211,0.231792
3,LogisticRegression,{'C': 10000.0},chi2_20p,0.380762,0.159798,0.225118,0.545211,0.231792
4,BernoulliNaiveBayes,{},chi2_20p,0.380762,0.159798,0.225118,0.545211,0.231792
...,...,...,...,...,...,...,...,...
5,SupportVectorMachine,{'C': 100},all,0.205679,0.749369,0.322768,0.488403,0.662591
6,SupportVectorMachine,{'C': 0.1},all,0.210790,0.778806,0.331781,0.500202,0.660996
7,RandomForest,{'n_estimators': 10},all,0.208560,0.774601,0.328635,0.494956,0.666844
8,K-NearestNeighbor,{},all,0.199920,0.841043,0.323050,0.471262,0.742690
