In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn import metrics

sys.path.insert(0, "../pycode")
from models import *

%matplotlib inline
# Show at most 10 rows when a DataFrame is displayed.  The fully-qualified
# option name is used because the abbreviated pattern "max_rows" can match
# more than one registered pandas option, in which case set_option raises
# an OptionError instead of setting anything.
pd.set_option("display.max_rows", 10)
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [2]:
from seaborn import set_style
set_style("darkgrid")
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def gen_dataset_df(df_robust, app2metric_dd):
    """Assemble one table holding app name, metric values and robustness label.

    Parameters
    ----------
    df_robust : pandas.DataFrame
        Must provide the columns 'appName' and 'Robust'.  Rows are looked up
        by the index labels 0 .. N-1 (N = number of keys in
        ``app2metric_dd``), so the frame is expected to carry the default
        integer index (as produced by ``pd.read_csv``).
    app2metric_dd : mapping
        Maps an application name to its metrics.  Each value must expose the
        metric names through ``.index`` and be indexable by metric name
        (e.g. a pandas Series).  The metric names of the first entry define
        the feature columns for every row.

    Returns
    -------
    pandas.DataFrame
        N rows indexed 0 .. N-1 with object-typed columns ordered as
        ['appName', <metric names...>, 'Robust'].  When ``app2metric_dd`` is
        empty, an empty frame with the columns ['appName', 'Robust'] is
        returned.

    Notes
    -----
    A mismatch between the app names of the two inputs no longer passes
    silently: a warning listing the differing names is emitted.  Processing
    still continues, so a lookup for a name or metric that is missing may
    raise (typically ``KeyError``) further down.
    """
    import warnings  # local import: only needed for the mismatch warning

    robust_apps = set(df_robust['appName'])
    metric_apps = set(app2metric_dd.keys())
    if robust_apps == metric_apps:
        print("Great! appName(s) are identical!")
    else:
        only_robust = sorted(robust_apps - metric_apps, key=str)
        only_metric = sorted(metric_apps - robust_apps, key=str)
        warnings.warn(
            "appName mismatch: only in df_robust=%s, only in app2metric_dd=%s"
            % (only_robust, only_metric)
        )

    n_apps = len(app2metric_dd.keys())
    if n_apps == 0:
        # Nothing to tabulate; there is no first entry to take metric names from.
        return pd.DataFrame(columns=['appName', 'Robust'], dtype=object)

    # The first entry of the mapping defines which metrics become columns.
    first_key = next(iter(app2metric_dd.keys()))
    metric_names = list(app2metric_dd[first_key].index)
    columns = ['appName'] + metric_names + ['Robust']

    # Collect complete rows first and build the frame once, instead of
    # writing every cell through a separate .loc assignment.
    records = []
    for idx in range(n_apps):
        label_row = df_robust.loc[idx]  # label-based lookup, fetched once per row
        app_name = label_row['appName']
        metrics = app2metric_dd[app_name]
        records.append(
            [app_name] + [metrics[name] for name in metric_names] + [label_row['Robust']]
        )

    # dtype=object keeps the column types of the previous cell-wise fill, so
    # downstream code that converts 'Robust' explicitly keeps working.
    return pd.DataFrame(records, index=np.arange(0, n_apps), columns=columns, dtype=object)

In [4]:
# def gen_model_input(df_dataset):
#     np_data = df_dataset.values  # convert df to array
#     np_data_clean = np.delete(np_data, np.s_[0], axis=1) # remove 1st column = appName

#     ColNum = np_data_clean.shape[1]  # number of columns
#     np_X = np_data_clean[:,[i for i in xrange(0,ColNum-1)]]  # select data except the last col
#     np_y = np_data_clean[:,[-1]]  # select the last column (as the label)

#     return np_X, np_y


def gen_model_input(df_dataset):
    """Split a dataset frame into a feature frame and an integer label series.

    The feature frame is ``df_dataset`` without the 'appName' and 'Robust'
    columns; the labels are the 'Robust' column cast to int64.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features first, labels second.
    """
    non_feature_cols = ['appName', 'Robust']
    features = df_dataset.drop(non_feature_cols, axis=1)
    # the label column arrives as object dtype, so cast it explicitly
    labels = df_dataset['Robust'].astype('int64')
    return features, labels



In [5]:
def GreedySearch(df_X, df_y, fold_k = 3):
    """Run every per-model search and return the entry with the lowest error.

    Each search helper is called as ``helper(df_X, df_y, fold_k=fold_k)`` and
    its result is printed as soon as it is available.  The first six helpers
    return an ``(error, params)`` pair; the GaussianNB and QDA helpers return
    only the error, for which an empty string is recorded as params.

    NOTE(review): the helpers are not defined in this notebook (presumably
    they come from the ``models`` module); what exactly they search and how
    ``fold_k`` is used should be confirmed there.

    Returns
    -------
    tuple
        ``(error, model label, params)`` of the entry with the smallest error.
    """
    # (label stored in the result, search helper, helper also returns params?)
    search_plan = [
        ('dtree', GreedySearch_dtree, True),
        ('knn', GreedySearch_KNN, True),
        ('svc', GreedySearch_SVC, True),
        ('random forest', GreedySearch_RandomForest, True),
        ('neural nets', GreedySearch_MLP, True),
        ('AdaBoost', GreedySearch_AdaBoost, True),
        ('GaussianNB', GreedySearch_GaussianNB, False),
        ('QDA', GreedySearch_QDA, False),
    ]

    candidates = []
    for label, search, returns_params in search_plan:
        outcome = search(df_X, df_y, fold_k=fold_k)
        if returns_params:
            model_error, model_params = outcome
            print("%s %s" % (model_error, model_params))
        else:
            model_error, model_params = outcome, ""
            print(model_error)
        candidates.append((model_error, label, model_params))

    # rank by the error, which is the first field of every entry
    ranked = sorted(candidates, key=lambda entry: entry[0])
    return ranked[0]

In [6]:
# Per-application labels; gen_dataset_df reads the 'appName' and 'Robust'
# columns of this frame.
df_robust = pd.read_csv('../prepare/run2/app_classify.csv')
print(type(df_robust))  # sanity check that a DataFrame was loaded

<class 'pandas.core.frame.DataFrame'>


# featall

In [7]:
# app2metric_featall = np.load('../prepare/app2metric_featAll.npy').item()
# df_dataset = gen_dataset_df(df_robust, app2metric_featall)
# df_dataset.to_csv("dataset_using_featall.csv", index=False, encoding='utf-8')

# print df_dataset.groupby("Robust").size()

# df_X, df_y = gen_model_input(df_dataset)

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=3)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=5)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=10)
# print bestModel, error, modelParam

# feat64

In [8]:
# app2metric_feat = np.load('../prepare/app2metric_feat64.npy').item()
# df_dataset = gen_dataset_df(df_robust, app2metric_feat)
# df_dataset.to_csv("dataset_using_feat64.csv", index=False, encoding='utf-8')

# print df_dataset.groupby("Robust").size()

# df_X, df_y = gen_model_input(df_dataset)

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 3)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 5)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 10)
# print bestModel, error, modelParam

# feat42

In [9]:
# app2metric_feat = np.load('../prepare/app2metric_feat42.npy').item()
# df_dataset = gen_dataset_df(df_robust, app2metric_feat)
# df_dataset.to_csv("dataset_using_feat42.csv", index=False, encoding='utf-8')

# print df_dataset.groupby("Robust").size()

# df_X, df_y = gen_model_input(df_dataset)

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 3)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 5)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 10)
# print bestModel, error, modelParam


# feat26

In [10]:
# app2metric_feat = np.load('../prepare/app2metric_feat26.npy').item()
# df_dataset = gen_dataset_df(df_robust, app2metric_feat)
# df_dataset.to_csv("dataset_using_feat26.csv", index=False, encoding='utf-8')

# print df_dataset.groupby("Robust").size()

# df_X, df_y = gen_model_input(df_dataset)

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 3)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 5)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 10)
# print bestModel, error, modelParam


# feat18

In [11]:
# app2metric_feat = np.load('../prepare/app2metric_feat18.npy').item()
# df_dataset = gen_dataset_df(df_robust, app2metric_feat)
# df_dataset.to_csv("dataset_using_feat18.csv", index=False, encoding='utf-8')

# print df_dataset.groupby("Robust").size()

# df_X, df_y = gen_model_input(df_dataset)

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 3)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 5)
# print bestModel, error, modelParam

# # Test different models, find the best
# [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k = 10)
# print bestModel, error, modelParam

# feat14

In [12]:
# The .npy file holds a pickled Python object, unwrapped by .item().
# allow_pickle=True is passed explicitly because NumPy versions whose default
# is allow_pickle=False refuse to load such files.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own prepare step.
app2metric_feat = np.load('../prepare/app2metric_feat14.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat14.csv", index=False, encoding='utf-8')
#print df_dataset.groupby("Robust").size()

df_X, df_y = gen_model_input(df_dataset)

# Test different models and report the best one for each fold_k setting
for fold_k in (3, 5, 10):
    [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=fold_k)
    print("%s %s %s" % (bestModel, error, modelParam))

Great! appName(s) are identical!
0.32790123456790127 {'criterion': 'entropy', 'max_depth': 2}
0.2908641975308642 {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2}
0.32888888888888884 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.30320987654320986 {'n_estimators': 10, 'max_features': 'log2', 'criterion': 'gini'}




0.30419753086419754 {'alpha': 0.1, 'activation': 'relu', 'solver': 'sgd', 'hidden_layer_sizes': (30, 30, 30)}
0.32888888888888884 {'n_estimators': 30, 'learning_rate': 1.0}
0.46469135802469136
0.425679012345679
knn 0.2908641975308642 {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2}


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


0.3813235294117647 {'criterion': 'gini', 'max_depth': 2}
0.24975490196078431 {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'p': 2}
0.328921568627451 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.26705882352941174 {'n_estimators': 50, 'max_features': 'auto', 'criterion': 'entropy'}
0.276421568627451 {'alpha': 0.001, 'activation': 'tanh', 'solver': 'sgd', 'hidden_layer_sizes': (30, 30, 30)}
0.2787254901960784 {'n_estimators': 60, 'learning_rate': 0.1}
0.42671568627450973
0.31088235294117644
knn 0.24975490196078431 {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'p': 2}
0.31726190476190474 {'criterion': 'entropy', 'max_depth': 2}
0.21488095238095237 {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'p': 1}
0.28849206349206347 {'kernel': 'linear', 'C': 1.0, 'degree': 6}
0.23988095238095236 {'n_estimators': 10, 'max_features': 'log2', 'criterion': 'gini'}
0.2871031746031746 {'alpha': 0.1, 'activation': 'tanh', 'solver': 'sgd', 'hidden_layer

# feat12

In [13]:
# The .npy file holds a pickled Python object, unwrapped by .item().
# allow_pickle=True is passed explicitly because NumPy versions whose default
# is allow_pickle=False refuse to load such files.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own prepare step.
app2metric_feat = np.load('../prepare/app2metric_feat12.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat12.csv", index=False, encoding='utf-8')
#print df_dataset.groupby("Robust").size()

df_X, df_y = gen_model_input(df_dataset)

# Test different models and report the best one for each fold_k setting
for fold_k in (3, 5, 10):
    [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=fold_k)
    print("%s %s %s" % (bestModel, error, modelParam))

Great! appName(s) are identical!
0.3071604938271605 {'criterion': 'entropy', 'max_depth': 6}
0.32888888888888884 {'n_neighbors': 4, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1}
0.32888888888888884 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.30518518518518517 {'n_estimators': 50, 'max_features': 'auto', 'criterion': 'entropy'}
0.32888888888888884 {'alpha': 0.001, 'activation': 'relu', 'solver': 'sgd', 'hidden_layer_sizes': (100, 100, 100)}
0.3422222222222222 {'n_estimators': 100, 'learning_rate': 0.01}
0.5654320987654321
0.47901234567901235
random forest 0.30518518518518517 {'n_estimators': 50, 'max_features': 'auto', 'criterion': 'entropy'}
0.32568627450980386 {'criterion': 'entropy', 'max_depth': 4}
0.27558823529411763 {'n_neighbors': 10, 'weights': 'distance', 'algorithm': 'brute', 'p': 1}
0.328921568627451 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.27862745098039216 {'n_estimators': 50, 'max_features': 'log2', 'criterion': 'entropy'}
0.28735294117647053 {'alpha': 1

# feat9

In [14]:
# The .npy file holds a pickled Python object, unwrapped by .item().
# allow_pickle=True is passed explicitly because NumPy versions whose default
# is allow_pickle=False refuse to load such files.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own prepare step.
app2metric_feat = np.load('../prepare/app2metric_feat9.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat9.csv", index=False, encoding='utf-8')
#print df_dataset.groupby("Robust").size()

df_X, df_y = gen_model_input(df_dataset)

# Test different models and report the best one for each fold_k setting
for fold_k in (3, 5, 10):
    [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=fold_k)
    print("%s %s %s" % (bestModel, error, modelParam))

Great! appName(s) are identical!
0.31555555555555553 {'criterion': 'gini', 'max_depth': 3}
0.30123456790123454 {'n_neighbors': 10, 'weights': 'distance', 'algorithm': 'brute', 'p': 1}
0.32888888888888884 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.30419753086419754 {'n_estimators': 50, 'max_features': 'log2', 'criterion': 'entropy'}
0.2898765432098765 {'alpha': 1.0, 'activation': 'identity', 'solver': 'sgd', 'hidden_layer_sizes': (30, 30, 30)}
0.3298765432098765 {'n_estimators': 100, 'learning_rate': 0.01}
0.5293827160493827
0.3916049382716049
neural nets 0.2898765432098765 {'alpha': 1.0, 'activation': 'identity', 'solver': 'sgd', 'hidden_layer_sizes': (30, 30, 30)}
0.29049019607843135 {'criterion': 'entropy', 'max_depth': 4}
0.23715686274509803 {'n_neighbors': 4, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1}
0.328921568627451 {'kernel': 'sigmoid', 'C': 0.025, 'degree': 6}
0.2927941176470588 {'n_estimators': 50, 'max_features': 'auto', 'criterion': 'gini'}
0.2646568627450980

# featMystic

In [15]:
# The .npy file holds a pickled Python object, unwrapped by .item().
# allow_pickle=True is passed explicitly because NumPy versions whose default
# is allow_pickle=False refuse to load such files.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project's own prepare step.
app2metric_feat = np.load('../prepare/app2metric_featMystic.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_featMystic.csv", index=False, encoding='utf-8')
#print df_dataset.groupby("Robust").size()

df_X, df_y = gen_model_input(df_dataset)

# Test different models and report the best one for each fold_k setting
for fold_k in (3, 5, 10):
    [error, bestModel, modelParam] = GreedySearch(df_X, df_y, fold_k=fold_k)
    print("%s %s %s" % (bestModel, error, modelParam))

Great! appName(s) are identical!
0.22518518518518518 {'criterion': 'gini', 'max_depth': 10}
0.2765432098765432 {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'p': 2}
0.2908641975308642 {'kernel': 'poly', 'C': 0.025, 'degree': 6}
0.3298765432098765 {'n_estimators': 50, 'max_features': 'log2', 'criterion': 'entropy'}
0.27851851851851855 {'alpha': 0.01, 'activation': 'identity', 'solver': 'adam', 'hidden_layer_sizes': (100, 100, 100)}
0.18814814814814815 {'n_estimators': 30, 'learning_rate': 0.01}
0.30617283950617286
0.32888888888888884
AdaBoost 0.18814814814814815 {'n_estimators': 30, 'learning_rate': 0.01}
0.20112745098039214 {'criterion': 'entropy', 'max_depth': 7}
0.2403921568627451 {'n_neighbors': 10, 'weights': 'distance', 'algorithm': 'brute', 'p': 2}
0.2897549019607843 {'kernel': 'poly', 'C': 0.025, 'degree': 6}
0.18779411764705883 {'n_estimators': 100, 'max_features': 'log2', 'criterion': 'gini'}
0.2403921568627451 {'alpha': 0.001, 'activation': 'tanh', 'solver':