In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn import metrics

sys.path.insert(0, "../pycode")
from models import *

%matplotlib inline
pd.set_option("max_rows", 10)
np.set_printoptions(suppress=True)

from seaborn import set_style
set_style("darkgrid")
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.externals import joblib  # to save model to disk

from sklearn.tree import DecisionTreeClassifier      # decision tree
from sklearn.neighbors import KNeighborsClassifier  # KNN
#from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


from sklearn.model_selection import StratifiedKFold  # StratifiedKFold
kf = StratifiedKFold(n_splits=5, random_state=314159, shuffle=True)    # 5 fold

In [2]:
def gen_dataset_df(df_robust, app2metric_dd):
    """Assemble the modelling table: one row per application.

    Parameters
    ----------
    df_robust : pandas.DataFrame
        Must provide 'appName' and 'Robust' columns and be addressable by the
        integer labels 0 .. len(app2metric_dd) - 1 (rows are looked up with
        ``df_robust.loc[idx]``).
    app2metric_dd : dict
        Maps an application name to its metrics. Each value must expose
        ``.index`` (the metric names) and support lookup by metric name,
        e.g. a pandas Series. The first entry's index defines the feature
        columns used for every application.

    Returns
    -------
    pandas.DataFrame
        Columns are 'appName', the metric names (in the order of the first
        entry's index), then 'Robust'. All columns have object dtype and the
        index is 0 .. len(app2metric_dd) - 1.

    Raises
    ------
    IndexError
        If ``app2metric_dd`` is empty.
    KeyError
        If a row label or an application name cannot be found.
    """
    set1 = set(df_robust['appName'])
    set2 = set(app2metric_dd.keys())
    if set1 == set2:
        print("Great! appName(s) are identical!")
    else:
        # A mismatch used to pass silently; report it so a later KeyError or
        # a truncated dataset is easy to trace back to its cause.
        print("Warning: appName(s) differ between df_robust and app2metric_dd: %s"
              % (list(set1.symmetric_difference(set2)),))

    # The first entry's index supplies the metric (feature) names.
    first_key = list(app2metric_dd.keys())[0]
    metric_names = list(app2metric_dd[first_key].index)
    feat_cols = ['appName'] + metric_names + ['Robust']

    appNum = len(app2metric_dd)

    # Collect plain row lists and build the frame once, instead of assigning
    # every cell of a pre-allocated empty frame through .loc.
    records = []
    for idx in range(appNum):
        robust_row = df_robust.loc[idx]
        appName, robust = robust_row['appName'], robust_row['Robust']
        metric_list = app2metric_dd[appName]
        records.append([appName] + [metric_list[metric] for metric in metric_names] + [robust])

    # dtype=object keeps the column dtypes of the cell-by-cell construction.
    return pd.DataFrame(records, index=np.arange(0, appNum),
                        columns=feat_cols, dtype=object)

In [3]:
def gen_model_input(df_dataset):
    """Split the dataset table into model features and integer labels.

    Returns a ``(features, labels)`` pair: ``features`` is ``df_dataset``
    without the 'appName' and 'Robust' columns, and ``labels`` is the
    'Robust' column cast to int64.
    """
    non_feature_cols = ['appName', 'Robust']
    features = df_dataset.drop(non_feature_cols, axis=1)
    labels = df_dataset['Robust'].astype('int64')  # object column -> int
    return features, labels



In [4]:
# def GreedySearch(df_X, df_y, fold_k = 3):
#     modelErrorList = []
    
#     # decision tree
#     dt_tree_error, dt_tree_params = GreedySearch_dtree(df_X, df_y, fold_k=fold_k)
#     print dt_tree_error, dt_tree_params
#     modelErrorList.append((dt_tree_error, 'dtree', dt_tree_params))
    
#     # knn
#     knn_error, knn_params = GreedySearch_KNN(df_X, df_y, fold_k=fold_k)
#     print knn_error, knn_params
#     modelErrorList.append((knn_error, 'knn', knn_params))

#     # svm
#     svc_error, svc_params = GreedySearch_SVC(df_X, df_y, fold_k=fold_k)
#     print svc_error, svc_params
#     modelErrorList.append((svc_error, 'svc', svc_params))

#     # random forest
#     rf_error, rf_params = GreedySearch_RandomForest(df_X, df_y, fold_k=fold_k)
#     print rf_error, rf_params
#     modelErrorList.append((rf_error, 'random forest', rf_params))

#     nn_error, nn_params = GreedySearch_MLP(df_X, df_y, fold_k=fold_k)
#     print nn_error, nn_params
#     modelErrorList.append((nn_error, 'neural nets', nn_params))

#     ada_error, ada_params = GreedySearch_AdaBoost(df_X, df_y, fold_k=fold_k)
#     print ada_error, ada_params
#     modelErrorList.append((ada_error, 'AdaBoost', ada_params))

#     nb_error = GreedySearch_GaussianNB(df_X, df_y, fold_k=fold_k)
#     print nb_error
#     modelErrorList.append((nb_error, 'GaussianNB', ""))

#     qda_error = GreedySearch_QDA(df_X, df_y, fold_k=fold_k)
#     print qda_error
#     modelErrorList.append((qda_error, 'QDA', ""))
    
#     sort_model_by_error = sorted(modelErrorList, key=lambda x: x[0]) #  error is the 1st value
    
#     # output
#     return sort_model_by_error[0]

In [5]:
# Load the table that gen_dataset_df reads the 'appName' and 'Robust'
# columns from.
df_robust = pd.read_csv('../prepare/run2/app_classify.csv')
# Call form of print: same output on Python 2, valid syntax on Python 3.
print(type(df_robust))

<class 'pandas.core.frame.DataFrame'>


# featall

In [6]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_featall = np.load('../prepare/app2metric_featAll.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_featall)
df_dataset.to_csv("dataset_using_featall.csv", index=False, encoding='utf-8')

# Class balance of the 'Robust' label.
print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

# AdaBoost 0.21299019607843137 {'n_estimators': 100, 'learning_rate': 0.01}

error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clsfy = AdaBoostClassifier(n_estimators=100,
                               learning_rate=0.01,
                               random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    # NOTE(review): this picks the model on its own test fold, so the best
    # error is an optimistic estimate.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'featall_bestmodel.pkl')

print(error_list)

Great! appName(s) are identical!
Robust
0    53
1    26
dtype: int64
[0.29411764705882354, 0.125, 0.3125, 0.2, 0.13333333333333333]


In [7]:
# featall_bestmodel = joblib.load('featall_bestmodel.pkl')

# for train_index, test_index in kf.split(df_X, df_y):
#     #print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = df_X.loc[train_index], df_X.loc[test_index]
#     y_train, y_test = df_y.loc[train_index], df_y.loc[test_index]

#     err = metrics.mean_absolute_error(y_test, featall_bestmodel.predict(X_test))
#     print err

# feat64

In [8]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat64.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat64.csv", index=False, encoding='utf-8')

# Class balance of the 'Robust' label.
print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

#AdaBoost 0.21299019607843137 {'n_estimators': 100, 'learning_rate': 0.01}

error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clsfy = AdaBoostClassifier(n_estimators=100,
                               learning_rate=0.01,
                               random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat64_bestmodel.pkl')

print(error_list)

Great! appName(s) are identical!
Robust
0    53
1    26
dtype: int64
[0.29411764705882354, 0.125, 0.3125, 0.2, 0.13333333333333333]


# feat42

In [9]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat42.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat42.csv", index=False, encoding='utf-8')

# Class balance of the 'Robust' label.
print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)


# random forest 0.20039215686274509 {'n_estimators': 100, 'max_features': 'log2', 'criterion': 'gini'}
error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Seed the forest (same seed the AdaBoost cells use) so the fold errors
    # and the saved model are reproducible across runs.
    clsfy = RandomForestClassifier(n_estimators=100,
                                   max_features='log2',
                                   criterion='gini',
                                   random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat42_bestmodel.pkl')

print(error_list)



Great! appName(s) are identical!
Robust
0    53
1    26
dtype: int64
[0.23529411764705882, 0.3125, 0.3125, 0.13333333333333333, 0.13333333333333333]


# feat26

In [10]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat26.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat26.csv", index=False, encoding='utf-8')

# Class balance of the 'Robust' label.
print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)



# neural nets 0.21372549019607842 {'alpha': 1.0, 'activation': 'identity', 'solver': 'sgd', 'hidden_layer_sizes': (60, 60, 60)}
error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Seed the network (same seed the AdaBoost cells use) so weight
    # initialisation, and therefore the fold errors and the saved model,
    # are reproducible across runs.
    clsfy = MLPClassifier(alpha=1.0, activation='identity',
                          solver='sgd', hidden_layer_sizes=(60, 60, 60),
                          random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat26_bestmodel.pkl')

print(error_list)

Great! appName(s) are identical!
Robust
0    53
1    26
dtype: int64




[0.35294117647058826, 0.3125, 0.3125, 0.3333333333333333, 0.3333333333333333]


# feat18

In [11]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat18.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat18.csv", index=False, encoding='utf-8')

# Class balance of the 'Robust' label.
print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

# neural nets 0.26215686274509803 {'alpha': 1.0, 'activation': 'identity', 'solver': 'adam', 'hidden_layer_sizes': (60, 60, 60)}

error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Seed the network (same seed the AdaBoost cells use) so weight
    # initialisation, and therefore the fold errors and the saved model,
    # are reproducible across runs.
    clsfy = MLPClassifier(alpha=1.0, activation='identity',
                          solver='adam', hidden_layer_sizes=(60, 60, 60),
                          random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat18_bestmodel.pkl')

print(error_list)


Great! appName(s) are identical!
Robust
0    53
1    26
dtype: int64
[0.23529411764705882, 0.5, 0.375, 0.2, 0.13333333333333333]


# feat14

In [12]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat14.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat14.csv", index=False, encoding='utf-8')
# print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

#  knn 0.24975490196078431 {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'p': 2}

error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clsfy = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='brute', p=2)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat14_bestmodel.pkl')

print(error_list)


Great! appName(s) are identical!
[0.35294117647058826, 0.375, 0.1875, 0.2, 0.13333333333333333]


# feat12

In [13]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat12.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat12.csv", index=False, encoding='utf-8')
# print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

# knn 0.27558823529411763 {'n_neighbors': 10, 'weights': 'distance', 'algorithm': 'brute', 'p': 1}


error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clsfy = KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='brute', p=1)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat12_bestmodel.pkl')

print(error_list)

Great! appName(s) are identical!
[0.35294117647058826, 0.3125, 0.3125, 0.26666666666666666, 0.13333333333333333]


# feat9

In [14]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_feat9.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_feat9.csv", index=False, encoding='utf-8')
# print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

# knn 0.23715686274509803 {'n_neighbors': 4, 'weights': 'uniform', 'algorithm': 'brute', 'p': 1}
error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    clsfy = KNeighborsClassifier(n_neighbors=4, weights='uniform', algorithm='brute', p=1)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'feat9_bestmodel.pkl')

print(error_list)


Great! appName(s) are identical!
[0.29411764705882354, 0.375, 0.25, 0.13333333333333333, 0.13333333333333333]


# featMystic

In [15]:
# The .npy file holds a single Python object (used as a dict by
# gen_dataset_df), so it is stored pickled. NumPy >= 1.16.3 refuses pickled
# data unless allow_pickle=True is passed; only load files you trust.
app2metric_feat = np.load('../prepare/app2metric_featMystic.npy', allow_pickle=True).item()
df_dataset = gen_dataset_df(df_robust, app2metric_feat)
df_dataset.to_csv("dataset_using_featMystic.csv", index=False, encoding='utf-8')
# print(df_dataset.groupby("Robust").size())

df_X, df_y = gen_model_input(df_dataset)

# random forest 0.18779411764705883 {'n_estimators': 100, 'max_features': 'log2', 'criterion': 'gini'}

error_list = []
# Start from +inf so the first fold always writes a model, even when every
# fold's error is 1.0.
minError = float("inf")
for train_index, test_index in kf.split(df_X, df_y):
    # kf.split yields positional indices, so select rows with .iloc rather
    # than the label-based .loc.
    X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    # Seed the forest (same seed the AdaBoost cells use) so the fold errors
    # and the saved model are reproducible across runs.
    clsfy = RandomForestClassifier(n_estimators=100,
                                   max_features='log2',
                                   criterion='gini',
                                   random_state=0)

    clsfy.fit(X_train, y_train)
    # Mean absolute error between true and predicted labels on the held-out fold.
    err = metrics.mean_absolute_error(y_test, clsfy.predict(X_test))
    error_list.append(err)

    # Persist the classifier from the fold with the lowest held-out error.
    if err < minError:
        minError = err
        joblib.dump(clsfy, 'featMystic_bestmodel.pkl')

print(error_list)


Great! appName(s) are identical!
[0.23529411764705882, 0.25, 0.375, 0.2, 0.13333333333333333]
