In [1]:
# Environment setup and module import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier




In [74]:
def load_encoded_data(filename):
    """Read ``data/<filename>_encoded.csv`` and split off the target column.

    Parameters
    ----------
    filename : str
        Stem of the CSV file; the path read is
        ``'data/' + filename + '_encoded.csv'``.

    Returns
    -------
    tuple
        ``(data, labels)`` where ``data`` is the frame without the
        ``HasDetections`` column and ``labels`` is that column as a 1-D
        array. ``labels`` is ``None`` when the file has no such column.
    """
    data = pd.read_csv('data/'+filename+'_encoded.csv')

    # Unlabelled file: nothing to split off.
    if 'HasDetections' not in data.columns:
        return data, None

    # pop() removes the target from the feature frame and hands it back.
    labels = data.pop('HasDetections').values
    return data, labels

In [75]:
# Load the three "mini" splits. Each call returns a (features, labels) pair;
# labels is None when the file has no 'HasDetections' column.
print("loading mini train ...")
mini_train_data, mini_train_labels = load_encoded_data('mini_train')
print("loading mini dev ...")
mini_dev_data, mini_dev_labels = load_encoded_data('mini_dev')
print("loading mini test ...")
mini_test_data, mini_test_labels = load_encoded_data('mini_test')



loading mini train ...
loading mini dev ...
loading mini test ...


In [4]:
def find_correlations(data, labels):
    """Rank the columns of ``data`` by their correlation with ``labels``.

    Columns holding a single distinct value are skipped. For every other
    column the correlation coefficient with ``labels`` is taken from
    ``np.corrcoef``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame with one row per entry of ``labels``.
    labels : array-like
        Target values.

    Returns
    -------
    pandas.DataFrame
        Columns ``col``, ``abscorr`` and ``corr``, sorted by ``abscorr`` in
        descending order. The index keeps each row's pre-sort position.
    """
    n_rows = len(labels)
    rows = []
    for name in data.columns:
        series = data[name]
        if len(series.unique()) < 2:
            # A constant column has no defined correlation.
            continue
        coefficient = np.corrcoef(series.values.reshape(n_rows), labels)[0][1]
        rows.append([name, np.abs(coefficient), coefficient])

    ranking = pd.DataFrame(rows, columns=['col', 'abscorr', 'corr'])
    return ranking.sort_values('abscorr', ascending=False)

In [144]:
# Rank the non-constant training features by absolute correlation with the labels.
corr = find_correlations(mini_train_data, mini_train_labels)
display(corr)
# Column names as a flat array, in ranked order (largest |corr| first).
corr_cols = corr['col'].values.reshape(len(corr))


Unnamed: 0,col,abscorr,corr
67,AVProductsInstalled,0.144179,-0.144179
128,AVProductStatesIdentifier,0.118651,0.118651
29,OsBuildLab_platform,0.074611,-0.074611
6,Processor,0.074339,-0.074339
20,Census_OSArchitecture,0.074035,-0.074035
56,AppVersion_build1,0.073126,-0.073126
88,Census_IsAlwaysOnAlwaysConnectedCapable,0.061639,-0.061639
41,EngineVersion_build2,0.061433,-0.061433
71,IsProtected,0.055379,0.055379
134,Census_TotalPhysicalRAM,0.055017,0.055017


In [6]:
def score_log_reg(train_data, train_labels, dev_data, dev_labels):
    """Fit a logistic regression on the training split and score it on dev.

    The classifier uses the ``lbfgs`` solver with at most 150 iterations.

    Returns
    -------
    float
        Accuracy of the fitted model's predictions on ``dev_data``
        measured against ``dev_labels``.
    """
    model = LogisticRegression(max_iter=150, solver='lbfgs')
    dev_predictions = model.fit(train_data, train_labels).predict(dev_data)
    return metrics.accuracy_score(dev_labels, dev_predictions)


In [145]:
# Greedy forward feature selection.
# Walk the columns in order of decreasing |correlation| with the label, and
# keep a column only if adding it strictly improves dev-set accuracy.
#
# NOTE(review): score_tree is defined in a later cell of this notebook, so
# that cell must be run before this one on a fresh kernel.
print("using only top n correlated columns")
print("fitting and scoring decision trees ...")
print()

cols = []
best = 0
# Iterate over the ranked columns themselves rather than a hard-coded count,
# so the loop cannot run past the end of corr_cols or stop short of it.
for n, col in enumerate(corr_cols, start=1):
    print(n, end=" ")
    print('adding column', col, '...', end=' ')

    # Score the current selection plus this one candidate column.
    candidate = cols + [col]
    data = mini_train_data.filter(candidate)
    dev_data = mini_dev_data.filter(candidate)
    acc = score_tree(data, mini_train_labels, dev_data, mini_dev_labels)
    print('accuracy =', acc, end=" ")

    if acc > best:
        print("keeping this column")
        cols = candidate
        best = acc
    else:
        # The candidate did not help; the selection stays as it was.
        print()

# Emit the result as a Python list literal that can be pasted into a cell.
print("cols_selected =[")
for col in cols:
    print('    \''+col+'\',')
print("]")



using only top n correlated columns
fitting and scoring logistic regressions ...

1 adding column AVProductsInstalled ... accuracy = 0.5660205347401772 keeping this column
2 adding column AVProductStatesIdentifier ... accuracy = 0.5682772638280701 keeping this column
3 adding column OsBuildLab_platform ... accuracy = 0.5796057449447773 keeping this column
4 adding column Processor ... accuracy = 0.5796505806220204 keeping this column
5 adding column Census_OSArchitecture ... accuracy = 0.5795011283645439 
6 adding column AppVersion_build1 ... accuracy = 0.5686060587945181 
7 adding column Census_IsAlwaysOnAlwaysConnectedCapable ... accuracy = 0.5815635695177176 keeping this column
8 adding column EngineVersion_build2 ... accuracy = 0.5867495628521469 keeping this column
9 adding column IsProtected ... accuracy = 0.5866897819491563 
10 adding column Census_TotalPhysicalRAM ... accuracy = 0.49257969541629926 
11 adding column AvSigVersion ... accuracy = 0.5185395525399411 
12 adding colu

104 adding column Census_OEMNameIdentifier_wasna ... accuracy = 0.5982424414520782 
105 adding column Census_OEMModelIdentifier_wasna ... accuracy = 0.5975848515191822 
106 adding column OsVer_combined ... accuracy = 0.5985114555155355 keeping this column
107 adding column OsVer_major ... accuracy = 0.5985562911927784 keeping this column
108 adding column GeoNameIdentifier ... accuracy = 0.579949485136973 
109 adding column OsVer_build1 ... accuracy = 0.5981826605490876 
110 adding column UacLuaenable_wasna ... accuracy = 0.5979435369371254 
111 adding column OsVer ... accuracy = 0.5983171675808163 
112 adding column GeoNameIdentifier_wasna ... accuracy = 0.5984666198382926 
113 adding column Census_IsFlightsDisabled ... accuracy = 0.5986011268700214 keeping this column
114 adding column AvSigVersion_major ... accuracy = 0.5985114555155355 
115 adding column Census_OSInstallLanguageIdentifier_wasna ... accuracy = 0.5978837560341349 
116 adding column Census_IsPortableOperatingSystem ..

In [142]:
# Hand-maintained feature subset used for the final evaluations below.
# NOTE(review): this list does not contain every column that the selection
# loop's saved output reports as kept (e.g. 'Processor') - confirm it is
# still the intended set.
cols_selected = [
    'AVProductsInstalled',
    'AVProductStatesIdentifier',
    'OsBuildLab_platform',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'EngineVersion_build2',
    'Census_ProcessorCoreCount',
    'Census_IsVirtualDevice',
    'AppVersion_minor',
    'Census_IsTouchEnabled',
    'SMode_wasna',
    # Candidates currently switched off:
    # 'RtpStateBitfield',
    # 'AVProductsEnabled',
    # 'AVProductsEnabled_wasna',
    # 'RtpStateBitfield_wasna',
    # 'Wdft_IsGamer_wasna',
    # 'Census_IsAlwaysOnAlwaysConnectedCapable_wasna',
    # 'AvSigVersion_combined',
    # 'AppVersion_combined',
    # 'Platform',
    # 'Census_IsPortableOperatingSystem',
    # 'OsVer_minor',
    # 'UacLuaenable',
    # 'Census_IsVirtualDevice_wasna',
    # 'Census_InternalPrimaryDisplayResolutionHorizontal_wasna',
]

# Restrict each split to the selected columns, in the same order for all three.
train_data_selected, dev_data_selected, test_data_selected = (
    frame.filter(items=cols_selected)
    for frame in (mini_train_data, mini_dev_data, mini_test_data)
)

In [18]:
def score_tree(train_data, train_labels, dev_data, dev_labels, random_state=0):
    """Fit a decision tree on the training split and score it on the dev split.

    Parameters
    ----------
    train_data, train_labels
        Features and targets used to fit the tree.
    dev_data, dev_labels
        Features and targets used to measure accuracy.
    random_state : int or None, default 0
        Passed to ``DecisionTreeClassifier``. A fixed integer makes repeated
        calls on the same data return the same accuracy, so the scores
        compared during feature selection are reproducible. Pass ``None``
        for an unseeded tree.

    Returns
    -------
    float
        Accuracy of the fitted tree's predictions on ``dev_data`` measured
        against ``dev_labels``.
    """
    dtc = DecisionTreeClassifier(random_state=random_state)
    dtc.fit(X=train_data, y=train_labels)

    # predict and score on the dev set
    pred = dtc.predict(dev_data)
    return metrics.accuracy_score(y_true=dev_labels, y_pred=pred)


In [143]:
# Fit a decision tree on the selected training columns and report its
# accuracy on the dev split as a percentage rounded to 4 decimals.
print('using decision tree with final selected columns ...', end=" ")
acc = score_tree(train_data_selected, mini_train_labels, dev_data_selected, mini_dev_labels)
print('accuracy on dev set =', round(100*acc,4), "% ")


using decision tree with final selected columns ... accuracy on dev set = 59.1517 % 


In [108]:
# Same evaluation as on the dev split, but scored against the test split.
# score_tree fits a fresh tree on the training data on every call.
acc = score_tree(train_data_selected, mini_train_labels, test_data_selected, mini_test_labels)
print('accuracy on test set =', round(100*acc,4), "% ")


accuracy on test set = 59.9453 % 


In [109]:
# 2-fold cross-validated accuracy of a default decision tree on the selected
# training columns; the resulting array of per-fold scores is the cell output.
dtc = DecisionTreeClassifier()
cross_val_score(dtc, train_data_selected, mini_train_labels, cv=2, scoring='accuracy')

array([0.59999616, 0.59711129])