In [13]:
# Environment setup and module import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier




In [14]:
def load_encoded_data(filename):
    """Read ``data/<filename>_encoded.csv`` and split off the target column.

    Parameters
    ----------
    filename : str
        File stem; ``'data/'`` is prepended and ``'_encoded.csv'`` appended.

    Returns
    -------
    tuple
        ``(features, labels)``. ``features`` is the loaded frame without its
        ``HasDetections`` column. ``labels`` is that column as a 1-D array
        with one entry per row, or ``None`` when the file has no
        ``HasDetections`` column.
    """
    frame = pd.read_csv('data/' + filename + '_encoded.csv')

    # Files without the target column are returned untouched.
    if 'HasDetections' not in frame.columns:
        return frame, None

    target = frame['HasDetections'].to_numpy()
    frame.drop(columns=['HasDetections'], inplace=True)
    return frame, target

In [15]:
# Read the three pre-encoded "mini" splits through load_encoded_data, printing
# a progress line before each read. Every call yields a (features, labels)
# pair; labels is None for a file without a HasDetections column.
print("loading mini train ...")
mini_train_data, mini_train_labels = load_encoded_data('mini_train')
print("loading mini dev ...")
mini_dev_data, mini_dev_labels = load_encoded_data('mini_dev')
print("loading mini test ...")
mini_test_data, mini_test_labels = load_encoded_data('mini_test')



loading mini train ...
loading mini dev ...
loading mini test ...


In [16]:
def find_correlations(data, labels):
    """Rank the columns of ``data`` by their correlation with ``labels``.

    Columns holding a single distinct value are skipped. For every other
    column the correlation coefficient against ``labels`` is read from the
    off-diagonal entry of ``np.corrcoef``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns, one row per entry in ``labels``.
    labels : array-like
        Target values.

    Returns
    -------
    pandas.DataFrame
        Columns ``col`` (column name), ``abscorr`` (absolute coefficient) and
        ``corr`` (signed coefficient), sorted by ``abscorr`` descending.
    """
    n_rows = len(labels)
    rows = []
    for name in data.columns:
        series = data[name]
        if len(series.unique()) <= 1:
            continue
        coefficient = np.corrcoef(series.values.reshape(n_rows), labels)[0][1]
        rows.append([name, np.abs(coefficient), coefficient])

    ranking = pd.DataFrame(rows, columns=['col', 'abscorr', 'corr'])
    return ranking.sort_values('abscorr', ascending=False)

In [17]:
# Rank the training columns by correlation with the label, show the table,
# and keep the column names (strongest first) as a flat array.
corr = find_correlations(data=mini_train_data, labels=mini_train_labels)
display(corr)
corr_cols = corr['col'].values.reshape(-1)


Unnamed: 0,col,abscorr,corr
13,SmartScreen,0.193585,-0.193585
103,AVProductsInstalled,0.144179,-0.144179
123,AVProductStatesIdentifier,0.118651,0.118651
67,OsBuildLab_platform,0.074611,-0.074611
7,Processor,0.074339,-0.074339
...,...,...,...
113,Census_OSInstallLanguageIdentifier,0.000499,0.000499
0,MachineIdentifier,0.000450,-0.000450
75,Census_IsSecureBootEnabled,0.000421,0.000421
83,Census_OSUILocaleIdentifier,0.000375,0.000375


In [18]:
def score_log_reg(train_data, train_labels, dev_data, dev_labels):
    """Train a logistic regression and return its accuracy on a second split.

    The model uses the ``lbfgs`` solver with at most 150 iterations. It is
    fitted on ``train_data`` / ``train_labels``; predictions for ``dev_data``
    are then scored against ``dev_labels`` with ``metrics.accuracy_score``.
    """
    classifier = LogisticRegression(max_iter=150, solver='lbfgs')
    classifier.fit(train_data, train_labels)

    predictions = classifier.predict(dev_data)
    return metrics.accuracy_score(dev_labels, predictions)


In [27]:
def score_tree(train_data, train_labels, dev_data, dev_labels,
               random_state=None, verbose=True):
    """Fit a decision tree on one split and return its accuracy on another.

    Parameters
    ----------
    train_data, train_labels
        Features and targets passed to ``DecisionTreeClassifier.fit``.
    dev_data, dev_labels
        Features to predict on and the true targets to score against.
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. The default ``None`` keeps
        the original unseeded behaviour; pass an int so that repeated fits on
        the same data can be compared.
    verbose : bool, optional
        When true (the default, matching the original behaviour) print the
        first 30 predictions and the first 30 true labels, framed by blank
        lines. Pass ``False`` when calling in a loop to keep the output short.

    Returns
    -------
    float
        ``metrics.accuracy_score`` of the predictions against ``dev_labels``.
    """
    dtc = DecisionTreeClassifier(random_state=random_state)
    dtc.fit(X=train_data, y=train_labels)

    # predict and score on the dev set
    pred = dtc.predict(dev_data)
    if verbose:
        # Side-by-side peek at predictions vs. truth for a quick sanity check.
        print()
        print("pred:", pred[:30])
        print("true:", dev_labels[:30])
        print()
    return metrics.accuracy_score(y_true=dev_labels, y_pred=pred)


In [20]:
# Greedy forward selection: walk the columns in the order given by corr_cols
# (strongest absolute correlation first) and keep a column only when adding it
# lifts dev-set accuracy above the best score seen so far.
print("using only top n correlated columns")
print("fitting and scoring decision trees ...")
print()

cols = []
best = 0
# Iterate over every ranked column rather than a hard-coded count, so the loop
# can neither index past the end of corr_cols nor skip trailing columns.
for n, candidate in enumerate(corr_cols, start=1):
    print(n, end=" ")
    print('adding column', candidate, '...', end=' ')
    cols.append(candidate)
    data = mini_train_data.filter(cols)
    dev_data = mini_dev_data.filter(cols)
    acc = score_tree(data, mini_train_labels, dev_data, mini_dev_labels)
    print('accuracy =', acc, end=" ")
    if acc > best:
        print("keeping this column")
        best = acc
    else:
        # The candidate was appended last, so pop() discards exactly it.
        cols.pop()
        print()

# Emit the surviving columns as a Python list literal.
print("cols_selected =[")
for col in cols:
    print('    \''+col+'\',')
print("]")



using only top n correlated columns
fitting and scoring logistic regressions ...

1 adding column SmartScreen ... accuracy = 0.5741806279983859 keeping this column
2 adding column AVProductsInstalled ... accuracy = 0.5828488589320142 keeping this column
3 adding column AVProductStatesIdentifier ... accuracy = 0.5883337567813962 keeping this column
4 adding column OsBuildLab_platform ... accuracy = 0.6006037871202045 keeping this column
5 adding column Processor ... accuracy = 0.6005440062172139 
6 adding column Census_OSArchitecture ... accuracy = 0.600499170539971 
7 adding column AppVersion_build1 ... accuracy = 0.6085397019921986 keeping this column
8 adding column Census_IsAlwaysOnAlwaysConnectedCapable ... accuracy = 0.6123058988806026 keeping this column
9 adding column EngineVersion_build2 ... accuracy = 0.6128439270075174 keeping this column
10 adding column IsProtected ... accuracy = 0.6127841461045269 
11 adding column Census_TotalPhysicalRAM ... accuracy = 0.6116483089477066

103 adding column CityIdentifier_wasna ... accuracy = 0.6177609062784893 
104 adding column Platform ... accuracy = 0.6194796072394674 
105 adding column Census_OEMNameIdentifier_wasna ... accuracy = 0.6194646620137197 
106 adding column Census_OEMModelIdentifier_wasna ... accuracy = 0.6192105931760099 
107 adding column OsBuildLab_release ... accuracy = 0.6122610632033597 
108 adding column OsVer_combined ... accuracy = 0.6196440047226913 
109 adding column OsVer_major ... accuracy = 0.6199429092376441 
110 adding column GeoNameIdentifier ... accuracy = 0.5989747575137122 
111 adding column OsVer_build1 ... accuracy = 0.619658949948439 
112 adding column UacLuaenable_wasna ... accuracy = 0.6197486213029247 
113 adding column OsVer ... accuracy = 0.6198084022059153 
114 adding column GeoNameIdentifier_wasna ... accuracy = 0.61977851175442 
115 adding column Census_IsFlightsDisabled ... accuracy = 0.61977851175442 
116 adding column AvSigVersion_major ... accuracy = 0.6196141142711961 


In [21]:
# Hard-coded feature subset used by the cells below. The literal has the same
# layout that the forward-selection cell prints, so it was presumably pasted
# from one run of that cell; a re-run may produce a different list
# (TODO confirm whether it should be regenerated).
cols_selected =[
    'SmartScreen',
    'AVProductsInstalled',
    'AVProductStatesIdentifier',
    'OsBuildLab_platform',
    'AppVersion_build1',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'EngineVersion_build2',
    'Census_ProcessorCoreCount',
    'EngineVersion_build1',
    'Census_IsVirtualDevice',
    'EngineVersion',
    'SMode_wasna',
    'IsSxsPassiveMode',
    'Census_InternalBatteryNumberOfCharges_wasna',
    'Census_ProcessorManufacturerIdentifier_wasna',
    'Census_ProcessorCoreCount_wasna',
    'Wdft_IsGamer_wasna',
    'Wdft_RegionIdentifier_wasna',
    'ProductName',
    'Firewall_wasna',
    'Census_IsVirtualDevice_wasna',
]

# Restrict each split to the chosen columns. DataFrame.filter keeps only the
# listed labels that exist, so a column absent from a split is dropped
# silently instead of raising.
train_data_selected = mini_train_data.filter(cols_selected)
dev_data_selected = mini_dev_data.filter(cols_selected)
test_data_selected = mini_test_data.filter(cols_selected)

In [28]:
# Fit one tree on the hand-picked feature subset and report dev-set accuracy
# as a percentage rounded to four decimals.
print('using decision tree with final selected columns ...', end=" ")
acc = score_tree(
    train_data=train_data_selected,
    train_labels=mini_train_labels,
    dev_data=dev_data_selected,
    dev_labels=mini_dev_labels,
)
print('accuracy on dev set =', round(acc * 100, 4), "% ")



using decision tree with final selected columns ... 
pred: [1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0]
true: [0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0]

accuracy on dev set = 62.0048 % 


In [23]:
# Same recipe scored on the test split: score_tree's dev_* parameters receive
# the test features and labels here.
acc = score_tree(train_data_selected, mini_train_labels,
                 dev_data=test_data_selected, dev_labels=mini_test_labels)
print('accuracy on test set =', round(acc * 100, 4), "% ")


accuracy on test set = 62.0302 % 


In [24]:
# 10-fold cross-validated accuracy of a default decision tree on the selected
# training features; the trailing expression is displayed as the cell output.
dtc = DecisionTreeClassifier()
cross_val_score(estimator=dtc, X=train_data_selected, y=mini_train_labels,
                scoring='accuracy', cv=10)

array([0.61532057, 0.61546837, 0.61755004, 0.61860689, 0.62043235,
       0.61316253, 0.61527622, 0.61357886, 0.61450761, 0.61815853])