In [1]:
# Environment setup and module import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier




In [2]:
def load_encoded_data(filename, data_dir='data'):
    """Load an encoded CSV split and separate its features from its labels.

    Reads ``<data_dir>/<filename>_encoded.csv``. When the file contains a
    ``HasDetections`` column, that column is removed from the returned
    features and handed back separately as the label vector.

    Parameters
    ----------
    filename : str
        Split name without the ``_encoded.csv`` suffix (e.g. ``'mini_train'``).
    data_dir : str, optional
        Directory that holds the encoded CSV files. Defaults to ``'data'``,
        the location that was previously hard-coded.

    Returns
    -------
    data : pandas.DataFrame
        Every column of the file except ``HasDetections``.
    labels : numpy.ndarray or None
        1-D array of the ``HasDetections`` values, or ``None`` when the file
        has no such column.
    """
    data = pd.read_csv('{}/{}_encoded.csv'.format(data_dir, filename))
    if 'HasDetections' not in data.columns:
        return data, None

    # A Series' values are already 1-D, so no DataFrame round-trip or reshape
    # is needed to obtain the label vector.
    labels = data['HasDetections'].values
    # Return a new frame instead of mutating the loaded one in place.
    return data.drop(columns=['HasDetections']), labels

In [3]:
def _load_with_banner(banner, split_name):
    """Print a progress line, then load the named encoded split."""
    print(banner)
    return load_encoded_data(split_name)


# Each call yields a (features, labels) pair for one of the mini splits.
mini_train_data, mini_train_labels = _load_with_banner("loading mini train ...", 'mini_train')
mini_dev_data, mini_dev_labels = _load_with_banner("loading mini dev ...", 'mini_dev')
mini_test_data, mini_test_labels = _load_with_banner("loading mini test ...", 'mini_test')



loading mini train ...
loading mini dev ...
loading mini test ...


In [4]:
def find_correlations(data, labels):
    """Rank the non-constant columns of ``data`` by correlation with ``labels``.

    Every column holding more than one distinct value is correlated against
    ``labels`` with ``np.corrcoef``; columns with a single distinct value are
    left out of the result.

    Returns a DataFrame with the columns ``col`` (column name), ``abscorr``
    (absolute correlation) and ``corr`` (signed correlation), sorted by
    ``abscorr`` from largest to smallest.
    """
    def _row(name):
        # The off-diagonal entry of the 2x2 correlation matrix is the
        # coefficient between this column and the labels.
        coefficient = np.corrcoef(data[name].values.reshape(len(labels)), labels)[0][1]
        return [name, np.abs(coefficient), coefficient]

    varying = (name for name in data.columns if len(data[name].unique()) > 1)
    table = pd.DataFrame([_row(name) for name in varying],
                         columns=['col', 'abscorr', 'corr'])
    return table.sort_values('abscorr', ascending=False)

In [5]:
# Correlate every training column with the labels and show the ranked table.
corr = find_correlations(mini_train_data, mini_train_labels)
display(corr)

# Column names only, in the same order as the ranked table (strongest first).
corr_cols = corr['col'].values.reshape(corr.shape[0])


Unnamed: 0,col,abscorr,corr
13,SmartScreen,0.193585,-0.193585
103,AVProductsInstalled,0.144179,-0.144179
123,AVProductStatesIdentifier,0.118651,0.118651
67,OsBuildLab_platform,0.074611,-0.074611
7,Processor,0.074339,-0.074339
...,...,...,...
113,Census_OSInstallLanguageIdentifier,0.000499,0.000499
0,MachineIdentifier,0.000450,-0.000450
75,Census_IsSecureBootEnabled,0.000421,0.000421
83,Census_OSUILocaleIdentifier,0.000375,0.000375


In [6]:
def score_log_reg(train_data, train_labels, dev_data, dev_labels):
    """Fit a logistic regression on the training split and return dev accuracy.

    The model uses the ``lbfgs`` solver with at most 150 iterations. The
    returned value is the accuracy of its predictions for ``dev_data``
    measured against ``dev_labels``.
    """
    model = LogisticRegression(solver='lbfgs', max_iter=150).fit(train_data, train_labels)
    dev_predictions = model.predict(dev_data)
    return metrics.accuracy_score(dev_labels, dev_predictions)


In [7]:
def score_tree(train_data, train_labels, dev_data, dev_labels, random_state=0):
    """Fit a decision tree on the training split and return dev accuracy.

    Parameters
    ----------
    train_data, train_labels
        Features and labels the tree is fitted on.
    dev_data, dev_labels
        Features to predict and the true labels to score against.
    random_state : int or None, optional
        Seed forwarded to ``DecisionTreeClassifier``. Defaults to ``0`` so
        that repeated calls on the same inputs give the same score; pass
        ``None`` to get an unseeded tree.

    Returns
    -------
    float
        Accuracy of the fitted tree's predictions on ``dev_data``.
    """
    # An unseeded tree may break ties between equally good splits differently
    # from run to run, which makes accuracy comparisons between calls noisy.
    dtc = DecisionTreeClassifier(random_state=random_state)
    dtc.fit(X=train_data, y=train_labels)

    # predict and score on the dev set
    pred = dtc.predict(dev_data)
    return metrics.accuracy_score(y_true=dev_labels, y_pred=pred)


In [8]:
print("using only top n correlated columns")
# The loop below scores with score_tree, so the banner names decision trees.
print("fitting and scoring decision trees ...")
print()

# Greedy forward selection: walk the columns from most to least correlated,
# tentatively add each one, and keep it only if the dev-set accuracy beats the
# best accuracy seen so far.
cols = []
best = 0
# Iterate over every ranked column rather than a hard-coded count, so the loop
# neither overruns a shorter ranking nor ignores the tail of a longer one.
for n, candidate in enumerate(corr_cols, start=1):
    print(n, end=" ")
    print('adding column', candidate, '...', end=' ')
    cols.append(candidate)
    data = mini_train_data.filter(cols)
    dev_data = mini_dev_data.filter(cols)
    acc = score_tree(data, mini_train_labels, dev_data, mini_dev_labels)
    print('accuracy =', acc, end=" ")
    if acc > best:
        print("keeping this column")
        best = acc
    else:
        # The candidate is the last element appended; drop exactly that one.
        cols.pop()
        print()

# Emit the kept columns as a Python list literal that can be pasted into a cell.
print("cols_selected =[")
for col in cols:
    print('    \''+col+'\',')
print("]")



using only top n correlated columns
fitting and scoring logistic regressions ...

1 adding column SmartScreen ... accuracy = 0.5760039455395973 keeping this column
2 adding column AVProductsInstalled ... accuracy = 0.5849262453109354 keeping this column
3 adding column AVProductStatesIdentifier ... accuracy = 0.5879302356862101 keeping this column
4 adding column OsBuildLab_platform ... accuracy = 0.5996024569951129 keeping this column
5 adding column Processor ... accuracy = 0.5996024569951129 
6 adding column Census_OSArchitecture ... accuracy = 0.5995127856406272 
7 adding column AppVersion_build1 ... accuracy = 0.6067612201282301 keeping this column
8 adding column Census_IsAlwaysOnAlwaysConnectedCapable ... accuracy = 0.6097801557292523 keeping this column
9 adding column EngineVersion_build2 ... accuracy = 0.6109757737890631 keeping this column
10 adding column IsProtected ... accuracy = 0.6109757737890631 
11 adding column Census_TotalPhysicalRAM ... accuracy = 0.611334459207006

104 adding column Platform ... accuracy = 0.6149960395151769 
105 adding column Census_OEMNameIdentifier_wasna ... accuracy = 0.6148316420319528 
106 adding column Census_OEMModelIdentifier_wasna ... accuracy = 0.6143832852595238 
107 adding column OsBuildLab_release ... accuracy = 0.6087190447011702 
108 adding column OsVer_combined ... accuracy = 0.6151454917726532 
109 adding column OsVer_major ... accuracy = 0.615235163127139 
110 adding column GeoNameIdentifier ... accuracy = 0.5930713933433964 
111 adding column OsVer_build1 ... accuracy = 0.6150408751924198 
112 adding column UacLuaenable_wasna ... accuracy = 0.614741970677467 
113 adding column OsVer ... accuracy = 0.6153098892558773 
114 adding column GeoNameIdentifier_wasna ... accuracy = 0.6148914229349435 
115 adding column Census_IsFlightsDisabled ... accuracy = 0.6146971350002242 
116 adding column AvSigVersion_major ... accuracy = 0.6152501083528866 
117 adding column Census_FlightRing ... accuracy = 0.6141591068733093 


In [13]:
# Columns used for the final model, listed one per line.
cols_selected = [
    'SmartScreen',
    'AVProductsInstalled',
    'AVProductStatesIdentifier',
    'OsBuildLab_platform',
    'AppVersion_build1',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'EngineVersion_build2',
    'Census_TotalPhysicalRAM',
    'Census_IsVirtualDevice',
    'AppVersion_minor',
    'RtpStateBitfield',
    'SMode_wasna',
    'IsSxsPassiveMode',
    'AVProductsEnabled',
    'Census_FirmwareVersionIdentifier_wasna',
    'Census_IsPenCapable',
    'Wdft_IsGamer_wasna',
    'Census_PrimaryDiskTotalCapacity_wasna',
    'Census_IsVirtualDevice_wasna',
]

# Restrict each split to the selected columns, in the order listed above.
train_data_selected, dev_data_selected, test_data_selected = (
    split.filter(items=cols_selected)
    for split in (mini_train_data, mini_dev_data, mini_test_data)
)

In [14]:
# Train on the selected training columns and report accuracy on the dev split.
print('using decision tree with final selected columns ...', end=" ")
acc = score_tree(
    train_data=train_data_selected,
    train_labels=mini_train_labels,
    dev_data=dev_data_selected,
    dev_labels=mini_dev_labels,
)
print('accuracy on dev set =', round(100*acc,4), "% ")


using decision tree with final selected columns ... accuracy on dev set = 61.5385 % 


In [15]:
# score_tree fits a new tree on every call, so this trains again on the
# selected training columns and scores that tree on the test split.
acc = score_tree(
    train_data=train_data_selected,
    train_labels=mini_train_labels,
    dev_data=test_data_selected,
    dev_labels=mini_test_labels,
)
print('accuracy on test set =', round(100*acc,4), "% ")


accuracy on test set = 61.386 % 


In [17]:
# 10-fold cross-validated accuracy of a decision tree on the selected training
# columns. The tree is seeded so the per-fold scores are the same on every run.
dtc = DecisionTreeClassifier(random_state=0)
cross_val_score(dtc, train_data_selected, mini_train_labels, cv=10, scoring='accuracy')

array([0.60923589, 0.61508407, 0.61396317, 0.61463571, 0.61617294,
       0.6101201 , 0.6128743 , 0.61194556, 0.61072858, 0.6152442 ])