In [1]:
%matplotlib inline

# Import a bunch of libraries.
import time
from os import path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

# some PCA and GMM stuff
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm

import sklearn.ensemble as ske
from sklearn.feature_selection import SelectFromModel

In [2]:
dtypes = {
        'MachineIdentifier':                                    'str',
        'ProductName':                                          'str',
        'EngineVersion':                                        'str',
        'AppVersion':                                           'str',
        'AvSigVersion':                                         'str',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float64',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', 
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', 
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', 
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', 
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', 
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', 
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', 
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', 
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', 
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [3]:
if not (path.exists("data/mini_initial.csv")):
    train_df = pd.read_csv("data/train.csv", dtype=dtypes)
    train_labels = train_df['HasDetections']
    train_ids = train_df['MachineIdentifier']
    train_data = train_df.drop(columns=['HasDetections', 'MachineIdentifier']) 
    df = train_data.sample(100000, random_state=123)
    df.to_csv("data/mini_initial.csv")
    print(len(train_data))
else:
    df = pd.read_csv("data/mini_initial.csv", dtype=dtypes)
    
display(df.head())



Unnamed: 0.1,Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier
0,3082108,win8defender,1.1.15100.1,4.18.1806.18062,1.273.520.0,0,7.0,0,,53447.0,...,628.0,27767.0,1,,0.0,0,0,0.0,1.0,15.0
1,3553333,win8defender,1.1.15200.1,4.18.1807.18075,1.275.453.0,0,7.0,0,,7945.0,...,500.0,14353.0,0,,0.0,0,0,0.0,0.0,10.0
2,6768315,win8defender,1.1.15100.1,4.18.1807.18075,1.273.689.0,0,7.0,0,,53447.0,...,628.0,8941.0,1,,0.0,0,0,0.0,1.0,1.0
3,7098147,win8defender,1.1.15200.1,4.12.16299.15,1.275.483.0,0,7.0,0,,68585.0,...,142.0,46589.0,1,,0.0,0,0,0.0,1.0,7.0
4,716616,win8defender,1.1.15200.1,4.18.1807.18075,1.275.215.0,0,7.0,0,,53447.0,...,142.0,52530.0,0,,0.0,0,0,0.0,,


In [87]:
#
# load initial mini set to work on cleaning:
#

df = pd.read_csv("data/mini_initial.csv", dtype=dtypes) 


In [88]:
# import math
# #
# # deal with obvious NaNs
# #

# for col in df.columns:
#     if str(df[col].dtype).startswith("float") and df[col].hasnans:
#         print ("df['"+col+"'] = df['"+col+"'].astype(np.int"+("8" if len(df[col].unique())<4 else "16")+")")



#
# first, keep the information that some things were nans
#
df["Rtp_NaN"] = df["RtpStateBitfield"].isna()


#
# add categories where necessary so we can replace the NaNs
#

df["PuaMode"].cat.add_categories(["unknown"], inplace=True)
df["SmartScreen"].cat.add_categories(["unknown"], inplace=True)
df["Census_ProcessorClass"].cat.add_categories(["unknown"], inplace=True)
df["Census_PrimaryDiskTypeName"].cat.add_categories(["unknown"], inplace=True)
df["Census_ChassisTypeName"].cat.add_categories(["unknown"], inplace=True)
df["Census_InternalBatteryType"].cat.add_categories(["unknown"], inplace=True)


#
# Then, some more explicit fill choices
#
nanfill = {
    "RtpStateBitfield":0,
    "DefaultBrowsersIdentifier":0,
    'AVProductStatesIdentifier':0,
    'AVProductsInstalled':-1,
    'AVProductsEnabled':-1,
    'CityIdentifier':-1,
    'OrganizationIdentifier':-1,
    'GeoNameIdentifier':-1,
    'IsProtected':-1,
    'SMode':-1,
    'IeVerIdentifier':-1,
    'PuaMode':'unknown',
    'SmartScreen':'unknown',
    'Firewall':-1,
    'UacLuaenable':-1,
    'Census_OEMNameIdentifier':-1,
    'Census_OEMModelIdentifier':-1,
    'Census_ProcessorCoreCount':-1,
    'Census_ProcessorManufacturerIdentifier':-1,
    'Census_ProcessorModelIdentifier':-1,
    'Census_PrimaryDiskTotalCapacity':-1,
    'Census_SystemVolumeTotalCapacity':-1,
    'Census_TotalPhysicalRAM':-1,
    'Census_InternalPrimaryDiagonalDisplaySizeInInches':-1,
    'Census_InternalPrimaryDisplayResolutionHorizontal':-1,
    'Census_InternalPrimaryDisplayResolutionVertical':-1,
    'Census_InternalBatteryNumberOfCharges':-1,
    'Census_OSInstallLanguageIdentifier':-1,
    'Census_IsFlightingInternal':-1,
    'Census_IsFlightsDisabled':-1,
    'Census_ThresholdOptIn':-1,
    'Census_FirmwareManufacturerIdentifier':-1,
    'Census_FirmwareVersionIdentifier':-1,
    'Census_IsWIMBootEnabled':-1,
    'Census_IsVirtualDevice':-1,
    'Census_IsAlwaysOnAlwaysConnectedCapable':-1,
    'Wdft_IsGamer':-1,
    'Wdft_RegionIdentifier':-1,
    'Census_ProcessorClass':'unknown',
    'Census_PrimaryDiskTypeName':'unknown',
    'Census_ChassisTypeName':'unknown',
    'Census_InternalBatteryType':'unknown',
}

df = df.fillna(value=nanfill)

In [89]:
#
# then some of these columns can become ints, not floats
#

df["DefaultBrowsersIdentifier"] = df["DefaultBrowsersIdentifier"].astype(np.int64)
df['AVProductStatesIdentifier'] = df['AVProductStatesIdentifier'].astype(np.int64)
df['AVProductsInstalled'] = df['AVProductsInstalled'].astype(np.int8)
df['AVProductsEnabled'] = df['AVProductsEnabled'].astype(np.int8)
df['CityIdentifier'] = df['CityIdentifier'].astype(np.int64)
df['OrganizationIdentifier'] = df['OrganizationIdentifier'].astype(np.int64)
df['GeoNameIdentifier'] = df['GeoNameIdentifier'].astype(np.int64)
df['IsProtected'] = df['IsProtected'].astype(np.int8)
df['SMode'] = df['SMode'].astype(np.int8)
df['IeVerIdentifier'] = df['IeVerIdentifier'].astype(np.int8)
df['Firewall'] = df['Firewall'].astype(np.int8)
df['UacLuaenable'] = df['UacLuaenable'].astype(np.int8)
df['Census_OEMNameIdentifier'] = df['Census_OEMNameIdentifier'].astype(np.int8)
df['Census_OEMModelIdentifier'] = df['Census_OEMModelIdentifier'].astype(np.int16)
df['Census_ProcessorCoreCount'] = df['Census_ProcessorCoreCount'].astype(np.int16)
df['Census_ProcessorManufacturerIdentifier'] = df['Census_ProcessorManufacturerIdentifier'].astype(np.int16)
df['Census_ProcessorModelIdentifier'] = df['Census_ProcessorModelIdentifier'].astype(np.int16)
df['Census_PrimaryDiskTotalCapacity'] = df['Census_PrimaryDiskTotalCapacity'].astype(np.int16)
df['Census_SystemVolumeTotalCapacity'] = df['Census_SystemVolumeTotalCapacity'].astype(np.int16)
df['Census_TotalPhysicalRAM'] = df['Census_TotalPhysicalRAM'].astype(np.int16)
df['Census_InternalPrimaryDiagonalDisplaySizeInInches'] = df['Census_InternalPrimaryDiagonalDisplaySizeInInches'].astype(np.int16)
df['Census_InternalPrimaryDisplayResolutionHorizontal'] = df['Census_InternalPrimaryDisplayResolutionHorizontal'].astype(np.int16)
df['Census_InternalPrimaryDisplayResolutionVertical'] = df['Census_InternalPrimaryDisplayResolutionVertical'].astype(np.int16)
df['Census_InternalBatteryNumberOfCharges'] = df['Census_InternalBatteryNumberOfCharges'].astype(np.int16)
df['Census_OSInstallLanguageIdentifier'] = df['Census_OSInstallLanguageIdentifier'].astype(np.int16)
df['Census_IsFlightingInternal'] = df['Census_IsFlightingInternal'].astype(np.int8)
df['Census_IsFlightsDisabled'] = df['Census_IsFlightsDisabled'].astype(np.int8)
df['Census_ThresholdOptIn'] = df['Census_ThresholdOptIn'].astype(np.int8)
df['Census_FirmwareManufacturerIdentifier'] = df['Census_FirmwareManufacturerIdentifier'].astype(np.int16)
df['Census_FirmwareVersionIdentifier'] = df['Census_FirmwareVersionIdentifier'].astype(np.int16)
df['Census_IsWIMBootEnabled'] = df['Census_IsWIMBootEnabled'].astype(np.int8)
df['Census_IsVirtualDevice'] = df['Census_IsVirtualDevice'].astype(np.int8)
df['Census_IsAlwaysOnAlwaysConnectedCapable'] = df['Census_IsAlwaysOnAlwaysConnectedCapable'].astype(np.int8)
df['Wdft_IsGamer'] = df['Wdft_IsGamer'].astype(np.int8)
df['Wdft_RegionIdentifier'] = df['Wdft_RegionIdentifier'].astype(np.int16)


In [90]:
#
# deal with version numbers
#

def map_version(df, col):
    df[col+'_major'] = df[col].map(lambda x: np.fromstring(x, sep=".", dtype=np.uint8)[0])
    df[col+'_minor'] = df[col].map(lambda x: np.fromstring(x, sep=".", dtype=np.uint8)[1])
    df[col+'_build1'] = df[col].map(lambda x: np.fromstring(x, sep=".", dtype=np.uint16)[2])
    df[col+'_build2'] = df[col].map(lambda x: np.fromstring(x, sep=".", dtype=np.uint16)[2])
    return df.drop(columns=[col])


df = map_version(df, "EngineVersion")
df = map_version(df, "AppVersion")
df = map_version(df, "AvSigVersion")

In [91]:
#
# encode the values of RtpStateBitfield (NaN, 0, 1, 3, 5, 7, 8, 35) as indicators
#

df["Rtp_0"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&1 else 0).astype(np.uint8)
df["Rtp_1"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&2 else 0).astype(np.uint8)
df["Rtp_2"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&4 else 0).astype(np.uint8)
df["Rtp_3"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&8 else 0).astype(np.uint8)
df["Rtp_4"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&16 else 0).astype(np.uint8)
df["Rtp_5"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&32 else 0).astype(np.uint8)
df = df.drop(columns=["RtpStateBitfield"])

In [92]:
df.head()

Unnamed: 0.1,Unnamed: 0,ProductName,IsBeta,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,...,AvSigVersion_major,AvSigVersion_minor,AvSigVersion_build1,AvSigVersion_build2,Rtp_0,Rtp_1,Rtp_2,Rtp_3,Rtp_4,Rtp_5
0,3082108,win8defender,0,0,0,53447,1,1,1,65,...,1,17,520,520,1,1,1,0,0,0
1,3553333,win8defender,0,0,0,7945,2,1,1,29,...,1,19,453,453,1,1,1,0,0,0
2,6768315,win8defender,0,0,0,53447,1,1,1,158,...,1,17,689,689,1,1,1,0,0,0
3,7098147,win8defender,0,0,0,68585,2,1,1,43,...,1,19,483,483,1,1,1,0,0,0
4,716616,win8defender,0,0,0,53447,1,1,1,107,...,1,19,215,215,1,1,1,0,0,0


In [93]:
#
# save
#

df.to_csv("data/mini_cleaned.csv")

In [94]:
for col in df.columns:
    if  df[col].hasnans:
        print (col)
