In [0]:
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import sklearn.ensemble as ske
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

# some PCA and GMM stuff
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm

import sklearn.ensemble as ske

In [0]:
train_file = 'https://people.ischool.berkeley.edu/~andrew.morris/data/w209/mini_train.csv'
test_file = 'https://people.ischool.berkeley.edu/~andrew.morris/data/w209/mini_test.csv'
dev_file = 'https://people.ischool.berkeley.edu/~andrew.morris/data/w209/mini_dev.csv'

In [0]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', 
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', 
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', 
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', 
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', 
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', 
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', 
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', 
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', 
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [36]:
train_df = pd.read_csv(train_file, dtype=dtypes)
dev_df = pd.read_csv(dev_file, dtype=dtypes)
test_df = pd.read_csv(test_file, dtype=dtypes)

train_df.head()

Unnamed: 0.1,Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,AutoSampleOptIn,PuaMode,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,...,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightingInternal,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,3082108,586d40804b950d0376575fdf10ee89ae,win8defender,1.1.15100.1,4.18.1806.18062,1.273.520.0,0,7.0,0,,53447.0,1.0,1.0,1,65,148451.0,,192.0,162,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,0,,0.0,137.0,,1.0,1.0,Notebook,Windows.Desktop,2668.0,171197.0,4.0,...,HDD,912078.0,0,8192.0,Notebook,17.200001,1600.0,900.0,Mobile,,0.0,10.0.17134.165,amd64,rs4_release,17134,165,Core,CORE,Reset,23.0,105,FullAuto,0,IS_GENUINE,Retail,,0.0,Retail,,628.0,27767.0,1,,0.0,0,0,0.0,1.0,15.0,1
1,3553333,65fb3fae2d37f90e6b3174592f2490a8,win8defender,1.1.15200.1,4.18.1807.18075,1.275.453.0,0,7.0,0,,7945.0,2.0,1.0,1,29,143155.0,27.0,35.0,171,windows10,x64,10.0.0.0,16299,256,rs3,16299.431.amd64fre.rs3_release_svc_escrow.1805...,Pro,1.0,0,,0.0,117.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,2206.0,249972.0,4.0,...,SSD,113121.0,0,4096.0,Notebook,14.0,1280.0,1024.0,Mobile,,0.0,10.0.16299.611,amd64,rs3_release_svc_escrow,16299,611,Professional,PROFESSIONAL,UUPUpgrade,26.0,119,AutoInstallAndRebootAtMaintenanceTime,0,IS_GENUINE,Retail,,0.0,Retail,,500.0,14353.0,0,,0.0,0,0,0.0,0.0,10.0,0
2,6768315,c23aa37fb69e00afe2668ed150dee1ea,win8defender,1.1.15100.1,4.18.1807.18075,1.273.689.0,0,7.0,0,,53447.0,1.0,1.0,1,158,109370.0,27.0,202.0,70,windows10,x64,10.0.0.0,16299,768,rs3,16299.431.amd64fre.rs3_release_svc_escrow.1805...,Home,1.0,0,,0.0,117.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,2668.0,171356.0,4.0,...,HDD,913414.0,0,4096.0,Notebook,14.0,1366.0,768.0,Mobile,,0.0,10.0.16299.551,amd64,rs3_release_svc_escrow,16299,551,CoreSingleLanguage,CORE_SINGLELANGUAGE,Upgrade,8.0,31,Notify,0,IS_GENUINE,OEM:DM,,0.0,Retail,,628.0,8941.0,1,,0.0,0,0,0.0,1.0,1.0,1
3,7098147,cba75d6c4d9b6533591e94b9cb8a5df5,win8defender,1.1.15200.1,4.12.16299.15,1.275.483.0,0,7.0,0,,68585.0,2.0,1.0,1,43,29575.0,18.0,53.0,42,windows10,x64,10.0.0.0,16299,768,rs3,16299.15.amd64fre.rs3_release.170928-1534,Home,1.0,0,,0.0,111.0,,1.0,1.0,Notebook,Windows.Desktop,2903.0,331590.0,12.0,...,SSD,121087.0,0,8192.0,Notebook,15.5,1920.0,1080.0,Mobile,,0.0,10.0.16299.15,amd64,rs3_release,16299,15,CoreCountrySpecific,CORE_COUNTRYSPECIFIC,IBSClean,37.0,158,UNKNOWN,0,IS_GENUINE,OEM:DM,,0.0,Retail,,142.0,46589.0,1,,0.0,0,0,0.0,1.0,7.0,1
4,716616,149746364c6b763662d03e1f263029fd,win8defender,1.1.15200.1,4.18.1807.18075,1.275.215.0,0,7.0,0,,53447.0,1.0,1.0,1,107,58489.0,27.0,138.0,134,windows10,x64,10.0.0.0,15063,256,rs2,15063.0.amd64fre.rs2_release.170317-1834,Pro,1.0,0,,0.0,108.0,,1.0,1.0,Desktop,Windows.Desktop,4909.0,317701.0,6.0,...,HDD,953867.0,0,8192.0,Desktop,23.1,1024.0,768.0,Desktop,,4294967000.0,10.0.15063.0,amd64,rs2_release,15063,0,Core,CORE,IBSClean,20.0,83,UNKNOWN,0,OFFLINE,Retail,,0.0,Retail,,142.0,52530.0,0,,0.0,0,0,0.0,,,0


In [0]:
train_labels = pd.DataFrame()
train_labels["HasDetections"] = train_df["HasDetections"]
train_data = pd.DataFrame()
train_data = train_df.drop(columns="HasDetections")
mini_train_labels = np.array(train_labels)
mini_train_data = np.array(train_data)

dev_labels = pd.DataFrame()
dev_labels["HasDetections"] = dev_df["HasDetections"]
dev_data = pd.DataFrame()
dev_data = dev_df.drop(columns="HasDetections")
dev_labels = np.array(dev_labels)
dev_data = np.array(dev_data)

test_labels = pd.DataFrame()
test_labels["HasDetections"] = test_df["HasDetections"]
test_data = pd.DataFrame()
test_data = test_df.drop(columns="HasDetections")
test_labels = np.array(test_labels)
test_data = np.array(test_data)

In [63]:
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(train_data.drop(columns=["MachineIdentifier","ProductName"]), train_labels)

ValueError: ignored