In [15]:
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
 
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

# some PCA and GMM stuff
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm

import sklearn.ensemble as ske
from sklearn.feature_selection import SelectFromModel

import seaborn as sns
sns.set()

# Definitions

# Classes

In [35]:
class Data_Obj:
    """One encoded CSV split: feature frame, label series and column groupings.

    Constructing an instance reads ``filename`` with the dtypes in ``DTYPES``,
    moves the ``HasDetections`` column into ``self.labels``, removes the
    ``MachineIdentifier`` column from the features, and classifies the
    remaining columns as numeric / nominal / binary.

    Attributes set by the constructor:
        dtypes       -- per-instance copy of ``DTYPES``.
        df           -- feature DataFrame (no label or identifier column).
        labels       -- the ``HasDetections`` column as a Series.
        all_cols     -- every column name declared in ``dtypes``.
        numerics     -- dtype names that count as numeric.
        numeric_cols -- columns of ``df`` whose dtype is in ``numerics``.
        nominal_cols -- columns of ``df`` that are not numeric.
        binary_cols  -- numeric columns with exactly two distinct values.
    """

    # Column name -> dtype handed to ``pd.read_csv``. Columns absent from this
    # mapping are left to pandas' own dtype inference.
    DTYPES = {
        'ProductName':                                          'int64',
        'EngineVersion':                                        'int64',
        'AppVersion':                                           'int64',
        'AvSigVersion':                                         'int64',
        'RtpStateBitfield':                                     'int64',
        'Platform':                                             'int64',
        'Processor':                                            'int64',
        'OsVer':                                                'int64',
        'OsPlatformSubRelease':                                 'int64',
        'OsBuildLab':                                           'int64',
        'SkuEdition':                                           'int64',
        'PuaMode':                                              'int64',
        'SmartScreen':                                          'int64',
        'Census_MDC2FormFactor':                                'int64',
        'Census_DeviceFamily':                                  'int64',
        'Census_ProcessorClass':                                'int64',
        'Census_PrimaryDiskTypeName':                           'int64',
        'Census_ChassisTypeName':                               'int64',
        'Census_PowerPlatformRoleName':                         'int64',
        'Census_InternalBatteryType':                           'int64',
        'Census_OSVersion':                                     'int64',
        'Census_OSArchitecture':                                'int64',
        'Census_OSBranch':                                      'int64',
        'Census_OSEdition':                                     'int64',
        'Census_OSSkuName':                                     'int64',
        'Census_OSInstallTypeName':                             'int64',
        'Census_OSWUAutoUpdateOptionsName':                     'int64',
        'Census_GenuineStateName':                              'int64',
        'Census_ActivationChannel':                             'int64',
        'Census_FlightRing':                                    'int64',
        'RtpStateBitfield_wasna':                               'int64',
        'DefaultBrowsersIdentifier_wasna':                      'int64',
        'AVProductStatesIdentifier_wasna':                      'int64',
        'AVProductsInstalled_wasna':                            'int64',
        'AVProductsEnabled_wasna':                              'int64',
        'CityIdentifier_wasna':                                 'int64',
        'OrganizationIdentifier_wasna':                         'int64',
        'GeoNameIdentifier_wasna':                              'int64',
        'IsProtected_wasna':                                    'int64',
        'SMode_wasna':                                          'int64',
        'IeVerIdentifier_wasna':                                'int64',
        'Firewall_wasna':                                       'int64',
        'UacLuaenable_wasna':                                   'int64',
        'Census_OEMNameIdentifier_wasna':                       'int64',
        'Census_OEMModelIdentifier_wasna':                      'int64',
        'Census_ProcessorCoreCount_wasna':                      'int64',
        'Census_ProcessorManufacturerIdentifier_wasna':         'int64',
        'Census_ProcessorModelIdentifier_wasna':                'int64',
        'Census_PrimaryDiskTotalCapacity_wasna':                'int64',
        'Census_SystemVolumeTotalCapacity_wasna':               'int64',
        'Census_TotalPhysicalRAM_wasna':                        'int64',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches_wasna': 'int64',
        'Census_InternalPrimaryDisplayResolutionHorizontal_wasna': 'int64',
        'Census_InternalPrimaryDisplayResolutionVertical_wasna': 'int64',
        'Census_InternalBatteryNumberOfCharges_wasna':          'int64',
        'Census_OSInstallLanguageIdentifier_wasna':             'int64',
        'Census_IsFlightingInternal_wasna':                     'int64',
        'Census_IsFlightsDisabled_wasna':                       'int64',
        'Census_ThresholdOptIn_wasna':                          'int64',
        'Census_FirmwareManufacturerIdentifier_wasna':          'int64',
        'Census_IsWIMBootEnabled_wasna':                        'int64',
        'Census_IsVirtualDevice_wasna':                         'int64',
        'Census_IsAlwaysOnAlwaysConnectedCapable_wasna':        'int64',
        'Wdft_IsGamer_wasna':                                   'int64',
        'Wdft_RegionIdentifier_wasna':                          'int64',
        'Census_FirmwareVersionIdentifier_wasna':               'int64',
        'OsBuildLab_platform':                                  'float64',
        'OsBuildLab_release':                                   'float64',
        'IsBeta':                                               'int8',
        'IsSxsPassiveMode':                                     'int8',
        'HasTpm':                                               'int8',
        'AutoSampleOptIn':                                      'int8',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'CountryIdentifier':                                    'float64',
        'LocaleEnglishNameIdentifier':                          'float64',
        'OsBuild':                                              'float64',
        'OsSuite':                                              'int16',
        'Census_OSBuildNumber':                                 'float64',
        'Census_OSUILocaleIdentifier':                          'float64',
        'EngineVersion_major':                                  'int16',
        'EngineVersion_minor':                                  'int16',
        'EngineVersion_build1':                                 'int16',
        'EngineVersion_build2':                                 'int16',
        'AppVersion_major':                                     'int16',
        'AppVersion_minor':                                     'int16',
        'AppVersion_build1':                                    'int16',
        'AppVersion_build2':                                    'int16',
        'AvSigVersion_major':                                   'int16',
        'AvSigVersion_minor':                                   'int16',
        'AvSigVersion_build1':                                  'int16',
        'AvSigVersion_build2':                                  'int16',
        'Census_OSVersion_major':                               'int16',
        'Census_OSVersion_minor':                               'int16',
        'Census_OSVersion_build1':                              'int16',
        'Census_OSVersion_build2':                              'int16',
        'OsVer_major':                                          'int16',
        'OsVer_minor':                                          'int16',
        'OsVer_build1':                                         'int16',
        'OsVer_build2':                                         'int16',
        'OsBuildLab_major':                                     'float64',
        'OsBuildLab_minor':                                     'float64',
        'Census_OSBuildRevision':                               'int32',
        'OsBuildLab_build1':                                    'int32',
        'OsBuildLab_build2':                                    'float64',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float64',
        'IsProtected':                                          'float16',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float64',
        'Firewall':                                             'float16',
        'Census_ProcessorCoreCount':                            'float64',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_OSInstallLanguageIdentifier':                   'float64',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float64',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float64',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float64',
        'CityIdentifier':                                       'float64',
        'Census_OEMNameIdentifier':                             'float64',
        'Census_OEMModelIdentifier':                            'float64',
        'Census_ProcessorModelIdentifier':                      'float64',
        'Census_TotalPhysicalRAM':                              'float64',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float64',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float64',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float64',
        'Census_FirmwareVersionIdentifier':                     'float64',
        'UacLuaenable':                                         'float64',
        'Census_PrimaryDiskTotalCapacity':                      'float64',
        'Census_SystemVolumeTotalCapacity':                     'float64',
        'Census_InternalBatteryNumberOfCharges':                'float64',
        'EngineVersion_combined':                               'float64',
        'AppVersion_combined':                                  'float64',
        'AvSigVersion_combined':                                'float64',
        'Census_OSVersion_combined':                            'float64',
        'OsVer_combined':                                       'float64',
        'OsBuildLab_combined':                                  'float64',
        'HasDetections':                                        'int8'
        }

    def __init__(self, filename):
        """Load ``filename`` and classify its feature columns."""
        self.load_data(filename)
        self.define_cols()

    def load_data(self, filename):
        """Read the CSV at ``filename`` and split it into features and labels.

        Sets ``self.dtypes``, ``self.df``, ``self.labels`` and ``self.all_cols``.

        Raises:
            KeyError: if the file has no ``HasDetections`` column.
        """
        # Per-instance copy so mutating ``self.dtypes`` never leaks into the class.
        self.dtypes = dict(self.DTYPES)

        self.df = pd.read_csv(filename, dtype=self.dtypes, engine='c')
        self.labels = self.df['HasDetections']
        # ``MachineIdentifier`` is only being discarded, so a file that lacks it
        # is tolerated; ``HasDetections`` is guaranteed present by the lookup above.
        self.df = self.df.drop(columns=['HasDetections', 'MachineIdentifier'],
                               errors='ignore')
        # Every declared column name, including the label column.
        self.all_cols = list(self.dtypes)

    def define_cols(self):
        """Group the columns of ``self.df`` into numeric, nominal and binary lists.

        Sets ``self.numerics``, ``self.numeric_cols``, ``self.nominal_cols`` and
        ``self.binary_cols``; each list keeps the column order of ``self.df``.
        """
        self.numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.numeric_cols = [c for c, v in self.df.dtypes.items() if v in self.numerics]
        numeric_lookup = set(self.numeric_cols)
        self.nominal_cols = [c for c in self.df.columns if c not in numeric_lookup]
        # Only numeric columns can be binary, so nunique() is never evaluated
        # on the nominal ones.
        self.binary_cols = [c for c in self.numeric_cols if self.df[c].nunique() == 2]

    def reduce_mem(self, verbose=True):
        """Shrink ``self.df`` in place by narrowing dtypes.

        Signed-integer columns are cast to the smallest of int8/int16/int32 that
        holds their observed min and max (bounds inclusive, so a column is never
        widened). Object columns become ``category``. All other columns,
        including floats, are left untouched.

        Args:
            verbose: when true, print memory usage before and after and the
                percentage saved; when false, print nothing.
        """
        start_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type == object:
                self.df[col] = self.df[col].astype('category')
                continue
            # Only plain NumPy signed integers are downcast.
            if not (isinstance(col_type, np.dtype) and col_type.kind == 'i'):
                continue

            c_min = self.df[col].min()
            c_max = self.df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    # Skip the copy when the column already has this dtype.
                    if col_type != np.dtype(candidate):
                        self.df[col] = self.df[col].astype(candidate)
                    break

        if verbose:
            end_mem = self.df.memory_usage().sum() / 1024**2
            # Guard the ratio against a zero-byte starting frame.
            saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            print('Decreased by {:.1f}%'.format(saved))


In [36]:
# Paths of the three encoded CSV splits (relative to the working directory).
train_file, dev_file, validate_file = (
    'data/mini_train_encoded.csv',
    'data/mini_dev_encoded.csv',
    'data/mini_validate_encoded.csv',
)

# One Data_Obj per split; each construction reads its CSV and classifies
# the resulting feature columns.
train_data = Data_Obj(filename=train_file)
dev_data = Data_Obj(filename=dev_file)
validate_data = Data_Obj(filename=validate_file)
