In [1]:
%matplotlib inline

# Import a bunch of libraries.
from os import path
import numpy as np
import pandas as pd
import time

In [2]:
# Debug switch for the wrangling pipeline. When True, the main cell reads the
# small data/mini_initial.csv sample instead of data/train.csv, does not
# regenerate mini_initial.csv, and skips processing data/test.csv.
debug_wrangling = False # if True, uses existing mini_initial.csv file so this runs fast


In [3]:
def load_data(filename):
    """Read a CSV file into a DataFrame using an explicit dtype per column.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with every column named below loaded using the
        dtype it is listed under.
    """
    # Columns grouped by the dtype they are loaded with.
    columns_by_dtype = {
        'str': [
            'MachineIdentifier',
            'ProductName',
            'EngineVersion',
            'AppVersion',
            'AvSigVersion',
        ],
        'int8': [
            'IsBeta',
            'IsSxsPassiveMode',
            'HasTpm',
            'AutoSampleOptIn',
            'Census_HasOpticalDiskDrive',
            'Census_IsPortableOperatingSystem',
            'Census_IsSecureBootEnabled',
            'Census_IsTouchEnabled',
            'Census_IsPenCapable',
            'HasDetections',
        ],
        'int16': [
            'CountryIdentifier',
            'LocaleEnglishNameIdentifier',
            'OsBuild',
            'OsSuite',
            'Census_OSBuildNumber',
            'Census_OSUILocaleIdentifier',
        ],
        'int32': [
            'Census_OSBuildRevision',
        ],
        'float16': [
            'AVProductsInstalled',
            'AVProductsEnabled',
            'OrganizationIdentifier',
            'GeoNameIdentifier',
            'IsProtected',
            'SMode',
            'IeVerIdentifier',
            'Firewall',
            'Census_ProcessorCoreCount',
            'Census_ProcessorManufacturerIdentifier',
            'Census_OSInstallLanguageIdentifier',
            'Census_IsFlightingInternal',
            'Census_IsFlightsDisabled',
            'Census_ThresholdOptIn',
            'Census_FirmwareManufacturerIdentifier',
            'Census_IsWIMBootEnabled',
            'Census_IsVirtualDevice',
            'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer',
            'Wdft_RegionIdentifier',
        ],
        'float32': [
            'DefaultBrowsersIdentifier',
            'AVProductStatesIdentifier',
            'CityIdentifier',
            'Census_OEMNameIdentifier',
            'Census_OEMModelIdentifier',
            'Census_ProcessorModelIdentifier',
            'Census_TotalPhysicalRAM',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches',
            'Census_InternalPrimaryDisplayResolutionHorizontal',
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_FirmwareVersionIdentifier',
        ],
        'float64': [
            'RtpStateBitfield',
            'UacLuaenable',
            'Census_PrimaryDiskTotalCapacity',
            'Census_SystemVolumeTotalCapacity',
            'Census_InternalBatteryNumberOfCharges',
        ],
        'category': [
            'Platform',
            'Processor',
            'OsVer',
            'OsPlatformSubRelease',
            'OsBuildLab',
            'SkuEdition',
            'PuaMode',
            'SmartScreen',
            'Census_MDC2FormFactor',
            'Census_DeviceFamily',
            'Census_ProcessorClass',
            'Census_PrimaryDiskTypeName',
            'Census_ChassisTypeName',
            'Census_PowerPlatformRoleName',
            'Census_InternalBatteryType',
            'Census_OSVersion',
            'Census_OSArchitecture',
            'Census_OSBranch',
            'Census_OSEdition',
            'Census_OSSkuName',
            'Census_OSInstallTypeName',
            'Census_OSWUAutoUpdateOptionsName',
            'Census_GenuineStateName',
            'Census_ActivationChannel',
            'Census_FlightRing',
        ],
    }

    # Flatten the groups into the {column: dtype} mapping read_csv expects.
    dtypes = {
        column: dtype
        for dtype, columns in columns_by_dtype.items()
        for column in columns
    }

    return pd.read_csv(filename, dtype=dtypes, engine='c')

In [4]:
def _safe_int(text):
    """Parse ``text`` as an int; return -1 when it is not a valid integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return -1


def _version_component(text, position):
    """Return the ``position``-th dot-separated field of ``text``, or "-1" if absent.

    Non-string input (for example a NaN that was never filled) also yields "-1"
    instead of raising on ``.split``.
    """
    if not isinstance(text, str):
        return "-1"
    fields = text.split(".")
    return fields[position] if len(fields) > position else "-1"


def _build_stamp_component(text, position):
    """Return one integer part of the fifth dotted field, which looks like "180410-1804".

    ``position`` 0 selects the part before the dash, 1 the part after it.
    Returns -1 when the field or the requested part is missing or not numeric.
    """
    stamp = _version_component(text, 4)
    if stamp == "-1":
        return -1
    pieces = stamp.split("-")
    return _safe_int(pieces[position]) if len(pieces) > position else -1


def _as_int(series, preferred):
    """Cast ``series`` to ``preferred``, widening when the values do not fit.

    The preferred dtype is used whenever every value is inside its range, so
    results are unchanged for data that never overflowed. Otherwise the
    smallest of int16 / int32 / int64 that holds the data is used, instead of
    letting the cast wrap around silently. Fractional values are truncated by
    ``astype`` exactly as in a plain cast.
    """
    values = series.to_numpy()
    if values.size == 0:
        return series.astype(preferred)

    # Python floats compare exactly against the Python-int limits below.
    lowest, highest = float(values.min()), float(values.max())
    for candidate in (preferred, np.int16, np.int32, np.int64):
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return series.astype(candidate)

    if np.isfinite(lowest) and np.isfinite(highest):
        raise OverflowError(
            "values of column %r do not fit in a 64-bit integer" % (series.name,)
        )
    # NaN / inf left in the column: let pandas raise its usual casting error.
    return series.astype(preferred)


def clean_data(df):
    """Fill missing values, cast float columns to integers and split version strings.

    Steps, in order:

    1. add a boolean ``Rtp_NaN`` column recording where ``RtpStateBitfield`` was NaN;
    2. add an ``"unknown"`` category to the categorical columns that get NaN-filled;
    3. fill NaNs with explicit per-column values (0, -1 or ``"unknown"``);
    4. cast the filled numeric columns to integer dtypes;
    5. split the dotted version columns into ``_major`` / ``_minor`` / ``_build1`` /
       ``_build2`` integer columns, and ``OsBuildLab`` into major, minor, platform,
       release and two build-stamp columns.

    Steps 1 and 2 modify ``df`` in place; from step 3 on the work happens on a
    new frame, so callers MUST use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns referenced below, with the fill targets
        listed in ``unknown_columns`` stored as categoricals.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Source columns are kept alongside the derived ones.
    """
    unknown = "unknown"

    #
    # first, keep the information that some things were nans
    #
    df["Rtp_NaN"] = df["RtpStateBitfield"].isna()

    #
    # add categories where necessary so we can replace the NaNs
    #
    unknown_columns = [
        "PuaMode",
        "SmartScreen",
        "Census_ProcessorClass",
        "Census_PrimaryDiskTypeName",
        "Census_ChassisTypeName",
        "Census_InternalBatteryType",
        "OsBuildLab",
        "Census_PowerPlatformRoleName",
        "Census_OSEdition",
        "Census_GenuineStateName",
    ]
    for col in unknown_columns:
        # add_categories(inplace=True) no longer exists in pandas 2.x, so assign
        # the result back; the guard keeps a second run from raising on a
        # category that is already present.
        if unknown not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories([unknown])

    #
    # Then, some more explicit fill choices:
    # (column, NaN fill value, preferred integer dtype after filling)
    #
    numeric_plan = [
        ("DefaultBrowsersIdentifier", 0, np.int64),
        ("AVProductStatesIdentifier", 0, np.int64),
        ("AVProductsInstalled", -1, np.int8),
        ("AVProductsEnabled", -1, np.int8),
        ("CityIdentifier", -1, np.int64),
        ("OrganizationIdentifier", -1, np.int64),
        ("GeoNameIdentifier", -1, np.int64),
        ("IsProtected", -1, np.int8),
        ("SMode", -1, np.int8),
        ("IeVerIdentifier", -1, np.int8),
        ("Firewall", -1, np.int8),
        ("UacLuaenable", -1, np.int8),
        ("Census_OEMNameIdentifier", -1, np.int8),
        ("Census_OEMModelIdentifier", -1, np.int16),
        ("Census_ProcessorCoreCount", -1, np.int16),
        ("Census_ProcessorManufacturerIdentifier", -1, np.int16),
        ("Census_ProcessorModelIdentifier", -1, np.int16),
        ("Census_PrimaryDiskTotalCapacity", -1, np.int16),
        ("Census_SystemVolumeTotalCapacity", -1, np.int16),
        ("Census_TotalPhysicalRAM", -1, np.int16),
        # NOTE(review): this column looks fractional (inches); the integer cast
        # truncates it, as the original code did. Confirm that is intended.
        ("Census_InternalPrimaryDiagonalDisplaySizeInInches", -1, np.int16),
        ("Census_InternalPrimaryDisplayResolutionHorizontal", -1, np.int16),
        ("Census_InternalPrimaryDisplayResolutionVertical", -1, np.int16),
        ("Census_InternalBatteryNumberOfCharges", -1, np.int16),
        ("Census_OSInstallLanguageIdentifier", -1, np.int16),
        ("Census_IsFlightingInternal", -1, np.int8),
        ("Census_IsFlightsDisabled", -1, np.int8),
        ("Census_ThresholdOptIn", -1, np.int8),
        ("Census_FirmwareManufacturerIdentifier", -1, np.int16),
        ("Census_FirmwareVersionIdentifier", -1, np.int16),
        ("Census_IsWIMBootEnabled", -1, np.int8),
        ("Census_IsVirtualDevice", -1, np.int8),
        ("Census_IsAlwaysOnAlwaysConnectedCapable", -1, np.int8),
        ("Wdft_IsGamer", -1, np.int8),
        ("Wdft_RegionIdentifier", -1, np.int16),
        ("RtpStateBitfield", 0, np.uint8),
    ]

    nanfill = {col: fill for col, fill, _ in numeric_plan}
    nanfill.update({col: unknown for col in unknown_columns})

    df = df.fillna(value=nanfill)

    #
    # then some of these columns can become ints, not floats
    #
    for col, _, preferred in numeric_plan:
        df[col] = _as_int(df[col], preferred)

    #
    # deal with version numbers
    #
    def int_part(col, position):
        return df[col].map(lambda s: _safe_int(_version_component(s, position)))

    def text_part(col, position):
        return df[col].map(lambda s: _version_component(s, position))

    for col in ("EngineVersion", "AppVersion", "AvSigVersion", "Census_OSVersion", "OsVer"):
        for position, suffix in enumerate(("_major", "_minor", "_build1", "_build2")):
            df[col + suffix] = _as_int(int_part(col, position), np.int16)

    #
    # this one is more complicated:
    # OsBuildLab: example: 7134.1.amd64fre.rs4_release.180410-1804
    #
    col = 'OsBuildLab'

    df['OsBuildLab_major'] = _as_int(int_part(col, 0), np.int16)
    df['OsBuildLab_minor'] = _as_int(int_part(col, 1), np.int16)
    # platform / release are text (e.g. "amd64fre", "rs4_release"): keep the raw
    # field; parsing it as an int would collapse every row to -1.
    df['OsBuildLab_platform'] = text_part(col, 2).astype('category')
    df['OsBuildLab_release'] = text_part(col, 3).astype('category')
    df['OsBuildLab_build1'] = _as_int(
        df[col].map(lambda s: _build_stamp_component(s, 0)), np.int32)
    df['OsBuildLab_build2'] = _as_int(
        df[col].map(lambda s: _build_stamp_component(s, 1)), np.int32)

    return df


In [5]:
#
# verify that there are no NaNs left
#

def verify_no_nans(df):
    """Print the name of every column of ``df`` that still contains NaNs.

    Columns are reported one per line in frame order, followed by a closing
    message that is printed whether or not any column was reported.
    """
    columns_with_nans = [name for name in df.columns if df[name].hasnans]
    for name in columns_with_nans:
        print(name)

    print("done verifying absence of NaN")


In [6]:
# function to create new mini train, test and dev samples 
# pulling data from the training set using random stratification on the outcome variable

from sklearn.model_selection import train_test_split
def generate_train_test_dev_minis(n, features, labels):
    """Draw three disjoint, label-stratified samples of ``n`` rows each.

    The train sample is drawn first, then the test sample from what is left,
    then the dev sample from what is left after that. Every draw is stratified
    on the labels and uses ``random_state=0``, so the result is reproducible.

    Sizes are passed to ``train_test_split`` as an integer row count rather
    than as a fraction derived from ``1 - n / len(features)``, so float
    rounding cannot make a sample one row larger or smaller than requested.

    Parameters
    ----------
    n : int
        Number of rows in each of the three samples.
    features : DataFrame or array-like
        Rows to sample from; ``features.shape[0]`` must be greater than ``3 * n``.
    labels : Series or array-like
        Outcome values aligned with ``features``, used for stratification.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_dev, y_train, y_test, y_dev)``.

    Raises
    ------
    ValueError
        If ``n`` is not positive or the data is too small for three samples.
    """
    n = int(n)
    total = features.shape[0]
    if n < 1 or 3 * n >= total:
        raise ValueError(
            "cannot draw three disjoint samples of %d rows from %d rows" % (n, total)
        )

    # Each call peels n rows off the remaining pool; the rest is carried forward.
    X_train, X_pool, y_train, y_pool = train_test_split(
        features, labels, stratify=labels, train_size=n, random_state=0)
    X_test, X_pool, y_test, y_pool = train_test_split(
        X_pool, y_pool, stratify=y_pool, train_size=n, random_state=0)
    X_dev, _, y_dev, _ = train_test_split(
        X_pool, y_pool, stratify=y_pool, train_size=n, random_state=0)

    return X_train, X_test, X_dev, y_train, y_test, y_dev




In [7]:
#
# function for saving frames into separate data/labels, also dropping machine ids
#

def save_files(df, name):
    """Write ``df`` to ``data/<name>_clean.csv``, splitting labels out if present.

    The ``MachineIdentifier`` column is always removed so models don't train
    on it. When a ``HasDetections`` column exists it is written on its own
    to ``data/<name>_labels.csv`` (no header, no index) and left out of the
    feature file. The frame passed in is not modified.
    """
    # work on a new frame without the machine identifiers
    features = df.drop(columns=["MachineIdentifier"])

    # if it has labels, pull them out of the feature frame and save them
    if "HasDetections" in features.columns:
        labels = features.pop("HasDetections")
        labels.to_csv(f"data/{name}_labels.csv", index=False, header=False)

    # save the remaining feature columns
    features.to_csv(f"data/{name}_clean.csv", index=False)
    
    

In [8]:
#
# main code part one: train dataset
#

start = time.time()

# pick the input: the small cached sample when debugging, else the full train set
if debug_wrangling:
    print("using mini_initial.csv dataset")
    filename = "data/mini_initial.csv"
else:        
    print("using big dataset")
    filename = "data/train.csv"
    
#
# initial load and casting to desired types
#

df = load_data(filename)
print("done loading train data")
print("total rows in set:", len(df))

#
# make mini_initial, before cleaning and dropping, if necessary (to debug wrangler code)
#
   
if not debug_wrangling:
    df2 = df.sample(100000, random_state=123)
    df2.to_csv("data/mini_initial.csv", index=False)
    print("created mini_initial.csv data")
    
#
# main work is done here
#
    
df = clean_data(df)

# also split out a big dev and test file here, let's say 1,000,000
# (500 rows each in debug mode); sampled rows are removed from df so the
# train, dev and test frames stay disjoint
df_dev = df.sample(1000000 if not debug_wrangling else 500, random_state=123).copy()
df = df.drop(df_dev.index)

df_test = df.sample(1000000 if not debug_wrangling else 500, random_state=123).copy()
df = df.drop(df_test.index)

#
# save clean master file, all with labels
#

print("saving train_clean ...")
save_files(df, "train")

print("saving test files ...")
save_files(df_test, "test")
print("done saving clean test files")

print("saving dev files ...")
save_files(df_dev, "dev")
print("done saving clean dev files")

    
# make this work with a smaller size for the mini_initial set
sample_size = 50000 if len(df) > 1000000 else 500

# now call Kevin's code for the work
mini_train, mini_test, mini_dev, mini_train_labels, mini_test_labels, mini_dev_labels = \
    generate_train_test_dev_minis(sample_size, df, df['HasDetections'])
print("shape of mini_train:",mini_train.shape)
print("shape of mini_dev:",mini_dev.shape) 
print("shape of mini_test:",mini_test.shape)
print("done making mini sets")

#
# save clean mini files, all with labels
#

print("saving mini_train_clean ...")
save_files(mini_train, "mini_train")
print("saving mini_dev_clean ...")
save_files(mini_dev, "mini_dev")
print("saving mini_test_clean ...")
save_files(mini_test, "mini_test")
print("done saving mini files")


#
# need to also process big test file
#
    
if not debug_wrangling:
    print("loading test data ...")
    df_test = load_data("data/test.csv")
    print("done loading test data")
    print("total rows in test set:", len(df_test))
    # clean_data returns the cleaned frame; the result must be kept, otherwise
    # the NaN fills, integer casts and version columns are all lost
    df_test = clean_data(df_test)
    print("saving clean test data ...")
    # NOTE(review): this reuses the name "test", so data/test_clean.csv written
    # above from the held-out train rows is overwritten here, while
    # data/test_labels.csv from that hold-out is left in place if test.csv has
    # no HasDetections column. Confirm whether a distinct name is wanted.
    save_files(df_test, "test")


print("done saving clean files")

elapsed = time.time()-start
print("seconds elapsed:", elapsed)


using mini_initial.csv dataset
done loading
total rows in set: 100000
saving train_clean ...
saving test files ...
done saving clean test files
saving dev files ...
done saving clean dev files
shape of mini_train: (500, 110)
shape of mini_dev: (500, 110)
shape of mini_test: (500, 110)
done making mini sets
saving mini_train_clean ...
saving mini_dev_clean ...
saving mini_test_clean ...
done saving clean files
seconds elapsed: 14.249373197555542
