In [1]:
%matplotlib inline

# Import a bunch of libraries.
import re
import numpy as np
import pandas as pd
import time

In [2]:
# Debug switch for the wrangling run.
# False: the main cell loads data/train.csv, writes a fresh data/mini_initial.csv
#        sample from it, and also processes data/test.csv.
# True:  the main cell loads the previously written data/mini_initial.csv instead,
#        uses much smaller dev/test samples, and skips data/test.csv entirely.
debug_wrangling = False # if True, uses existing mini_initial.csv file so this runs fast


In [3]:
def load_data(filename):
    """Read a raw CSV into a DataFrame using a fixed per-column dtype map.

    Parameters
    ----------
    filename : path or file-like object handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame, with each listed column read as the dtype below.
    """
    # Columns grouped by the dtype they are parsed as; flattened into the
    # {column: dtype} mapping that read_csv expects just below.
    columns_by_dtype = {
        'str': [
            'MachineIdentifier',
            'ProductName',
            'EngineVersion',
            'AppVersion',
            'AvSigVersion',
            'SmartScreen',
            'Census_ChassisTypeName',
            'Census_InternalBatteryType',
            'Census_OSEdition',
        ],
        'int8': [
            'IsBeta',
            'IsSxsPassiveMode',
            'HasTpm',
            'AutoSampleOptIn',
            'Census_HasOpticalDiskDrive',
            'Census_IsPortableOperatingSystem',
            'Census_IsSecureBootEnabled',
            'Census_IsTouchEnabled',
            'Census_IsPenCapable',
            'HasDetections',
        ],
        'int16': [
            'CountryIdentifier',
            'LocaleEnglishNameIdentifier',
            'OsBuild',
            'OsSuite',
            'Census_OSBuildNumber',
            'Census_OSUILocaleIdentifier',
        ],
        'int32': [
            'Census_OSBuildRevision',
        ],
        'float16': [
            'AVProductsInstalled',
            'AVProductsEnabled',
            'OrganizationIdentifier',
            'GeoNameIdentifier',
            'IsProtected',
            'SMode',
            'IeVerIdentifier',
            'Firewall',
            'Census_ProcessorCoreCount',
            'Census_ProcessorManufacturerIdentifier',
            'Census_OSInstallLanguageIdentifier',
            'Census_IsFlightingInternal',
            'Census_IsFlightsDisabled',
            'Census_ThresholdOptIn',
            'Census_FirmwareManufacturerIdentifier',
            'Census_IsWIMBootEnabled',
            'Census_IsVirtualDevice',
            'Census_IsAlwaysOnAlwaysConnectedCapable',
            'Wdft_IsGamer',
            'Wdft_RegionIdentifier',
        ],
        'float32': [
            'DefaultBrowsersIdentifier',
            'AVProductStatesIdentifier',
            'CityIdentifier',
            'Census_OEMNameIdentifier',
            'Census_OEMModelIdentifier',
            'Census_ProcessorModelIdentifier',
            'Census_TotalPhysicalRAM',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches',
            'Census_InternalPrimaryDisplayResolutionHorizontal',
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_FirmwareVersionIdentifier',
        ],
        'float64': [
            'RtpStateBitfield',
            'UacLuaenable',
            'Census_PrimaryDiskTotalCapacity',
            'Census_SystemVolumeTotalCapacity',
            'Census_InternalBatteryNumberOfCharges',
        ],
        'category': [
            'Platform',
            'Processor',
            'OsVer',
            'OsPlatformSubRelease',
            'OsBuildLab',
            'SkuEdition',
            'PuaMode',
            'Census_MDC2FormFactor',
            'Census_DeviceFamily',
            'Census_ProcessorClass',
            'Census_PrimaryDiskTypeName',
            'Census_PowerPlatformRoleName',
            'Census_OSVersion',
            'Census_OSArchitecture',
            'Census_OSBranch',
            'Census_OSSkuName',
            'Census_OSInstallTypeName',
            'Census_OSWUAutoUpdateOptionsName',
            'Census_GenuineStateName',
            'Census_ActivationChannel',
            'Census_FlightRing',
        ],
    }

    column_types = {
        column: dtype_name
        for dtype_name, columns in columns_by_dtype.items()
        for column in columns
    }

    return pd.read_csv(filename, dtype=column_types, engine='c')

In [None]:
def clean_data(df):
    """Clean text columns, fill selected NAs and derive numeric version features.

    Steps, in order:
      1. In selected text columns, replace ``&#x..;`` character entities with
         the bare code digits and strip ASCII control characters.
      2. Lower-case selected text columns.
      3. Convert the remaining free-text columns to ``category``.
      4. For ``RtpStateBitfield`` and ``OsBuildLab``: record missing values in a
         ``<col>_wasna`` flag column, then fill them.
      5. Split dotted version strings into integer ``_major`` / ``_minor`` /
         ``_build1`` / ``_build2`` columns plus a float ``_combined`` column.
      6. Split ``OsBuildLab`` (e.g. ``7134.1.amd64fre.rs4_release.180410-1804``)
         into numeric and categorical parts plus a ``_combined`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named below. ``OsBuildLab`` must already be
        categorical (the ``.cat`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Every step assigns back into
        ``df``, so the caller's frame is fully cleaned even if the return
        value is ignored.

    Raises
    ------
    ValueError / TypeError
        From ``astype`` if a version component is missing or not an integer.
    """

    #
    # make all strings lower case
    # get rid of hex char codes, keep the actual code number
    #

    char_treatment = [
        'AvSigVersion',
        'SmartScreen',
        'Census_InternalBatteryType'
    ]

    case_treatment = [
        'SmartScreen',
        'Census_ChassisTypeName',
        'Census_OSEdition'
    ]

    print("-- replacing weird characters ...")
    for col in char_treatment:
        if df[col].dtype.name == 'object':
            # The replacement must be a raw string: a plain '\1' is the control
            # character \x01 (not a backreference), which the next line would
            # strip again, deleting the code number instead of keeping it.
            # The character class accepts hex digits, not just 0-9.
            df[col] = df[col].str.replace(r'&#x([0-9A-Fa-f]+);', r'\1', regex=True)
            df[col] = df[col].str.replace(r'[\x00-\x1f]', '', regex=True)

    print("-- lower-casing where appropriate ...")
    for col in case_treatment:
        if df[col].dtype.name == 'object':
            df[col] = df[col].str.lower()

    #
    # make strings into categories
    #

    categories = [
        'SmartScreen',
        'Census_InternalBatteryType',
        'Census_ChassisTypeName',
        'Census_OSEdition',
    ]

    print("-- making categories ...")
    for col in categories:
        df[col] = df[col].astype('category')

    #
    # add categories where necessary so we can replace the NAs
    #

    # Placeholder with the same dotted/dashed layout as a real OsBuildLab value,
    # so the parsing further down yields zeros for rows that were missing.
    osbuildlab_placeholder = '0.0.-.-.0-0'

    print("-- adding categories ..")
    # add_categories no longer accepts inplace=True (removed in pandas 2.0), so
    # assign the result back. The guard avoids the ValueError raised when the
    # category already exists, e.g. when the frame is cleaned a second time.
    if osbuildlab_placeholder not in df["OsBuildLab"].cat.categories:
        df["OsBuildLab"] = df["OsBuildLab"].cat.add_categories([osbuildlab_placeholder])

    #
    # flag and fill selected NAs
    #

    print("-- replacing selected NA values")
    nafill = {
        "RtpStateBitfield": 0,
        'OsBuildLab': osbuildlab_placeholder,
    }

    # Flag first, then fill column by column. Filling per column (instead of
    # df = df.fillna(...)) keeps working on the caller's frame rather than on a
    # full copy, so nothing is lost if the return value is not captured.
    for col in nafill:
        df[col+'_wasna'] = df[col].isna()
    for col, fill_value in nafill.items():
        df[col] = df[col].fillna(fill_value)

    #
    # then some of these columns can become ints, not floats
    #

    print("-- converting columns to int ...")
    df['RtpStateBitfield'] = df['RtpStateBitfield'].astype(np.uint8)

    #
    # deal with version numbers
    #

    def map_version2(df, col):
        """Split 'a.b.c.d' in df[col] into four int16 columns plus a float combination."""
        df_split = df[col].str.split(".", n=3, expand=True)
        df[col+'_major'] = df_split[0].astype(np.int16)
        df[col+'_minor'] = df_split[1].astype(np.int16)
        df[col+'_build1'] = df_split[2].astype(np.int16)
        df[col+'_build2'] = df_split[3].astype(np.int16)
        # Float divisors keep the arithmetic in float64; the integer literal
        # 1000000 does not fit in int16.
        df[col+'_combined'] = (
            10000.0*df[col+'_major']
            + df[col+'_minor']
            + df[col+'_build1']/1000.0
            + df[col+'_build2']/1000000.0
        )

    print("-- mapping version numbers ...")
    for version_col in ("EngineVersion", "AppVersion", "AvSigVersion", "Census_OSVersion", "OsVer"):
        map_version2(df, version_col)

    #
    # this one is more complicated:
    # OsBuildLab: example: 7134.1.amd64fre.rs4_release.180410-1804
    #

    print("-- mapping OsBuildLab version numbers ...")
    df_split = df['OsBuildLab'].str.split(".", n=5, expand=True)
    df['OsBuildLab_major'] = df_split[0].astype(np.int16)
    df['OsBuildLab_minor'] = df_split[1].astype(np.int16)
    df['OsBuildLab_platform'] = df_split[2].astype('category')
    df['OsBuildLab_release'] = df_split[3].astype('category')
    # last dotted field looks like '180410-1804': two numbers joined by a dash
    df_build = df_split[4].str.split("-", n=1, expand=True)
    df['OsBuildLab_build1'] = df_build[0].astype(np.int32)
    df['OsBuildLab_build2'] = df_build[1].astype(np.int32)
    df['OsBuildLab_combined'] = (
        10000.0*df['OsBuildLab_major']
        + df['OsBuildLab_minor']
        + df['OsBuildLab_build1']/1000.0
        + df['OsBuildLab_build2']/1000000.0
    )
    # release the temporary split frames
    del df_split, df_build

    return df


In [None]:
# function to create new mini train, test and dev samples 
# pulling data from the training set using random stratification on the outcome variable

from sklearn.model_selection import train_test_split
def generate_train_test_dev_minis(n, features, labels):
    """Carve three disjoint, label-stratified mini sets out of `features`.

    The first split keeps a fraction n / len(features) of the rows as the mini
    train set. Each later split takes, from what is left, a fraction sized to
    match the previously drawn set. All splits use random_state=0.

    Returns (X_train, X_test, X_dev, y_train, y_test, y_dev); the rows left
    over after the third split are discarded.
    """
    # Split 1: mini train vs. everything else.
    holdout_fraction = 1 - n / features.shape[0]
    X_train, X_after_train, y_train, y_after_train = train_test_split(
        features,
        labels,
        stratify=labels,
        test_size=holdout_fraction,
        random_state=0,
    )

    # Split 2: mini test, sized relative to the train set just drawn.
    holdout_fraction = 1 - X_train.shape[0] / X_after_train.shape[0]
    X_test, X_after_test, y_test, y_after_test = train_test_split(
        X_after_train,
        y_after_train,
        stratify=y_after_train,
        test_size=holdout_fraction,
        random_state=0,
    )

    # Split 3: mini dev, sized relative to the test set; the rest is unused.
    holdout_fraction = 1 - X_test.shape[0] / X_after_test.shape[0]
    X_dev, _X_unused, y_dev, _y_unused = train_test_split(
        X_after_test,
        y_after_test,
        stratify=y_after_test,
        test_size=holdout_fraction,
        random_state=0,
    )

    return X_train, X_test, X_dev, y_train, y_test, y_dev




In [None]:
#
# function for saving frames, also dropping machine ids
#

def save_files(df, name):
    """Write `df` to data/<name>_clean.csv, leaving out the MachineIdentifier column.

    The caller's frame is not modified; the identifier column is removed only
    from the copy that gets written. The row index is not written.
    """
    target_path = "data/"+name+"_clean.csv"

    # machine identifiers are dropped so models don't train on them
    df.drop(columns=["MachineIdentifier"]).to_csv(target_path, index=False)
    
    

In [None]:
#
# main code part one: train dataset
#

# Wall-clock reference for the "seconds elapsed" progress lines below.
start = time.time()

# Pick the input: the small debug sample or the full training file.
if debug_wrangling:
    print("using mini_initial.csv dataset")
    filename = "data/mini_initial.csv"
else:        
    print("using big dataset")
    filename = "data/train.csv"
    
#
# initial load and casting to desired types
#

df = load_data(filename)
print("done loading train data")
print("total rows in set:", len(df))
print("seconds elapsed:", time.time()-start)

#
# make mini_initial, before cleaning and dropping, if necessary (to debug wrangler code)
#
   
if not debug_wrangling:
    print("creating mini_initial.csv for debug purposes ...")
    # fixed seed so the debug sample is the same on every full run
    df2 = df.sample(100000, random_state=123)
    df2.to_csv("data/mini_initial.csv", index=False)
    print("created mini_initial.csv data")
    print("seconds elapsed:", time.time()-start)
    
#
# main work is done here
#
    
print("cleaning data ...")
df = clean_data(df)
print("done cleaning train data")
print("seconds elapsed:", time.time()-start)

# also split out a big dev and test file here, let's say 1,000,000
# (500 rows each when running on the debug sample); the sampled rows are
# removed from df so train, dev and test do not overlap
df_dev = df.sample(1000000 if not debug_wrangling else 500, random_state=123).copy()
df = df.drop(df_dev.index)

df_test = df.sample(1000000 if not debug_wrangling else 500, random_state=123).copy()
df = df.drop(df_test.index)

#
# save clean master file
#

print("saving train_clean ...")
save_files(df, "train")

print("saving test files ...")
save_files(df_test, "test")
print("done saving clean test files")

print("saving dev files ...")
save_files(df_dev, "dev")
print("done saving clean dev files")

    
# make this work with a smaller size for the mini_initial set
sample_size = 50000 if len(df) > 1000000 else 500

# now call Kevin's code for the work
# (the full frame, label column included, is passed as the features so that
# the saved mini files keep their labels)
mini_train, mini_test, mini_dev, mini_train_labels, mini_test_labels, mini_dev_labels = \
    generate_train_test_dev_minis(sample_size, df, df['HasDetections'])
print("shape of mini_train:",mini_train.shape)
print("shape of mini_dev:",mini_dev.shape) 
print("shape of mini_test:",mini_test.shape)
print("done making mini sets")

#
# save clean mini files, all with labels
#

print("saving mini_train_clean ...")
save_files(mini_train, "mini_train")
print("saving mini_dev_clean ...")
save_files(mini_dev, "mini_dev")
print("saving mini_test_clean ...")
save_files(mini_test, "mini_test")
print("done saving mini files")


#
# need to also process big test file
#
    
if not debug_wrangling:
    print("loading test data ...")
    df_test = load_data("data/test.csv")
    print("done loading test data")
    print("seconds elapsed:", time.time()-start)
    print("total rows in test set:", len(df_test))
    print("cleaning test data ...")
    # Keep the returned frame: clean_data hands back the cleaned result, and
    # discarding it would save a frame without the NA fills and the derived
    # version columns.
    df_test = clean_data(df_test)
    print("done cleaning test data")
    print("seconds elapsed:", time.time()-start)
    print("saving clean test data ...")
    save_files(df_test, "real_test")


print("done saving clean files")
print("seconds elapsed:", time.time()-start)


using big dataset
done loading train data
total rows in set: 8921483
seconds elapsed: 161.6900098323822
creating mini_initial.csv for debug purposes ...
created mini_initial.csv data
seconds elapsed: 172.68184614181519
cleaning data ...
-- replacing weird characters ...
-- lower-casing where appropriate ...
-- making categories ...
-- adding categories ..
-- replacing selected NA values
-- converting columns to int ...
-- mapping version numbers ...
-- mapping OsBuildLab version numbers ...
done cleaning train data
seconds elapsed: 410.541775226593
saving train_clean ...
