In [0]:
import numpy as np 
import pandas as pd 
import dask.dataframe as dd
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder


pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 100)

import sys
if not sys.warnoptions:
  import warnings
  warnings.simplefilter('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
def reduce_mem_usage(df, verbose=True):
  """
  Iterate through all the columns of a dataframe and modify the data types
  to reduce memory usage.        
  """
  df = df.copy()
  
  start_mem = df.memory_usage().sum() / 1024**2
  start_mem_gb = start_mem / 1024
  
  numeric_dtype = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  
  for col in df:
    col_type = str(df[col].dtypes)
        
    if col_type in numeric_dtype:
      c_min = df[col].min()
      c_max = df[col].max()
      if col_type[:3] == 'int':
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
          df[col] = df[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
          df[col] = df[col].astype(np.int16)
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
          df[col] = df[col].astype(np.int32)
        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
          df[col] = df[col].astype(np.int64)  
      else:  # column is not int
        if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
          df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
          df[col] = df[col].astype(np.float32)
        else:
          df[col] = df[col].astype(np.float64)
    else:  # column is an object
      df[col] = df[col].astype('category')

  end_mem = df.memory_usage().sum() / 1024**2
  end_mem_gb = end_mem / 1024
  
  if verbose:
    print(f'Memory usage of dataframe is {start_mem:.2f} MB',
        f'/ {start_mem_gb:.2f} GB')
    print(f'Memory usage after optimization is: {end_mem:.2f} MB',
        f'/ {end_mem_gb:.2f} GB')
    mem_dec = 100 * (start_mem - end_mem) / start_mem
    print(f'Decreased by {mem_dec:.1f}%')
    
  return df


def import_data(file, dtypes=None):
  """
  Create a dataframe using dask for faster speed
  """

  ddf = dd.read_csv(file, dtype=dtypes)
  df = ddf.compute()
  return df

In [0]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'float16'
}

In [0]:
nan_minus1 = ['Census_SystemVolumeTotalCapacity',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'CountryIdentifier',
 'CityIdentifier',
 'OrganizationIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'OsBuild',
 'OsSuite',
 'IeVerIdentifier',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorCoreCount',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_PrimaryDiskTotalCapacity',
 'Census_TotalPhysicalRAM',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
 'Census_InternalPrimaryDisplayResolutionHorizontal',
 'Census_InternalPrimaryDisplayResolutionVertical',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSBuildNumber',
 'Census_OSBuildRevision',
 'Census_OSInstallLanguageIdentifier',
 'Census_OSUILocaleIdentifier',
 'Census_FirmwareManufacturerIdentifier',
 'Census_FirmwareVersionIdentifier',
 'Wdft_RegionIdentifier',
 'Census_SystemVolumeTotalCapacity',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'CountryIdentifier',
 'CityIdentifier',
 'OrganizationIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'OsBuild',
 'OsSuite',
 'IeVerIdentifier',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorCoreCount',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_PrimaryDiskTotalCapacity',
 'Census_TotalPhysicalRAM',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
 'Census_InternalPrimaryDisplayResolutionHorizontal',
 'Census_InternalPrimaryDisplayResolutionVertical',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSBuildNumber',
 'Census_OSBuildRevision',
 'Census_OSInstallLanguageIdentifier',
 'Census_OSUILocaleIdentifier',
 'Census_FirmwareManufacturerIdentifier',
 'Census_FirmwareVersionIdentifier',
 'Wdft_RegionIdentifier']

nan_0 = ['Census_HasOpticalDiskDrive',
 'Census_IsAlwaysOnAlwaysConnectedCapable',
 'Census_IsSecureBootEnabled',
 'Census_IsTouchEnabled',
 'SMode',
 'Wdft_IsGamer',
 'Census_HasOpticalDiskDrive',
 'Census_IsAlwaysOnAlwaysConnectedCapable',
 'Census_IsSecureBootEnabled',
 'Census_IsTouchEnabled',
 'SMode',
 'Wdft_IsGamer']

nan_1 = ['IsProtected', 'IsProtected']

In [0]:
for key in nan_minus1:
  dtypes[key] = 'int32'

for key in nan_1:
  dtypes[key] = 'int8'

for key in nan_0:
  dtypes[key] = 'int8'

In [0]:
target = 'HasDetections'
data_id = 'MachineIdentifier'
file_path = 'gdrive/My Drive/build-unit-2-data/'

In [8]:
%time train = import_data(file_path + 'train_clean.csv', dtypes=dtypes)
train.shape

CPU times: user 2min 43s, sys: 8.45 s, total: 2min 51s
Wall time: 1min 49s


(8921483, 61)

In [9]:
train = reduce_mem_usage(train)
train[target] = train[target].astype('int8')

Memory usage of dataframe is 1834.90 MB / 1.79 GB
Memory usage after optimization is: 1460.54 MB / 1.43 GB
Decreased by 20.4%


In [10]:
%time test = import_data(file_path + 'test_clean.csv', dtypes=dtypes)
test.shape

CPU times: user 2min 26s, sys: 5.16 s, total: 2min 31s
Wall time: 1min 29s


(7853253, 61)

In [11]:
test = reduce_mem_usage(test)
test[target] = test[target].fillna(-1)
test[target] = test[target].astype('int8')

Memory usage of dataframe is 1653.57 MB / 1.61 GB
Memory usage after optimization is: 1368.97 MB / 1.34 GB
Decreased by 17.2%


## Encode Features

In order to get the machine learning models to work, we need to encode the data into integers so that the model will accept it. We will achieve this through two types of encoding:

1. Label encoding: randomly transform the values to numbers between $0$ and $n-1$, where $n$ is the number of feature labels.
2. Frequency encoding: transform the values to numbers between $0$ and $m$, where $m$ is the number of feature labels with a frequency $\geq 2$, from highest to lowest frequency. This is a special type of label encoding.

In [12]:
df = pd.concat([train, test], sort=True)
df.shape

(16774736, 61)

In [13]:
gc.enable()
del test, train
gc.collect()

0

In [14]:
df = reduce_mem_usage(df)
df = df.reset_index()

Memory usage of dataframe is 3359.51 MB / 3.28 GB
Memory usage after optimization is: 2768.14 MB / 2.70 GB
Decreased by 17.6%


In [0]:
freq_encoding_list = [
 'EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'OsBuildLab',
 'Census_OSVersion'
]

In [0]:
features_by_hand = [
 'MachineIdentifier',
 'EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'Processor',
 'OsPlatformSubRelease',
 'OsBuildLab',
 'SkuEdition',
 'SmartScreen',
 'Census_MDC2FormFactor',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_PowerPlatformRoleName',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSSkuName',
 'Census_OSInstallTypeName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_GenuineStateName',
 'Census_ActivationChannel',
 'Census_FlightRing',
 'Census_MDC2FormFactor_new' 
]

In [0]:
label_encoding_list = list(set(features_by_hand) - set(freq_encoding_list))
label_encoding_list.remove(data_id)

In [0]:
# Create a function for frequency encoding
def frequency_encoding(feature):
  # Count the number of values of each feature and reset the indices
  t = df[feature].value_counts().reset_index()

  # Build up a new index (old index is set to 'level_0')
  t = t.reset_index()

  # Set the old index 'level_0' for all values which only occur once to NaN
  t.loc[t[feature] == 1, 'level_0'] = np.nan

  # Reset the original index (= the value name) as index
  t.set_index('index', inplace=True)

  # Return the number of values which occur two or more times, add 1
  max_label = t['level_0'].max() + 1

  # Fill all NaNs to maximum label
  t.fillna(max_label, inplace=True)

  return t.to_dict()['level_0']

In [0]:
encoding_dict = dict()

In [20]:
# Encode all features in freq_encoding_list
for feature in tqdm(freq_encoding_list):
  freq_encoding_dict = frequency_encoding(feature)
  df[feature] = df[feature].map(lambda x: freq_encoding_dict.get(x, np.nan))
  df[feature] = df[feature].astype('int32')

  encoding_dict[feature] = freq_encoding_dict

100%|██████████| 5/5 [00:01<00:00,  3.14it/s]


In [21]:
# Encode all features in label_encoding_list
for feature in tqdm(label_encoding_list):
  le = LabelEncoder()
  df[feature] = le.fit_transform(df[feature])
  df[feature] = df[feature].astype('int32')

  encoding_dict[feature] = le

100%|██████████| 18/18 [00:49<00:00,  2.72s/it]


In [29]:
# Restore training data
train = df[df[target] != -1]
train.shape

(8921483, 62)

In [27]:
# Restore test data
test = df[df[target] == -1]
test.shape

(7853253, 62)

In [30]:
# Save cleaned, feature engineered, and encoded data
%time train.to_csv(file_path + 'train_modelready.csv', index=False)
%time test.to_csv(file_path + 'test_modelready.csv', index=False)

CPU times: user 7min 4s, sys: 3.25 s, total: 7min 7s
Wall time: 7min 15s
CPU times: user 6min 16s, sys: 2.51 s, total: 6min 19s
Wall time: 6min 54s


In [32]:
encoding_dict['SmartScreen']

LabelEncoder()