In [9]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm #progress meter
import gc

from sklearn.model_selection import KFold
import warnings
import gc #https://stackify.com/python-garbage-collection/
import time
import sys
import datetime

import os

In [10]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'float16', #'int8'
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'float16', #'int8'
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'float16', #'int8'
        'CountryIdentifier':                                    'float16', #'int16'
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'float16', #'int8'
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'float16', #'int16'
        'OsSuite':                                              'float16', #'int16'
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'float16', #'int8'
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'float16', #'int8'
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'float16', #'int16'
        'Census_OSBuildRevision':                               'float32', #'int32'
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'float16', #'int16'
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'float16', #'int8'
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'float16', #'int8'
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'float16', #'int8'
        'Census_IsPenCapable':                                  'float16', #'int8'
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'float16', #'int8'
        }

In [11]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [c for c,v in dtypes.items() if v in numerics]
categorical_columns = [c for c,v in dtypes.items() if v not in numerics]

In [12]:
retained_columns = numerical_columns + categorical_columns
train = pd.read_csv("../Raw data/train.csv",
                    usecols = retained_columns,
                    dtype = dtypes)

In [13]:
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
]

binary_variables = [c for c in train.columns if train[c].nunique() == 2]

categorical_columns = [c for c in train.columns 
                       if (c not in true_numerical_columns) & (c not in binary_variables)]
                       

variables = {
    'categorical_columns': len(categorical_columns),
    'binary_variables': len(binary_variables),
    'true_numerical_columns': len(true_numerical_columns)
}

cardinality = []
for c in categorical_columns:
    if c == 'MachineIdentifier': continue
    cardinality.append([c, train[c].nunique()])
cardinality.sort(key = lambda x:x[1], reverse=False)

def frequency_encoding(variable):
    t = train[variable].value_counts().reset_index()
    t = t.reset_index()
    t.loc[t[variable] == 1, 'level_0'] = np.nan
    t.set_index('index', inplace=True)
    max_label = t['level_0'].max() + 1
    t.fillna(max_label, inplace=True)
    return t.to_dict()['level_0']

In [14]:
indexer = {}
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    _, indexer[col] = pd.factorize(train[col])
    
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    train[col] = indexer[col].get_indexer(train[col])

100%|██████████| 54/54 [00:08<00:00,  6.40it/s]
100%|██████████| 54/54 [36:54<00:00, 41.01s/it]


In [15]:
freq_enc_dict_dict = {}
for variable in tqdm(['Census_OEMModelIdentifier', 'CityIdentifier', 'Census_FirmwareVersionIdentifier']):
    freq_enc_dict_dict[variable] = frequency_encoding(variable)
    train[variable] = train[variable].map(lambda x: freq_enc_dict_dict[variable].get(x, np.nan))

100%|██████████| 3/3 [00:16<00:00,  5.51s/it]


In [16]:
verbose=True
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
start_mem = train.memory_usage().sum() / 1024**2    
for col in tqdm(train.columns):
    col_type = train[col].dtypes
    if col_type in numerics:
        c_min = train[col].min()
        c_max = train[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                train[col] = train[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                train[col] = train[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                train[col] = train[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                train[col] = train[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                train[col] = train[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                train[col] = train[col].astype(np.float32)
            else:
                train[col] = train[col].astype(np.float64)    
end_mem = train.memory_usage().sum() / 1024**2
if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

100%|██████████| 83/83 [00:08<00:00, 10.33it/s]

Mem. usage decreased to 1604.74 Mb (65.0% reduction)





In [17]:
gc.collect()
gc.collect()

0

In [18]:
train.to_csv('new_train.csv')

In [19]:
retained_columns.remove('HasDetections')
test = pd.read_csv('../Raw data/test.csv',
                   usecols = retained_columns,
                   dtype = dtypes)

In [20]:
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier': continue
    test[col] = indexer[col].get_indexer(test[col])

100%|██████████| 54/54 [30:58<00:00, 34.42s/it]


In [21]:
for variable in tqdm(['Census_OEMModelIdentifier', 'CityIdentifier', 'Census_FirmwareVersionIdentifier']):
    test[variable] = test[variable].map(lambda x: freq_enc_dict_dict[variable].get(x, np.nan))

100%|██████████| 3/3 [00:15<00:00,  5.10s/it]


In [22]:
verbose=True
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
start_mem = test.memory_usage().sum() / 1024**2    
for col in tqdm(test.columns):
    col_type = test[col].dtypes
    if col_type in numerics:
        c_min = test[col].min()
        c_max = test[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                test[col] = test[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                test[col] = test[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                test[col] = test[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                test[col] = test[col].astype(np.int64)  
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                test[col] = test[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                test[col] = test[col].astype(np.float32)
            else:
                test[col] = test[col].astype(np.float64)    
end_mem = test.memory_usage().sum() / 1024**2
if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

100%|██████████| 82/82 [00:08<00:00,  9.77it/s]

Mem. usage decreased to 1435.93 Mb (64.6% reduction)





In [23]:
gc.collect()
gc.collect()

0

In [24]:
test.to_csv('new_test.csv')

In [25]:
test.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier
0,0000010489e3af074adeac69c53e555e,0,-1,-1,-1,0.0,0,0.0,-1,0,...,24,4404.0,1.0,,0.0,0.0,0.0,0.0,0.0,5
1,00000176ac758d54827acd545b6315a5,0,-1,4,-1,0.0,0,0.0,-1,0,...,14,1.0,1.0,,0.0,0.0,0.0,0.0,1.0,8
2,0000019dcefc128c2d4387c1273dae1d,0,3,4,-1,0.0,0,0.0,-1,14,...,4,31.0,1.0,,0.0,0.0,0.0,0.0,1.0,6
3,0000055553dc51b1295785415f1a224d,0,-1,-1,-1,0.0,0,0.0,-1,79,...,0,210.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,00000574cefffeca83ec8adf9285b2bf,0,-1,4,-1,0.0,0,0.0,-1,0,...,4,22.0,1.0,,0.0,0.0,0.0,0.0,1.0,2
