In [1]:
# Environment setup and module import
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import gc
import matplotlib.pyplot as plt


%matplotlib inline

In [2]:
# Run switches for the encoding driver cell.
#
# debug_encoding: when True, only the small mini_*_clean.csv files are encoded
# so the notebook runs fast; when False, the full files are encoded as well.
debug_encoding = False
# save_data: when False, the encoded frames are not written to disk (writing
# is the step that takes the most time).
save_data = True


In [5]:
# Column name -> dtype string, handed to pd.read_csv(dtype=...) by Encoder.
# Key order is kept as-is; only the layout differs from a column-aligned table.
dtypes = {
    # --- machine / product / OS columns ---
    'MachineIdentifier': 'str',
    'ProductName': 'str',
    'EngineVersion': 'str',
    'AppVersion': 'str',
    'AvSigVersion': 'str',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float64',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float32',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int16',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'str',
    'Firewall': 'float16',
    'UacLuaenable': 'float64',
    # --- Census_* columns ---
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float32',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float32',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float64',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float64',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'str',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float32',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float32',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float32',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'str',
    'Census_InternalBatteryNumberOfCharges': 'float64',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'str',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    # --- Wdft_* columns and the HasDetections column ---
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}



In [6]:
class Encoder:
    """Read a cleaned CSV and encode it into a numeric, standardized frame.

    Instantiating the class runs the whole pipeline on ``in_file``:

    1. read the CSV using the module-level ``dtypes`` mapping,
    2. ``reduce_mem``   - shrink integer columns, cast other non-object
       columns to ``category``,
    3. ``make_subsets`` - build the numeric / nominal / binary / unary lists,
    4. ``encode_it``    - label-encode the nominal columns in place,
    5. ``transform_df`` - rebuild ``self.df`` from the nominal columns plus
       columns of a fixed set of dtypes,
    6. ``std_norm``     - standardize a fixed list of columns.

    The resulting frame is kept in ``self.df``; ``export_it`` writes it to CSV.
    """

    def __init__(self, in_file):
        """Load ``in_file`` and run every encoding step on it.

        Args:
            in_file: path (or anything ``pd.read_csv`` accepts) of the cleaned CSV.
        """
        self.df = pd.read_csv(in_file, dtype=dtypes)
        print("Completed read operation for", in_file)
        self.reduce_mem()
        gc.collect()
        self.make_subsets(self.df)
        self.encode_it()
        self.transform_df(self.df, self.nominal_cols)
        self.std_norm()

    def reduce_mem(self, verbose=True):
        """Shrink ``self.df`` in place and return it.

        * object columns are left untouched;
        * integer columns are cast to the smallest of int8/int16/int32/int64
          whose range contains the column's min and max (bounds inclusive);
        * every other non-object column is cast to ``category``.

        Args:
            verbose: when True (default) print memory usage before/after.

        Returns:
            ``self.df`` after the casts.
        """
        start_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type != object:
                if str(col_type)[:3] == 'int':
                    # min/max are only computed for integer columns: calling
                    # them on an unordered categorical raises TypeError.
                    c_min = self.df[col].min()
                    c_max = self.df[col].max()
                    for candidate in (np.int8, np.int16, np.int32, np.int64):
                        limits = np.iinfo(candidate)
                        # Inclusive bounds, so a value sitting exactly on a
                        # type's limit still fits that type.
                        if limits.min <= c_min and c_max <= limits.max:
                            self.df[col] = self.df[col].astype(candidate)
                            break
                else:
                    # Floats are deliberately NOT downcast (downcasting was
                    # reported to corrupt mapped values). They fall through to
                    # this cast together with every other non-integer,
                    # non-object dtype, which makes make_subsets classify them
                    # as nominal and encode_it label-encode them.
                    # NOTE(review): confirm that label-encoding float columns
                    # is intended; kept because the encoded output depends on it.
                    self.df[col] = self.df[col].astype('category')

        end_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            # Guard the ratio so a zero-sized frame cannot divide by zero.
            saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Decreased by {:.1f}%'.format(saved_pct))

        return self.df

    def make_subsets(self, df):
        """Derive the column subsets used by the later steps.

        Sets these attributes:

        * ``numeric_cols`` - columns of ``self.df`` with an int/float dtype;
        * ``nominal_cols`` - every other column of ``self.df`` except
          ``'SmartScreen'``, which is excluded on purpose (open TODO);
        * ``binary_cols`` / ``unary_cols`` - non-nominal columns with exactly
          two / exactly one distinct value;
        * ``labels`` - values of ``df['HasDetections']``, or ``None`` when the
          column is absent.

        Args:
            df: frame the labels are read from. The column subsets themselves
                are always computed from ``self.df``.
        """
        numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.numeric_cols = [c for c, v in self.df.dtypes.items() if v in numerics]
        self.nominal_cols = [c for c in self.df.columns if c not in self.numeric_cols]
        # Andrew - still need to fix this
        # SmartScreen is kept out of the nominal set; the guard avoids a
        # ValueError when the column is not present in the frame.
        if 'SmartScreen' in self.nominal_cols:
            self.nominal_cols.remove('SmartScreen')

        # nunique() scans the whole column, so compute it once per candidate.
        nominal_lookup = set(self.nominal_cols)
        distinct_counts = [
            (c, self.df[c].nunique())
            for c in self.df.columns
            if c not in nominal_lookup
        ]
        self.binary_cols = [c for c, count in distinct_counts if count == 2]
        self.unary_cols = [c for c, count in distinct_counts if count == 1]

        # Always define the attribute so unlabeled data does not leave it missing.
        self.labels = df["HasDetections"].values if "HasDetections" in df.columns else None
        print("subsets are complete")
        return

    def transform_df(self, in_df, nominal_cols):
        """Rebuild ``self.df`` from ``in_df``.

        The new frame starts with ``nominal_cols`` (each label-encoded), then
        gets the int8, int16, int32, float16, float32 and bool columns of
        ``in_df`` appended in that dtype order.

        NOTE(review): columns of any other dtype (e.g. int64, float64, object)
        that are not in ``nominal_cols`` are not carried over - confirm that
        dropping them is intended.

        Args:
            in_df: source frame.
            nominal_cols: names of the columns to label-encode.
        """
        le = preprocessing.LabelEncoder()
        tmp_df = in_df[nominal_cols].apply(le.fit_transform)

        for kept_dtype in (np.int8, np.int16, np.int32, np.float16, np.float32):
            # Select names through the dtype mask instead of slicing the frame,
            # which would copy every matching column just to read its name.
            dtype_mask = (in_df.dtypes == kept_dtype).values
            for c in in_df.columns[dtype_mask]:
                tmp_df[c] = in_df[c]
        for c in in_df.select_dtypes(bool).columns:
            tmp_df[c] = in_df[c]

        self.df = tmp_df
        print("completed transforming dtypes")
        return

    def std_norm(self):
        """Standardize a fixed list of columns with ``StandardScaler``.

        The scaler is fit on, and applied to, the listed columns of a copy of
        ``self.df``; the copy then replaces ``self.df``. Every listed column
        must be present in the frame.
        """
        col_to_std = [
            'AVProductStatesIdentifier',
            'CountryIdentifier',
            'CityIdentifier',
            'GeoNameIdentifier',
            'LocaleEnglishNameIdentifier',
            'OsBuild',
            'IeVerIdentifier',
            'Census_OEMNameIdentifier',
            'Census_OEMModelIdentifier',
            'Census_ProcessorCoreCount',
            'Census_ProcessorModelIdentifier',
            'Census_PrimaryDiskTotalCapacity',
            'Census_SystemVolumeTotalCapacity',
            'Census_TotalPhysicalRAM',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches',
            'Census_InternalPrimaryDisplayResolutionHorizontal',
            'Census_InternalPrimaryDisplayResolutionVertical',
            'Census_InternalBatteryNumberOfCharges',
            'Census_OSBuildNumber',
            'Census_OSInstallLanguageIdentifier',
            'Census_OSUILocaleIdentifier',
            'Census_FirmwareManufacturerIdentifier',
            'Census_FirmwareVersionIdentifier',
            'Wdft_RegionIdentifier',
            'OsBuildLab_major',
            'OsBuildLab_minor',
            'OsBuildLab_platform',
            'OsBuildLab_release',
            'OsBuildLab_build2',
        ]
        scaled_features = self.df.copy()
        features = scaled_features[col_to_std]
        scaler = StandardScaler().fit(features.values)
        features = scaler.transform(features.values)
        scaled_features[col_to_std] = features
        self.df = scaled_features
        print("completed standardization and normalization")
        return

    def encode_it(self):
        """Label-encode every column in ``self.nominal_cols`` in place.

        A single ``LabelEncoder`` is re-fit for each column, so the fitted
        classes are not kept after the call.
        """
        le = preprocessing.LabelEncoder()
        for n in self.nominal_cols:
            self.df[n] = le.fit_transform(self.df[n])
        print("completed encoding")
        return

    def export_it(self, out_file):
        """Write ``self.df`` to ``out_file`` as CSV without the index column."""
        self.df.to_csv(out_file, index=False)
        print("export complete")

In [None]:
# Input CSVs and the destinations of their encoded versions; the four lists
# are index-aligned (train, test, dev).
in_files = ['data/train_clean.csv', 'data/real_test_clean.csv', 'data/dev_clean.csv']
out_files = ['data/train_encoded.csv', 'data/test_encoded.csv', 'data/dev_encoded.csv']
in_mini_files = ['data/mini_train_clean.csv', 'data/mini_test_clean.csv', 'data/mini_dev_clean.csv']
out_mini_files = ['data/mini_train_encoded.csv', 'data/mini_test_encoded.csv', 'data/mini_dev_encoded.csv']

for i, (mini_src, mini_dst) in enumerate(zip(in_mini_files, out_mini_files)):
    # The full-size file is only processed outside of debug mode.
    if not debug_encoding:
        enc = Encoder(in_files[i])
        if save_data:
            enc.export_it(out_files[i])
    # The mini file is encoded on every run, debug or not.
    enc = Encoder(mini_src)
    if save_data:
        enc.export_it(mini_dst)