In [1]:
# Environment setup and module import
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import gc
import matplotlib.pyplot as plt


%matplotlib inline

In [2]:
# Run-mode switches read by the encoding cell further down.
debug_encoding = False # when True, the full-size train/test/dev pass is skipped and only the mini_*_clean.csv files are encoded, so the run is fast
save_data = True # when False, no encoded CSV is written; per the original note, saving is the step that takes the most time


In [3]:
class Encoder:
    """Load a cleaned CSV and convert it into a numeric, standardized DataFrame.

    Constructing an instance runs the whole pipeline on ``in_file``:

    1. ``reduce_mem``   - downcast numeric columns, make text columns categorical
    2. ``make_subsets`` - record numeric / nominal / binary column names and labels
    3. ``encode_it``    - label-encode the nominal columns in place
    4. ``transform_df`` - rebuild ``self.df`` from encoded + numeric/bool columns
    5. ``std_norm``     - standardize the columns in ``DEFAULT_STD_COLS``

    The result is available as ``self.df`` and can be written with ``export_it``.

    Attributes set by the pipeline:
        df            -- the working / final DataFrame
        numeric_cols  -- names of columns with a numpy int/float dtype
        nominal_cols  -- names of the remaining columns (minus 'SmartScreen')
        binary_cols   -- non-nominal columns with exactly two distinct values
        labels        -- ``HasDetections`` values as a numpy array

    NOTE(review): every instance fits its *own* LabelEncoder and StandardScaler
    on its own file, so integer codes and scaling parameters are not shared
    between separately constructed encoders (e.g. train vs. test vs. dev).
    """

    # Columns standardized by std_norm() when the caller does not pass a list.
    DEFAULT_STD_COLS = [
        'AVProductStatesIdentifier', 'CountryIdentifier', 'CityIdentifier',
        'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild',
        'IeVerIdentifier', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier',
        'Census_ProcessorCoreCount', 'Census_ProcessorModelIdentifier',
        'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalBatteryNumberOfCharges', 'Census_OSBuildNumber',
        'Census_OSInstallLanguageIdentifier', 'Census_OSUILocaleIdentifier',
        'Census_FirmwareManufacturerIdentifier', 'Census_FirmwareVersionIdentifier',
        'Wdft_RegionIdentifier', 'OsBuildLab_major', 'OsBuildLab_minor',
        'OsBuildLab_platform', 'OsBuildLab_release', 'OsBuildLab_build2',
    ]

    # dtypes that transform_df() carries over unchanged, in output order.
    # The first six reproduce the original grouping; int64/float64 are appended
    # so that columns reduce_mem() could not shrink are no longer dropped.
    _PASSTHROUGH_DTYPES = (
        np.int8, np.int16, np.int32, np.float16, np.float32, bool,
        np.int64, np.float64,
    )

    def __init__(self, in_file):
        """Read ``in_file`` with ``pd.read_csv`` and run the full pipeline.

        Raises whatever the individual steps raise, e.g. ``KeyError`` when
        ``HasDetections`` or one of ``DEFAULT_STD_COLS`` is missing.
        """
        self.df = pd.read_csv(in_file)
        print("Completed read operation for", in_file)
        self.reduce_mem()
        gc.collect()
        self.make_subsets(self.df)
        self.encode_it()
        self.transform_df(self.df, self.nominal_cols)
        self.std_norm()

    def reduce_mem(self, verbose=True):
        """Downcast every column of ``self.df`` to a smaller dtype where it fits.

        * object / string columns become ``category``
        * columns whose dtype name starts with ``int`` get the first of
          int8/int16/int32/int64 whose range strictly contains [min, max]
        * every other column (floats, and also bools) gets float16 or float32
          when [min, max] lies strictly inside that type's range, else float64
        * columns that are already ``category`` are left alone, so calling
          this method twice is safe

        NOTE(review): the float downcast checks range only. float16 has an
        11-bit significand, so large integer-valued floats (e.g. identifiers
        stored as float) can be rounded. Confirm this is acceptable.

        Args:
            verbose: print the before/after memory report when True.

        Returns:
            ``self.df`` (modified in place).
        """
        start_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if str(col_type) == 'category':
                # Already converted on a previous call; min()/max() would fail.
                continue

            if col_type == object or pd.api.types.is_string_dtype(col_type):
                self.df[col] = self.df[col].astype('category')
                continue

            c_min = self.df[col].min()
            c_max = self.df[col].max()

            if str(col_type).startswith('int'):
                # If no candidate fits, the column is left untouched.
                for int_type in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(int_type)
                    if c_min > limits.min and c_max < limits.max:
                        self.df[col] = self.df[col].astype(int_type)
                        break
            else:
                # NaN min/max (all-missing column) fails both checks -> float64.
                for float_type in (np.float16, np.float32):
                    limits = np.finfo(float_type)
                    if c_min > limits.min and c_max < limits.max:
                        self.df[col] = self.df[col].astype(float_type)
                        break
                else:
                    self.df[col] = self.df[col].astype(np.float64)

        if verbose:
            end_mem = self.df.memory_usage().sum() / 1024**2
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            if start_mem:
                print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return self.df

    def make_subsets(self, df):
        """Record column groupings and the label vector.

        Column lists are derived from ``self.df``; only the labels are read
        from the ``df`` argument (the constructor passes ``self.df`` for both).

        'SmartScreen' is excluded from ``nominal_cols`` when present there, so
        it is not label-encoded. NOTE(review): marked "still need to fix" by
        the original author.

        Raises:
            KeyError: if ``df`` has no ``HasDetections`` column.
        """
        numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.numeric_cols = [c for c, v in self.df.dtypes.items() if v in numerics]
        numeric_lookup = set(self.numeric_cols)
        self.nominal_cols = [c for c in self.df.columns if c not in numeric_lookup]
        # Andrew - still need to fix this
        # Guarded: the column may be absent or numeric, and list.remove()
        # would otherwise raise ValueError.
        if 'SmartScreen' in self.nominal_cols:
            self.nominal_cols.remove('SmartScreen')
        self.binary_cols = [
            c for c in self.df.columns
            if self.df[c].nunique() == 2 and c not in self.nominal_cols
        ]
        self.labels = df["HasDetections"].values
        print("subsets are complete")
        return

    def transform_df(self, in_df, nominal_cols):
        """Rebuild ``self.df`` from label-encoded nominal columns plus
        every numeric/bool column of ``in_df``.

        Output column order: ``nominal_cols`` first, then the remaining columns
        grouped by dtype in ``_PASSTHROUGH_DTYPES`` order. Columns of any other
        dtype (e.g. a ``category`` column not listed in ``nominal_cols``) are
        not carried over.

        Args:
            in_df: source DataFrame.
            nominal_cols: names of the columns to label-encode.
        """
        encoded = pd.DataFrame(index=in_df.index)
        for col in nominal_cols:
            # A fresh encoder per column; nothing is built when the list is empty.
            encoded[col] = preprocessing.LabelEncoder().fit_transform(in_df[col])

        for wanted in self._PASSTHROUGH_DTYPES:
            for col, col_dtype in in_df.dtypes.items():
                # Never overwrite a column that was just encoded above.
                if col_dtype == wanted and col not in encoded.columns:
                    encoded[col] = in_df[col]

        self.df = encoded
        print("completed transforming dtypes")
        return

    def std_norm(self, col_to_std=None):
        """Standardize selected columns of ``self.df`` with ``StandardScaler``.

        Works on a copy and rebinds ``self.df`` to it.

        Args:
            col_to_std: column names to scale; defaults to ``DEFAULT_STD_COLS``.
                An empty list leaves the data unchanged.

        Raises:
            KeyError: if a requested column is missing from ``self.df``.
        """
        cols = list(self.DEFAULT_STD_COLS if col_to_std is None else col_to_std)
        scaled_features = self.df.copy()
        if cols:
            values = scaled_features[cols].values
            scaler = StandardScaler().fit(values)
            scaled_features[cols] = scaler.transform(values)
        self.df = scaled_features
        print("completed standardization and normalization")
        return

    def encode_it(self):
        """Label-encode each column named in ``self.nominal_cols`` in place.

        The encoder is refit for every column, so the integer codes are
        independent from one column to the next.
        """
        le = preprocessing.LabelEncoder()
        for n in self.nominal_cols:
            self.df[n] = le.fit_transform(self.df[n])
        print("completed encoding")
        return

    def export_it(self, out_file):
        """Write ``self.df`` to ``out_file`` as CSV (the index is included)."""
        self.df.to_csv(out_file)
        print("export complete")

In [None]:
# Full-size pass: skipped entirely when debug_encoding is set.
if not debug_encoding:
    train_enc = Encoder('data/train_clean.csv')
    test_enc = Encoder('data/real_test_clean.csv')
    dev_enc = Encoder('data/dev_clean.csv')
    if save_data:
        for encoder, out_path in (
            (train_enc, 'data/train_encoded.csv'),
            (test_enc, 'data/test_encoded.csv'),
            (dev_enc, 'data/dev_encoded.csv'),
        ):
            encoder.export_it(out_path)
        print("Full output saved.")

# Mini pass: always runs, and rebinds train_enc / test_enc / dev_enc to the
# encoders built from the mini files.
train_enc = Encoder('data/mini_train_clean.csv')
test_enc = Encoder('data/mini_test_clean.csv')
dev_enc = Encoder('data/mini_dev_clean.csv')
if save_data:
    for encoder, out_path in (
        (train_enc, 'data/mini_train_encoded.csv'),
        (test_enc, 'data/mini_test_encoded.csv'),
        (dev_enc, 'data/mini_dev_encoded.csv'),
    ):
        encoder.export_it(out_path)
    print("Mini output saved.")

Completed read operation for data/train_clean.csv
Memory usage of dataframe is 5640.08 MB
