In [1]:
# Environment setup and module import
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import gc
import matplotlib.pyplot as plt


%matplotlib inline

In [2]:
# Run-time switches read by the load and save cells of this notebook.
debug_encoding = False # if True, reads/writes the small "mini_*" CSV files instead of the full ones so the notebook runs fast
save_data = True # if False, the encoded frames are not written to disk (saving is the slowest step)


In [None]:
# Import DF from CSV
#
# debug_encoding picks the small "mini_*" files; otherwise the full cleaned
# splits are loaded. The three path names are rebound later by the save cell.
if debug_encoding:
    train_file = 'data/mini_train_clean.csv'
    test_file = 'data/mini_test_clean.csv'
    dev_file = 'data/mini_dev_clean.csv'
else:
    train_file = 'data/train_clean.csv'
    test_file = 'data/real_test_clean.csv'
    dev_file = 'data/dev_clean.csv'

# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so a column's dtype is inferred from all of its values.
train_df = pd.read_csv(train_file, low_memory=False)
test_df = pd.read_csv(test_file, low_memory=False)
dev_df = pd.read_csv(dev_file, low_memory=False)

In [None]:
def _float16_is_lossless(series):
    """Return True when casting ``series`` to float16 and back changes no value."""
    round_tripped = series.astype(np.float16).astype(series.dtype)
    # Series.equals treats NaNs in the same position as equal.
    return round_tripped.equals(series)


def reduce_mem(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * object / string columns become ``category``.
    * integer columns become the smallest of int8/int16/int32 whose range
      (bounds inclusive) holds the column's min and max; otherwise unchanged.
    * float columns become float16 only when they fit its range AND the cast
      is lossless (float16 cannot represent e.g. 2049, so identifier-like
      columns are kept wider); otherwise float32 when the values fit its
      range (this may still round), otherwise they are left unchanged.
    * every other dtype (bool, category, datetime, ...) is left untouched, so
      the function can safely be re-run on its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory footprint before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Calling min()
        # on an unordered categorical raises, and bools must not be turned
        # into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no value range to fit.
            continue

        if col_type.kind in 'iu':
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if (f16.min <= c_min and c_max <= f16.max
                    and _float16_is_lossless(df[col])):
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def make_subsets(df):
    """Split the columns of ``df`` into numeric / nominal / binary name lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified by dtype.

    Returns
    -------
    numeric_cols : list
        Columns whose dtype is one of int8..int64 / float16..float64,
        in frame order.
    nominal_cols : list
        Every other column, in frame order.
    binary_cols : list
        Numeric columns with exactly two distinct non-null values.
    labels : numpy.ndarray or None
        Values of the ``HasDetections`` column, or None when the frame has no
        such column (e.g. an unlabeled test split).
    """
    numeric_names = {'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    numeric_cols = [c for c, dtype in df.dtypes.items() if str(dtype) in numeric_names]
    numeric_lookup = set(numeric_cols)
    nominal_cols = [c for c in df.columns if c not in numeric_lookup]
    # Only numeric columns can qualify, so nunique() is not computed for the
    # (potentially high-cardinality) nominal columns.
    binary_cols = [c for c in numeric_cols if df[c].nunique() == 2]
    labels = df["HasDetections"].values if "HasDetections" in df.columns else None
    return numeric_cols, nominal_cols, binary_cols, labels

def transform_df(in_df, nominal_cols):
    """Build a model-ready frame from ``in_df``.

    The result contains, in this order:

    1. every column named in ``nominal_cols``: non-numeric columns are
       label-encoded; columns that are already numeric are passed through
       unchanged, so codes assigned earlier are not renumbered per frame;
    2. the int8, int16, int32, float16 and float32 columns of ``in_df``;
    3. the bool columns of ``in_df``.

    Columns of any other dtype (e.g. int64 / float64) that are not listed in
    ``nominal_cols`` are NOT carried over.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Source frame; it is not modified.
    nominal_cols : list
        Names of the nominal columns of ``in_df``.

    Returns
    -------
    pandas.DataFrame
        New frame sharing the index of ``in_df``.
    """
    le = preprocessing.LabelEncoder()
    encoded = {}
    for c in nominal_cols:
        column = in_df[c]
        if pd.api.types.is_numeric_dtype(column.dtype):
            encoded[c] = column
        else:
            encoded[c] = le.fit_transform(column)
    df = pd.DataFrame(encoded, index=in_df.index)

    # Always copy from in_df itself: reading another frame here would mix
    # rows of a different split into the result.
    for narrow_dtype in (np.int8, np.int16, np.int32, np.float16, np.float32):
        for c in in_df.columns[(in_df.dtypes == narrow_dtype).values]:
            df[c] = in_df[c]
    for c in in_df.select_dtypes(bool).columns:
        df[c] = in_df[c]
    return df

def make_encoded_version_column(df, col):
    """Combine the four version components of ``col`` into one number.

    Reads ``<col>_major``, ``<col>_minor``, ``<col>_build1`` and
    ``<col>_build2`` and writes
    ``major * 10000000 + minor * 100000 + build1 * 1000 + build2`` into
    ``df[col]`` (created or overwritten). The component columns are kept.

    The components are widened to int64 (integer/bool input) or float64
    (anything else) before the arithmetic: in a narrow dtype such as int8 or
    float16 the products would overflow.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the component columns; modified in place.
    col : str
        Base name of the version column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    def _component(suffix):
        part = df[col + suffix]
        wide_dtype = np.int64 if part.dtype.kind in 'iub' else np.float64
        return part.astype(wide_dtype)

    df[col] = (
        _component('_major') * 10000000
        + _component('_minor') * 100000
        + _component('_build1') * 1000
        + _component('_build2')
    )
    return df

def std_norm(df, fit_df=None):
    """Return a copy of ``df`` with a fixed list of columns standardized.

    A ``StandardScaler`` is fitted on the listed columns of ``fit_df`` (or of
    ``df`` itself when ``fit_df`` is None) and then applied to the same
    columns of ``df``. All other columns are copied unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize; it is not modified. It must contain every
        column in the list below.
    fit_df : pandas.DataFrame, optional
        Frame supplying the scaling statistics, e.g. the training split, so
        that other splits are scaled with the training statistics. Defaults
        to ``df``.

    Returns
    -------
    pandas.DataFrame
        Standardized copy of ``df``.
    """
    col_to_std = [
        'AVProductStatesIdentifier',
        'CountryIdentifier',
        'CityIdentifier',
        'GeoNameIdentifier',
        'LocaleEnglishNameIdentifier',
        'OsBuild',
        'IeVerIdentifier',
        'Census_OEMNameIdentifier',
        'Census_OEMModelIdentifier',
        'Census_ProcessorCoreCount',
        'Census_ProcessorModelIdentifier',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalBatteryNumberOfCharges',
        'Census_OSBuildNumber',
        'Census_OSInstallLanguageIdentifier',
        'Census_OSUILocaleIdentifier',
        'Census_FirmwareManufacturerIdentifier',
        'Census_FirmwareVersionIdentifier',
        'Wdft_RegionIdentifier',
        'OsBuildLab_major',
        'OsBuildLab_minor',
        'OsBuildLab_platform',
        'OsBuildLab_release',
        'OsBuildLab_build2',
    ]
    reference = df if fit_df is None else fit_df
    scaler = StandardScaler().fit(reference[col_to_std].values)

    scaled_features = df.copy()
    scaled_features[col_to_std] = scaler.transform(df[col_to_std].values)
    return scaled_features

In [None]:
# Clean up memory: downcast every split (train, then test, then dev) and
# let the garbage collector release what was freed.
train_df, test_df, dev_df = [
    reduce_mem(split_frame) for split_frame in (train_df, test_df, dev_df)
]
gc.collect()

In [None]:
# Define dtypes and references to column subsets
# (numeric / nominal / binary column names plus the label array, per split).

(train_numeric_cols,
 train_nominal_cols,
 train_binary_cols,
 train_labels) = make_subsets(train_df)

(test_numeric_cols,
 test_nominal_cols,
 test_binary_cols,
 test_labels) = make_subsets(test_df)

(dev_numeric_cols,
 dev_nominal_cols,
 dev_binary_cols,
 dev_labels) = make_subsets(dev_df)

print("done selecting column types")

In [None]:
# Encode select features with version numbers.
# Every split is encoded from its OWN frame; passing train_df for the test
# and dev splits would replace those splits with the training data.
version_cols = ["EngineVersion", "AppVersion", "AvSigVersion", "Census_OSVersion"]
for version_col in version_cols:
    train_df = make_encoded_version_column(train_df, version_col)
    test_df = make_encoded_version_column(test_df, version_col)
    dev_df = make_encoded_version_column(dev_df, version_col)

# Encode remaining nominal variables as labels

# TO REMOVE AFTER CLEANUP, ANDREW:
# Guarded so the cell can be re-run (list.remove raises if the name is gone).
for split_nominal_cols in (train_nominal_cols, test_nominal_cols, dev_nominal_cols):
    if 'SmartScreen' in split_nominal_cols:
        split_nominal_cols.remove('SmartScreen')

# One encoder per column, fitted on the distinct values of every split that
# has the column, so a given category receives the same integer code in the
# train, test and dev frames in this cell.
nominal_splits = [
    (train_df, train_nominal_cols),
    (test_df, test_nominal_cols),
    (dev_df, dev_nominal_cols),
]
all_nominal_cols = list(dict.fromkeys(
    train_nominal_cols + test_nominal_cols + dev_nominal_cols
))
for n in all_nominal_cols:
    frames_with_col = [frame for frame, cols in nominal_splits if n in cols]
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat(
        [pd.Series(frame[n].unique()) for frame in frames_with_col],
        ignore_index=True,
    ))
    for frame in frames_with_col:
        frame[n] = le.transform(frame[n])

print("done encoding version columns")

In [None]:
# Encode labels so that they can be flattened into an array.
# transform_df returns a new frame per split; the three calls run in this
# order (train, then test, then dev).
train_df = transform_df(train_df, train_nominal_cols)
test_df = transform_df(test_df, test_nominal_cols)
dev_df = transform_df(dev_df, dev_nominal_cols)


In [None]:
# Standardize and normalize
#
# Each split is passed to std_norm exactly once.
# NOTE(review): called like this, every split is scaled with statistics
# computed from that split itself -- confirm this is intended rather than
# reusing the training statistics for test/dev.

train_df = std_norm(train_df)
test_df = std_norm(test_df)
dev_df = std_norm(dev_df)



In [None]:
# Verify output: print the frame's shape, then draw one histogram per column
# of the training frame (missing values are dropped before plotting).

print(train_df.shape)

for column_name in train_df.columns:
    print(column_name, "plot:")
    non_missing_values = train_df[column_name].dropna()
    plt.hist(non_missing_values)
    plt.show()

In [None]:
# Save to export file
#
# Nothing is written unless save_data is set. debug_encoding selects the
# "mini_*" output names so a debug run does not overwrite the full exports.
# The *_file names are rebound here from the input paths to the output paths.
if save_data:
    if debug_encoding:
        train_file = 'data/mini_train_encoded.csv'
        test_file = 'data/mini_test_encoded.csv'
        dev_file = 'data/mini_dev_encoded.csv'
    else:
        train_file = 'data/train_encoded.csv'
        test_file = 'data/test_encoded.csv'
        dev_file = 'data/dev_encoded.csv'
    # to_csv is called with its defaults, so the frame index is written as
    # the first (unnamed) column of each file.
    train_df.to_csv(train_file)
    test_df.to_csv(test_file)
    dev_df.to_csv(dev_file)
    print("Output saved.")
else:
    print("Output not saved.")