In [17]:
# Environment setup and module import
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder


In [61]:
# Load the training data from CSV into train_df.

# Path handed to read_csv; the commented-out alternative below points at
# 'data/mini_train.csv' instead.
train_file = 'data/train_clean.csv'
#train_file = 'data/mini_train.csv'
train_df = pd.read_csv(train_file)


In [62]:
def reduce_mem(df, verbose=True):
    """Shrink ``df`` by casting its columns to smaller dtypes, then return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the
        caller's frame is modified.
    verbose : bool, default True
        When true, print the memory footprint before and after the casts and
        the percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max
      (bounds inclusive).
    * Float columns are cast to float16 or float32 when the column's min and
      max lie strictly inside that type's finite range. This is a range check
      only: float16 has an 11-bit significand, so values are rounded to the
      nearest representable number when they are cast.
    * A column is never widened, and every other dtype (bool, datetime,
      existing categoricals, pandas extension dtypes) is left untouched.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns have a min/max that can be
        # range-checked; bool, datetime, category and extension dtypes are
        # skipped rather than being forced through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        target = None
        if col_type.kind == 'f':
            for candidate in float_types:
                limits = np.finfo(candidate)
                # Strict bounds; an all-NaN column (min/max are NaN) or one
                # holding +/-inf fails every check and keeps its dtype.
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds so exact-fit values (e.g. 127 for int8)
                # are not pushed into the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        # Cast only when it actually saves space; never widen a column.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [63]:
# Downcast the column dtypes with reduce_mem and rebind train_df to the frame it returns.
train_df = reduce_mem(train_df)

Memory usage of dataframe is 7155.39 MB
Memory usage after optimization is: 1723.87 MB
Decreased by 75.9%


In [64]:
import gc
# Run a full garbage-collection pass. gc.collect() returns the number of
# unreachable objects it found, which is what the cell displays.
gc.collect()

15

In [65]:
# dtype names that count as "numeric" when splitting the columns
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Collect, in frame order, every column whose dtype matches one of those names.
numeric_cols = []
for col_name, col_dtype in train_df.dtypes.items():
    if col_dtype in numerics:
        numeric_cols.append(col_name)

In [66]:
# Every column that was not classified as numeric is treated as nominal
# (frame order is preserved).
numeric_lookup = set(numeric_cols)
nominal_cols = [name for name in train_df.columns if name not in numeric_lookup]

In [67]:
# Non-nominal columns that hold exactly two distinct non-null values.
# The cheap membership test runs first so nunique() is never computed for
# nominal columns, and a set makes that test O(1) per column.
nominal_lookup = set(nominal_cols)
binary_cols = [
    c for c in train_df.columns
    if c not in nominal_lookup and train_df[c].nunique() == 2
]

In [68]:
# Display the list of columns detected as binary.
binary_cols

['IsBeta',
 'IsSxsPassiveMode',
 'HasTpm',
 'AutoSampleOptIn',
 'Census_HasOpticalDiskDrive',
 'Census_IsPortableOperatingSystem',
 'Census_IsSecureBootEnabled',
 'Census_IsTouchEnabled',
 'Census_IsPenCapable',
 'HasDetections',
 'Rtp_NaN',
 'AvSigVersion_major',
 'AvSigVersion_minor',
 'AvSigVersion_build1',
 'AvSigVersion_build2',
 'Census_OSVersion_major',
 'Census_OSVersion_minor',
 'Census_OSVersion_build1',
 'Census_OSVersion_build2',
 'Rtp_0',
 'Rtp_1',
 'Rtp_2',
 'Rtp_3',
 'Rtp_5']

In [70]:
#
# encode features with version numbers
#
def make_encoded_version_column(df, col):
    """Combine the four version-part columns of ``col`` into one number.

    Reads ``col + '_major'``, ``col + '_minor'``, ``col + '_build1'`` and
    ``col + '_build2'`` and writes

        major * 10000000 + minor * 100000 + build1 * 1000 + build2

    into ``df[col]``.

    Each part is widened to 64 bits before the arithmetic. Without this, parts
    stored as int8/int16/float16 are multiplied in their own narrow dtype, and
    the products wrap around, become ``inf``, or raise ``OverflowError``,
    depending on the NumPy version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four part columns; modified in place.
    col : str
        Name of the column to create (or overwrite) and prefix of the part
        columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. The four part columns are kept.
    """
    weights = (
        ('_major', 10000000),
        ('_minor', 100000),
        ('_build1', 1000),
        ('_build2', 1),
    )

    encoded = 0
    for suffix, weight in weights:
        part = df[col + suffix]
        # Widen plain numpy numeric parts; anything else is used as-is.
        if isinstance(part.dtype, np.dtype):
            if part.dtype.kind in 'iu':
                part = part.astype(np.int64)
            elif part.dtype.kind == 'f':
                part = part.astype(np.float64)
        encoded = encoded + part * weight

    df[col] = encoded
    return df

# Build the combined version column for each versioned feature in turn.
for version_col in ("EngineVersion", "AppVersion", "AvSigVersion", "Census_OSVersion"):
    train_df = make_encoded_version_column(train_df, version_col)

print("done encoding version columns")

done encoding version columns


In [75]:
# Spot-check the distinct values of one encoded version column at a time;
# swap which line is uncommented to inspect a different column.
#train_df['Census_OSVersion'].unique()
#train_df['AvSigVersion'].unique()
#train_df['EngineVersion'].unique()
train_df['AppVersion'].unique()



array([40404004])

In [None]:
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per column listed in numeric_cols, all on a
# single 48x48-inch figure.
train_df[numeric_cols].hist(figsize=(48,48), bins=50)
plt.show()

In [12]:
# Encode nominal variables as labels
# Only the nominal columns are replaced. Rebinding train_df to
# train_df[nominal_cols] would discard every other column of the frame.
le = preprocessing.LabelEncoder()
for col in nominal_cols:
    # fit_transform refits the encoder each time, so label codes are per column.
    train_df[col] = le.fit_transform(train_df[col])
train_df[nominal_cols].head()

Unnamed: 0,MachineIdentifier,ProductName,Platform,Processor,OsVer,OsPlatformSubRelease,SkuEdition,PuaMode,SmartScreen,Census_MDC2FormFactor,...,Census_InternalBatteryType,Census_OSArchitecture,Census_OSBranch,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_FlightRing
0,34678,1,0,2,0,4,4,1,1,7,...,18,2,5,4,4,6,2,1,2,3
1,22670,1,0,1,0,3,4,1,5,7,...,18,0,3,4,4,8,3,1,0,3
2,46569,1,0,1,0,2,4,1,9,7,...,18,0,1,1,1,3,3,1,0,3
3,4316,1,0,1,0,4,6,1,9,7,...,18,0,5,11,12,2,5,2,2,3
4,9818,1,0,1,0,4,4,1,9,7,...,18,0,5,1,1,5,2,1,1,3


In [28]:
# split out labels and ids
#train_labels = train_df["HasDetections"].values
#train_ids = train_df["MachineIdentifier"].values
#train_df = train_df.drop(columns=["HasDetections", "MachineIdentifier"])


In [29]:
# Perform whatever conversions are necessary to get everything into proper format: ndarray

le = preprocessing.LabelEncoder()

# String-like columns (object or category dtype) are label-encoded one by one;
# fit_transform refits the encoder for each column.
encoded_cols = train_df.select_dtypes(include=['object', 'category']).columns

# Numeric and boolean columns are carried over unchanged. 'number' is used
# instead of the Python type int because select_dtypes(int) matches only
# 32/64-bit integers and would drop int8/int16 and all float columns.
df = pd.concat(
    [
        pd.DataFrame(
            {c: le.fit_transform(train_df[c]) for c in encoded_cols},
            index=train_df.index,
        ),
        train_df.select_dtypes(include=['number', 'bool']),
    ],
    axis=1,
)

# To do: Perform standardization for input parameters


In [None]:
#
# encode the values of RtpStateBitfield (NaN, 0, 1, 3, 5, 7, 8, 35) as indicators
#

#df["Rtp_0"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&1 else 0).astype(np.uint8)
#df["Rtp_1"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&2 else 0).astype(np.uint8)
#df["Rtp_2"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&4 else 0).astype(np.uint8)
#df["Rtp_3"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&8 else 0).astype(np.uint8)
#df["Rtp_4"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&16 else 0).astype(np.uint8)
#df["Rtp_5"] = df["RtpStateBitfield"].map(lambda x: 0 if np.isnan(x) else 1 if int(x)&32 else 0).astype(np.uint8)
#df = df.drop(columns=["RtpStateBitfield"])
#print("done encoding RtpStateBitfield")

In [None]:

#x_train = df
#y_train = train_labels


In [30]:
# Testing Cell
#print(x_train.shape)
#print(y_train.shape)

(8921483, 104)
(8921483,)


In [31]:
#
# save
#

# NOTE: train_file is rebound here from the input path to the output path.
train_file = 'data/train_encoded.csv'
#train_file = 'data/mini_train_encoded.csv'
#x_train.to_csv(train_file)

# Write the encoded frame to CSV. to_csv is called with its defaults, so the
# DataFrame index is written as the first (unnamed) column of the file.
df.to_csv(train_file)
