In [13]:
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
 
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# some PCA and GMM stuff
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

import sklearn.ensemble as ske
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

import seaborn as sns
sns.set()

# Packages that may require installation

#!pip install lightgbm --user
#!pip install setuptools wheel numpy scipy scikit-learn --user
#!pip install xgboost --user
#!pip install catboost --user


# Definitions

In [20]:
def run_KNN(tdf, ddf):
    '''Fit a 9-nearest-neighbour classifier and print its dev-set scores.

    tdf -- training data object; ``df``, ``labels`` and ``correlated_cols`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    The column list always comes from ``tdf`` so both splits are sliced to the
    same features.  Prints the confusion matrix, zero-one error and accuracy;
    returns None.
    '''
    feature_cols = tdf.correlated_cols
    features_train, target_train = tdf.df[feature_cols], tdf.labels
    features_dev, target_dev = ddf.df[feature_cols], ddf.labels

    knn = KNeighborsClassifier(n_neighbors=9, n_jobs=-1)
    knn.fit(features_train, target_train)
    predictions = knn.predict(features_dev)

    # All three scores are computed before anything is printed.
    report = (
        ("Confusion Matrix:", confusion_matrix(target_dev, predictions)),
        ("Error:", zero_one_loss(target_dev, predictions)),
        ("Accuracy:", metrics.accuracy_score(target_dev, predictions)),
    )
    for heading, value in report:
        print(heading)
        print(value)

def run_GradientBooster(tdf, ddf):
    '''Train a GradientBoostingClassifier and print its dev-set scores.

    The model (200 estimators, learning rate 0.6, depth 3, random_state 0) is
    fitted on ``tdf.df[tdf.correlated_cols]`` against ``tdf.labels`` and scored
    on the same columns of ``ddf.df`` against ``ddf.labels``.  The confusion
    matrix, zero-one error and accuracy are printed; nothing is returned.
    '''
    selected = tdf.correlated_cols  # the training object's column list is used for both splits
    fit_X, fit_y = tdf.df[selected], tdf.labels
    dev_X, expected = ddf.df[selected], ddf.labels

    booster = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.6,
        max_depth=3,
        random_state=0,
    )
    booster.fit(fit_X, fit_y)

    # predict and score on the dev set
    predicted = booster.predict(dev_X)
    cm = confusion_matrix(expected, predicted)
    miss_rate = zero_one_loss(expected, predicted)
    hit_rate = metrics.accuracy_score(expected, predicted)

    for title, score in zip(("Confusion Matrix:", "Error:", "Accuracy:"),
                            (cm, miss_rate, hit_rate)):
        print(title)
        print(score)

def run_AdaBooster(tdf, ddf):
    '''Train an AdaBoostClassifier (100 estimators) and print its dev-set scores.

    tdf -- training data object; ``df``, ``labels`` and ``correlated_cols`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    Both splits are restricted to ``tdf.correlated_cols``.  Prints the
    confusion matrix, zero-one error and accuracy; returns None.
    '''
    feature_cols = tdf.correlated_cols
    X_train, y_train = tdf.df[feature_cols], tdf.labels
    X_test, y_test = ddf.df[feature_cols], ddf.labels

    # Seeded (random_state=0, as the gradient-boosting / extra-trees / decision-tree
    # functions already are) so that re-running the cell reproduces the same scores.
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # predict and score on the dev set
    pred_y = clf.predict(X_test)

    results = confusion_matrix(y_test, pred_y)
    error = zero_one_loss(y_test, pred_y)
    accuracy = metrics.accuracy_score(y_test, pred_y)

    print("Confusion Matrix:")
    print(results)
    print("Error:")
    print(error)
    print("Accuracy:")
    print(accuracy)
    return

def run_LightGBM(tdf, ddf):
    '''Fit the LightGBM counterpart of a HistGradientBoostingClassifier and print dev-set scores.

    A HistGradientBoostingClassifier is configured (never fitted) and handed to
    ``get_equivalent_estimator(..., lib='lightgbm')``; the estimator that call
    returns is the one trained on ``tdf.df[tdf.correlated_cols]`` and scored on
    the same columns of ``ddf.df``.  Prints the confusion matrix, zero-one
    error and accuracy; returns None.
    '''
    columns = tdf.correlated_cols
    train_X, train_y = tdf.df[columns], tdf.labels
    dev_X, dev_y = ddf.df[columns], ddf.labels

    hgb_settings = dict(
        learning_rate=.1,
        max_iter=100,
        max_bins=100,
        max_leaf_nodes=31,
        n_iter_no_change=None,
        random_state=0,
        verbose=0,
    )
    template = HistGradientBoostingClassifier(**hgb_settings)
    template.set_params(loss='binary_crossentropy')

    model = get_equivalent_estimator(template, lib='lightgbm')
    model.fit(train_X, train_y)

    # predict and score on the dev set
    dev_pred = model.predict(dev_X)
    scores = [
        ("Confusion Matrix:", confusion_matrix(dev_y, dev_pred)),
        ("Error:", zero_one_loss(dev_y, dev_pred)),
        ("Accuracy:", metrics.accuracy_score(dev_y, dev_pred)),
    ]
    for label, score in scores:
        print(label)
        print(score)

def run_XGBoost(tdf, ddf):
    '''Fit the XGBoost counterpart of a HistGradientBoostingClassifier and print dev-set scores.

    The sklearn estimator only carries the hyper-parameters; the object
    returned by ``get_equivalent_estimator(..., lib='xgboost')`` is what gets
    trained on ``tdf.df[tdf.correlated_cols]`` and evaluated on the matching
    columns of ``ddf.df``.  Prints the confusion matrix, zero-one error and
    accuracy; returns None.
    '''
    cols = tdf.correlated_cols
    X_fit, y_fit = tdf.df[cols], tdf.labels
    X_dev, y_dev = ddf.df[cols], ddf.labels

    reference = HistGradientBoostingClassifier(
        learning_rate=.1, max_iter=100, max_bins=100, max_leaf_nodes=31,
        n_iter_no_change=None, random_state=0, verbose=0,
    )
    reference.set_params(loss='binary_crossentropy')

    xgboost_est = get_equivalent_estimator(reference, lib='xgboost')
    xgboost_est.fit(X_fit, y_fit)

    # predict and score on the dev set
    y_hat = xgboost_est.predict(X_dev)

    matrix = confusion_matrix(y_dev, y_hat)
    loss = zero_one_loss(y_dev, y_hat)
    acc = metrics.accuracy_score(y_dev, y_hat)
    print("Confusion Matrix:", matrix, sep="\n")
    print("Error:", loss, sep="\n")
    print("Accuracy:", acc, sep="\n")

def run_CatBoost(tdf, ddf):
    '''Fit the CatBoost counterpart of a HistGradientBoostingClassifier and print dev-set scores.

    tdf -- training data object; ``df``, ``labels`` and ``correlated_cols`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    The configured sklearn estimator is passed to
    ``get_equivalent_estimator(..., lib='catboost')`` and the returned
    estimator is fitted and scored.  Prints the confusion matrix, zero-one
    error and accuracy; returns None.
    '''
    used_cols = tdf.correlated_cols
    train_features, train_target = tdf.df[used_cols], tdf.labels
    dev_features, dev_target = ddf.df[used_cols], ddf.labels

    sk_est = HistGradientBoostingClassifier(
        learning_rate=.1,
        max_iter=100,
        max_bins=100,
        max_leaf_nodes=31,
        n_iter_no_change=None,
        random_state=0,
        verbose=0,
    )
    sk_est.set_params(loss='binary_crossentropy')

    catboost_est = get_equivalent_estimator(sk_est, lib='catboost')
    catboost_est.fit(train_features, train_target)

    # predict and score on the dev set
    dev_predictions = catboost_est.predict(dev_features)

    outcome = {
        "Confusion Matrix:": confusion_matrix(dev_target, dev_predictions),
        "Error:": zero_one_loss(dev_target, dev_predictions),
        "Accuracy:": metrics.accuracy_score(dev_target, dev_predictions),
    }
    for caption, figure in outcome.items():
        print(caption)
        print(figure)


def run_RFC(tdf, ddf):
    '''Random Forest Classifier: train on the correlated columns, print dev-set scores.

    tdf -- training data object; ``df``, ``labels`` and ``correlated_cols`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    Both splits are restricted to ``tdf.correlated_cols``.  Prints the
    confusion matrix, zero-one error and accuracy; returns None.
    '''
    feature_cols = tdf.correlated_cols
    X_train, y_train = tdf.df[feature_cols], tdf.labels
    X_test, y_test = ddf.df[feature_cols], ddf.labels

    # A random forest bootstraps rows and samples features, so without a seed the
    # printed scores change on every run.  random_state=0 matches the seeded
    # extra-trees and decision-tree functions in this notebook.
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # predict and score on the dev set
    pred_y = clf.predict(X_test)

    results = confusion_matrix(y_test, pred_y)
    error = zero_one_loss(y_test, pred_y)
    accuracy = metrics.accuracy_score(y_test, pred_y)
    print("Confusion Matrix:")
    print(results)
    print("Error:")
    print(error)
    print("Accuracy:")
    print(accuracy)
    return


def run_ETC(tdf, ddf):
    '''Extra Trees Classifier: fit on the correlated columns, print dev-set scores.

    Uses 100 trees, ``min_samples_split=2`` and ``random_state=0``.  Training
    rows come from ``tdf.df`` / ``tdf.labels``, evaluation rows from
    ``ddf.df`` / ``ddf.labels``; both are sliced with ``tdf.correlated_cols``.
    Prints the confusion matrix, zero-one error and accuracy; returns None.
    '''
    keep = tdf.correlated_cols
    train_part, train_truth = tdf.df[keep], tdf.labels
    dev_part, dev_truth = ddf.df[keep], ddf.labels

    forest = ExtraTreesClassifier(n_estimators=100, min_samples_split=2, random_state=0)
    forest.fit(train_part, train_truth)

    # predict and score on the dev set
    guesses = forest.predict(dev_part)

    summary = (
        ("Confusion Matrix:", confusion_matrix(dev_truth, guesses)),
        ("Error:", zero_one_loss(dev_truth, guesses)),
        ("Accuracy:", metrics.accuracy_score(dev_truth, guesses)),
    )
    for name, number in summary:
        print(name)
        print(number)


def run_DTC(tdf, ddf):
    '''Decision Tree Classifier: fit on the correlated columns, print dev-set scores.

    The tree is grown with ``max_depth=None``, ``min_samples_split=2`` and
    ``random_state=0`` on ``tdf.df[tdf.correlated_cols]``, then evaluated on
    the same columns of ``ddf.df``.  Prints the confusion matrix, zero-one
    error and accuracy; returns None.
    '''
    chosen = tdf.correlated_cols
    inputs_train, answers_train = tdf.df[chosen], tdf.labels
    inputs_dev, answers_dev = ddf.df[chosen], ddf.labels

    tree = DecisionTreeClassifier(
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    tree.fit(inputs_train, answers_train)

    # predict and score on the dev set
    outputs_dev = tree.predict(inputs_dev)

    conf_mat = confusion_matrix(answers_dev, outputs_dev)
    err = zero_one_loss(answers_dev, outputs_dev)
    acc = metrics.accuracy_score(answers_dev, outputs_dev)
    print("Confusion Matrix:", conf_mat, sep="\n")
    print("Error:", err, sep="\n")
    print("Accuracy:", acc, sep="\n")


def run_LinSVC(tdf, ddf):
    '''Linear Support Vector Classifier: train on the correlated columns, print dev-set scores.

    tdf -- training data object; ``df``, ``labels`` and ``correlated_cols`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    Both splits are restricted to ``tdf.correlated_cols``.  Prints the
    confusion matrix, zero-one error and accuracy; returns None.
    '''
    feature_cols = tdf.correlated_cols
    X_train, y_train = tdf.df[feature_cols], tdf.labels
    X_test, y_test = ddf.df[feature_cols], ddf.labels

    # max_iter is raised to 5000; random_state=0 pins the solver's internal
    # shuffling so repeated runs print the same scores.
    classifier = LinearSVC(max_iter=5000, random_state=0)
    classifier.fit(X_train, y_train)

    pred_y = classifier.predict(X_test)

    results = confusion_matrix(y_test, pred_y)
    error = zero_one_loss(y_test, pred_y)
    accuracy = metrics.accuracy_score(y_test, pred_y)
    print("Confusion Matrix:")
    print(results)
    print("Error:")
    print(error)
    print("Accuracy:")
    print(accuracy)
    return

def run_SFM(tdf, ddf):
    '''Select features with an extra-trees model, then score three boosters on them.

    tdf -- training data object; ``df`` and ``labels`` are read
    ddf -- dev data object; ``df`` and ``labels`` are read

    Feature selection is fitted on every column of ``tdf.df`` (not only
    ``correlated_cols``) via ``SelectFromModel`` wrapped around a prefit
    ExtraTreesClassifier.  The reduced train/dev matrices are then used to fit
    and evaluate, in order:

    1. the LightGBM estimator from ``get_equivalent_estimator`` (learning rate 0.1),
    2. a GradientBoostingClassifier,
    3. the LightGBM estimator again with learning rate 0.2 and larger trees.

    For each model a banner, the confusion matrix, zero-one error and accuracy
    are printed.  Returns None.
    '''
    y_train = tdf.labels
    y_test = ddf.labels

    # Seeded so the selected feature subset (and every score below) is
    # reproducible between runs; the downstream models already use random_state=0.
    fsel = ske.ExtraTreesClassifier(n_estimators=100, random_state=0).fit(tdf.df, y_train)
    model = SelectFromModel(fsel, prefit=True)
    X_train_data_new = model.transform(tdf.df)
    X_test_data_new = model.transform(ddf.df)

    est = HistGradientBoostingClassifier(learning_rate=.1,
                max_iter=100,
                max_bins=100,
                max_leaf_nodes=31,
                n_iter_no_change=None,
                random_state=0,
                verbose=0)
    est.set_params(loss='binary_crossentropy')
    _sfm_fit_and_report(get_equivalent_estimator(est, lib='lightgbm'),
                        X_train_data_new, y_train, X_test_data_new, y_test,
                        "LightGBM Results follow with 0.1 learning rate:")

    clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.6, max_depth=3, random_state=0)
    _sfm_fit_and_report(clf,
                        X_train_data_new, y_train, X_test_data_new, y_test,
                        "GradientBoostingClassifier Results follow")

    # This sklearn estimator is only a parameter carrier: it is never fitted
    # itself, only translated by get_equivalent_estimator.
    est = HistGradientBoostingClassifier(learning_rate=.2,
                max_iter=100,
                max_bins=500,
                max_depth=-1,
                max_leaf_nodes=100,
                n_iter_no_change=None,
                random_state=0,
                verbose=0)
    est.set_params(loss='binary_crossentropy')
    _sfm_fit_and_report(get_equivalent_estimator(est, lib='lightgbm'),
                        X_train_data_new, y_train, X_test_data_new, y_test,
                        "LightGBM Results follow with 0.2 learning rate:")
    return


def _sfm_fit_and_report(estimator, X_train, y_train, X_test, y_test, banner):
    '''Fit ``estimator``, predict on ``X_test`` and print ``banner`` followed by the three scores.'''
    estimator.fit(X_train, y_train)

    # predict and score on the dev set
    pred_y = estimator.predict(X_test)

    results = confusion_matrix(y_test, pred_y)
    error = zero_one_loss(y_test, pred_y)
    accuracy = metrics.accuracy_score(y_test, pred_y)
    print(banner)
    print("Confusion Matrix:")
    print(results)
    print("Error:")
    print(error)
    print("Accuracy:")
    print(accuracy)
    


# Classes

In [3]:
class Data_Obj:
    '''One encoded data split loaded from CSV, plus column groupings derived from it.

    Construction reads the file, separates the ``HasDetections`` label column
    (when the file has one) and classifies the remaining columns.

    Attributes set during construction:
      df              -- feature DataFrame without ``HasDetections`` / ``MachineIdentifier``
      labels          -- the ``HasDetections`` Series, or None when the file has no such column
      isTrain         -- True when labels were found in the file
      dtypes          -- column name -> dtype mapping passed to ``pd.read_csv``
      all_cols        -- the keys of ``dtypes``
      numeric_cols, nominal_cols, binary_cols, unary_cols, continuous_cols
                      -- column-name lists computed by ``define_cols``
      correlated_cols -- numeric columns whose correlation with ``labels`` exceeds the threshold
      set_of_cols     -- fixed list of version-related column names
      iscorrfixed     -- True once ``fix_correlated_vars`` has been applied
    '''

    def __init__(self, filename):
        '''Load ``filename`` and compute the column groupings.'''
        self.isTrain = False
        self.labels = None
        self.load_data(filename)
        self.define_cols()
        self.iscorrfixed = False
        return
    
    def getDF(self):
        '''Return the feature DataFrame.'''
        return self.df
    
    def getCols(self, cols=None):
        '''Return the list of column names known to the dtype mapping.

        The original body returned ``self.cols``, an attribute that is never
        assigned, so every call raised AttributeError.  ``cols`` is kept (now
        optional) only so existing call sites keep working; it is ignored.
        NOTE(review): ``all_cols`` is the closest existing attribute -- confirm
        this is the intended return value.
        '''
        return self.all_cols
    
    def load_data(self, filename):
        '''Read ``filename`` into ``self.df`` and split off the label column.

        Sets ``self.dtypes``, ``self.df``, ``self.labels``, ``self.isTrain``
        and ``self.all_cols``.  A file without ``HasDetections`` (or without
        ``MachineIdentifier``) is accepted: ``labels`` is then None and
        ``isTrain`` is False.
        '''
        self.dtypes = {
            'ProductName':                                          'int64',
            'EngineVersion':                                        'int64',
            'AppVersion':                                           'int64',
            'AvSigVersion':                                         'int64',
            'RtpStateBitfield':                                     'int64',
            'Platform':                                             'int64',
            'Processor':                                            'int64',
            'OsVer':                                                'int64',
            'OsPlatformSubRelease':                                 'int64',
            'OsBuildLab':                                           'int64',
            'SkuEdition':                                           'int64',
            'PuaMode':                                              'int64',
            'SmartScreen':                                          'int64',
            'Census_MDC2FormFactor':                                'int64',
            'Census_DeviceFamily':                                  'int64',
            'Census_ProcessorClass':                                'int64',
            'Census_PrimaryDiskTypeName':                           'int64',
            'Census_ChassisTypeName':                               'int64',
            'Census_PowerPlatformRoleName':                         'int64',
            'Census_InternalBatteryType':                           'int64',
            'Census_OSVersion':                                     'int64',
            'Census_OSArchitecture':                                'int64',
            'Census_OSBranch':                                      'int64',
            'Census_OSEdition':                                     'int64',
            'Census_OSSkuName':                                     'int64',
            'Census_OSInstallTypeName':                             'int64',
            'Census_OSWUAutoUpdateOptionsName':                     'int64',
            'Census_GenuineStateName':                              'int64',
            'Census_ActivationChannel':                             'int64',
            'Census_FlightRing':                                    'int64',
            'RtpStateBitfield_wasna':                               'int64',
            'DefaultBrowsersIdentifier_wasna':                      'int64',
            'AVProductStatesIdentifier_wasna':                      'int64',
            'AVProductsInstalled_wasna':                            'int64',
            'AVProductsEnabled_wasna':                              'int64',
            'CityIdentifier_wasna':                                 'int64',
            'OrganizationIdentifier_wasna':                         'int64',
            'GeoNameIdentifier_wasna':                              'int64',
            'IsProtected_wasna':                                    'int64',
            'SMode_wasna':                                          'int64',
            'IeVerIdentifier_wasna':                                'int64',
            'Firewall_wasna':                                       'int64',
            'UacLuaenable_wasna':                                   'int64',
            'Census_OEMNameIdentifier_wasna':                       'int64',
            'Census_OEMModelIdentifier_wasna':                      'int64',
            'Census_ProcessorCoreCount_wasna':                      'int64',
            'Census_ProcessorManufacturerIdentifier_wasna':         'int64',
            'Census_ProcessorModelIdentifier_wasna':                'int64',
            'Census_PrimaryDiskTotalCapacity_wasna':                'int64',
            'Census_SystemVolumeTotalCapacity_wasna':               'int64',
            'Census_TotalPhysicalRAM_wasna':                        'int64',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches_wasna': 'int64',
            'Census_InternalPrimaryDisplayResolutionHorizontal_wasna': 'int64',
            'Census_InternalPrimaryDisplayResolutionVertical_wasna': 'int64',
            'Census_InternalBatteryNumberOfCharges_wasna':          'int64',
            'Census_OSInstallLanguageIdentifier_wasna':             'int64',
            'Census_IsFlightingInternal_wasna':                     'int64',
            'Census_IsFlightsDisabled_wasna':                       'int64',
            'Census_ThresholdOptIn_wasna':                          'int64',
            'Census_FirmwareManufacturerIdentifier_wasna':          'int64',
            'Census_IsWIMBootEnabled_wasna':                        'int64',
            'Census_IsVirtualDevice_wasna':                         'int64',
            'Census_IsAlwaysOnAlwaysConnectedCapable_wasna':        'int64',
            'Wdft_IsGamer_wasna':                                   'int64',
            'Wdft_RegionIdentifier_wasna':                          'int64',
            'Census_FirmwareVersionIdentifier_wasna':               'int64',
            'OsBuildLab_platform':                                  'float64',
            'OsBuildLab_release':                                   'float64',
            'IsBeta':                                               'int8',
            'IsSxsPassiveMode':                                     'int8',
            'HasTpm':                                               'int8',
            'AutoSampleOptIn':                                      'int8',
            'Census_HasOpticalDiskDrive':                           'int8',
            'Census_IsPortableOperatingSystem':                     'int8',
            'Census_IsSecureBootEnabled':                           'int8',
            'Census_IsTouchEnabled':                                'int8',
            'Census_IsPenCapable':                                  'int8',
            'CountryIdentifier':                                    'float64',
            'LocaleEnglishNameIdentifier':                          'float64',
            'OsBuild':                                              'float64',
            'OsSuite':                                              'int16',
            'Census_OSBuildNumber':                                 'float64',
            'Census_OSUILocaleIdentifier':                          'float64',
            'EngineVersion_major':                                  'int16',
            'EngineVersion_minor':                                  'int16',
            'EngineVersion_build1':                                 'int16',
            'EngineVersion_build2':                                 'int16',
            'AppVersion_major':                                     'int16',
            'AppVersion_minor':                                     'int16',
            'AppVersion_build1':                                    'int16',
            'AppVersion_build2':                                    'int16',
            'AvSigVersion_major':                                   'int16',
            'AvSigVersion_minor':                                   'int16',
            'AvSigVersion_build1':                                  'int16',
            'AvSigVersion_build2':                                  'int16',
            'Census_OSVersion_major':                               'int16',
            'Census_OSVersion_minor':                               'int16',
            'Census_OSVersion_build1':                              'int16',
            'Census_OSVersion_build2':                              'int16',
            'OsVer_major':                                          'int16',
            'OsVer_minor':                                          'int16',
            'OsVer_build1':                                         'int16',
            'OsVer_build2':                                         'int16',
            'OsBuildLab_major':                                     'float64',
            'OsBuildLab_minor':                                     'float64',
            'Census_OSBuildRevision':                               'int32',
            'OsBuildLab_build1':                                    'int32',
            'OsBuildLab_build2':                                    'float64',
            'AVProductsInstalled':                                  'float16',
            'AVProductsEnabled':                                    'float16',
            'OrganizationIdentifier':                               'float16',
            'GeoNameIdentifier':                                    'float64',
            'IsProtected':                                          'float16',
            'SMode':                                                'float16',
            'IeVerIdentifier':                                      'float64',
            'Firewall':                                             'float16',
            'Census_ProcessorCoreCount':                            'float64',
            'Census_ProcessorManufacturerIdentifier':               'float16',
            'Census_OSInstallLanguageIdentifier':                   'float64',
            'Census_IsFlightingInternal':                           'float16',
            'Census_IsFlightsDisabled':                             'float16',
            'Census_ThresholdOptIn':                                'float16',
            'Census_FirmwareManufacturerIdentifier':                'float64',
            'Census_IsWIMBootEnabled':                              'float16',
            'Census_IsVirtualDevice':                               'float16',
            'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
            'Wdft_IsGamer':                                         'float16',
            'Wdft_RegionIdentifier':                                'float64',
            'DefaultBrowsersIdentifier':                            'float32',
            'AVProductStatesIdentifier':                            'float64',
            'CityIdentifier':                                       'float64',
            'Census_OEMNameIdentifier':                             'float64',
            'Census_OEMModelIdentifier':                            'float64',
            'Census_ProcessorModelIdentifier':                      'float64',
            'Census_TotalPhysicalRAM':                              'float64',
            'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float64',
            'Census_InternalPrimaryDisplayResolutionHorizontal':    'float64',
            'Census_InternalPrimaryDisplayResolutionVertical':      'float64',
            'Census_FirmwareVersionIdentifier':                     'float64',
            'UacLuaenable':                                         'float64',
            'Census_PrimaryDiskTotalCapacity':                      'float64',
            'Census_SystemVolumeTotalCapacity':                     'float64',
            'Census_InternalBatteryNumberOfCharges':                'float64',
            'EngineVersion_combined':                               'float64',
            'AppVersion_combined':                                  'float64',
            'AvSigVersion_combined':                                'float64',
            'Census_OSVersion_combined':                            'float64',
            'OsVer_combined':                                       'float64',
            'OsBuildLab_combined':                                  'float64',
            'HasDetections':                                        'int8'
            }

        self.df = pd.read_csv(filename, dtype=self.dtypes, engine='c')
        if 'HasDetections' in self.df.columns:
            self.labels = self.df['HasDetections']
            self.isTrain = True
        else:
            # Unlabelled split: keep the attributes defined so later code can test them.
            self.labels = None
            self.isTrain = False
        # errors='ignore' so a file lacking either column no longer raises KeyError.
        self.df = self.df.drop(columns=['HasDetections', 'MachineIdentifier'], errors='ignore')
        self.all_cols = list(self.dtypes)
        return
    
    def define_cols(self, corr_threshold=0.05):
        '''Classify the columns of ``self.df`` and flag those correlated with the labels.

        corr_threshold -- a numeric column is added to ``correlated_cols`` when
            the absolute value of its correlation with ``self.labels`` is
            strictly greater than this value (default 0.05).

        When ``self.labels`` is None, ``correlated_cols`` is left empty.
        '''
        self.numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.numeric_cols = [c for c, v in self.df.dtypes.items() if v in self.numerics]
        numeric_lookup = set(self.numeric_cols)
        self.nominal_cols = [c for c in self.df.columns if c not in numeric_lookup]

        # Count distinct values once per numeric column; nominal columns are
        # excluded from the binary / unary groups.
        distinct = {c: self.df[c].nunique() for c in self.numeric_cols}
        self.binary_cols = [c for c in self.numeric_cols if distinct[c] == 2]
        self.unary_cols = [c for c in self.numeric_cols if distinct[c] == 1]
        few_valued = set(self.binary_cols) | set(self.unary_cols)
        self.continuous_cols = [c for c in self.numeric_cols if c not in few_valued]

        self.correlated_cols = []
        if self.labels is not None:
            for col in self.numeric_cols:
                corr = self.labels.corr(self.df[col])
                # A NaN correlation compares False and so is never flagged.
                if abs(corr) > corr_threshold:
                    self.correlated_cols.append(col)
        self.set_of_cols =['EngineVersion_combined','EngineVersion','AppVersion_combined', 'AppVersion', 'AvSigVersion_combined', 'AvSigVersion', 'Census_OSVersion_combined', 'Census_OSVersion', 'OsVer_combined', 'OsVer', 'OsBuildLab_combined', 'OsBuildLab']
        return
        
    def reduce_mem(self, verbose=True):
        '''Downcast integer columns of ``self.df`` and convert object columns to category.

        verbose -- when True (default) print memory usage before and after;
            when False print nothing.

        Each integer column is cast to the narrowest of int8/int16/int32/int64
        that contains its min and max (bounds inclusive).  Float columns are
        left untouched.
        '''
        start_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in self.df.columns:
            col_type = self.df[col].dtype

            if col_type == object or str(col_type) == 'category':
                self.df[col] = self.df[col].astype('category')
                continue
            if str(col_type)[:3] != 'int':
                continue

            c_min = self.df[col].min()
            c_max = self.df[col].max()
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if c_min >= limits.min and c_max <= limits.max:
                    self.df[col] = self.df[col].astype(int_type)
                    break

        end_mem = self.df.memory_usage().sum() / 1024**2
        if verbose:
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            # Guard the percentage against a zero starting size.
            saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Decreased by {:.1f}%'.format(saved_pct))
        return
    
    def display_correlated(self):
        '''Draw, one figure per correlated column, a horizontal bar chart of its 50 most frequent values.'''
        for i, col in enumerate(self.correlated_cols):
            plt.figure(i)
            self.df[col].value_counts(sort=False).sort_values(ascending=False).nlargest(50).plot(kind='barh', figsize=(20,10), title=col)
        return
    
    def display_correlated_heatmap(self, withLabels=False):
        '''Draw a correlation heatmap of the correlated columns.

        withLabels -- when true, the ``HasDetections`` labels are added as an
            extra column; this is only possible when ``isTrain`` is True,
            otherwise a message is printed and nothing is drawn.
        '''
        if withLabels:
            if self.isTrain:
                labels = pd.DataFrame()
                labels["HasDetections"] = self.labels
                tmpdf = pd.concat([self.df[self.correlated_cols], labels], axis=1)
                sns.heatmap(tmpdf.corr(), cmap='RdBu_r', annot=True, center=0.0)
            else:
                print("Labels do not exist.  This is not the training set.")
        else:
            sns.set(rc={'figure.figsize':(20,20)})
            sns.heatmap(self.df[self.correlated_cols].corr(), cmap='RdBu_r', annot=True, center=0.0)
        return
    
    def fix_correlated_vars(self):
        '''Swap a fixed set of columns in ``correlated_cols`` for their ``*_combined`` versions.

        Runs at most once per object; later calls only print a message.
        Columns to drop that are not in the list are skipped instead of
        raising, and a combined column that is already present is not added a
        second time.  The list is edited in place and the done-flag is set
        only after the edit has succeeded.
        '''
        if self.iscorrfixed:
            print("Correlation has already been corrected")
            return

        to_drop = {
            'EngineVersion_build1',
            'AvSigVersion',
            'Processor',
            'Census_OSArchitecture',
            'AppVersion_build1',
        }
        # Slice assignment keeps the same list object for anyone holding a reference.
        self.correlated_cols[:] = [c for c in self.correlated_cols if c not in to_drop]
        for combined in ('AppVersion_combined', 'AvSigVersion_combined'):
            if combined not in self.correlated_cols:
                self.correlated_cols.append(combined)
        self.iscorrfixed = True
        return

    

In [4]:
# CSV paths for the three data splits.
train_file, dev_file, validate_file = (
    'data/mini_train_encoded.csv',
    'data/mini_dev_encoded.csv',
    'data/mini_validate_encoded.csv',
)

# Each split gets its own Data_Obj; constructing one reads the file and
# computes its column groupings.
train_data = Data_Obj(train_file)
dev_data = Data_Obj(dev_file)
validate_data = Data_Obj(validate_file)


In [5]:
# EDA steps. Uncomment a line to run it; only fix_correlated_vars() is active.
# fix_correlated_vars() edits train_data.correlated_cols in place and, if this
# cell is run again, only prints a message. The heatmap call is listed on both
# sides of it so the matrix can be viewed before and after that edit.

#train_data.display_correlated()
#train_data.display_correlated_heatmap()
train_data.fix_correlated_vars()
#train_data.display_correlated_heatmap()
#dev_data.display_correlated_heatmap()


In [22]:
# Below are all the exciting models we can run:

#run_KNN(train_data, dev_data)
#run_GradientBooster(train_data, dev_data)
#run_AdaBooster(train_data, dev_data)
#run_LightGBM(train_data, dev_data)
#run_XGBoost(train_data, dev_data)
#run_CatBoost(train_data, dev_data)
#run_RFC(train_data, dev_data)
#run_ETC(train_data, dev_data)
#run_DTC(train_data, dev_data)
#run_LinSVC(train_data, dev_data)
#run_SFM(train_data, dev_data)