In [2]:
#run

%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
 
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier

# some PCA and GMM stuff
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from matplotlib.colors import LogNorm

import sklearn.ensemble as ske
from sklearn.feature_selection import SelectFromModel

import seaborn as sns
sns.set()

# Very basic wrangling (no real cleaning)

In [5]:
#run

def load_data(filename):
    """Read a cleaned/encoded CSV into a DataFrame using a fixed per-column dtype map.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_csv``
        The CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed frame. Columns named in ``dtypes`` are parsed with the listed
        dtype; any column not named there is left to pandas' own type inference.
    """
    # Explicit dtypes for the columns of the cleaned data set.
    # NOTE(review): the nominal columns are declared 'int64', which presumably means
    # the file is expected to be label-encoded already — confirm against the
    # notebook/script that writes it.
    dtypes = {
        'ProductName':                                          'int64',
        'EngineVersion':                                        'int64',
        'AppVersion':                                           'int64',
        'AvSigVersion':                                         'int64',
        'RtpStateBitfield':                                     'int64',
        'Platform':                                             'int64',
        'Processor':                                            'int64',
        'OsVer':                                                'int64',
        'OsPlatformSubRelease':                                 'int64',
        'OsBuildLab':                                           'int64',
        'SkuEdition':                                           'int64',
        'PuaMode':                                              'int64',
        'SmartScreen':                                          'int64',
        'Census_MDC2FormFactor':                                'int64',
        'Census_DeviceFamily':                                  'int64',
        'Census_ProcessorClass':                                'int64',
        'Census_PrimaryDiskTypeName':                           'int64',
        'Census_ChassisTypeName':                               'int64',
        'Census_PowerPlatformRoleName':                         'int64',
        'Census_InternalBatteryType':                           'int64',
        'Census_OSVersion':                                     'int64',
        'Census_OSArchitecture':                                'int64',
        'Census_OSBranch':                                      'int64',
        'Census_OSEdition':                                     'int64',
        'Census_OSSkuName':                                     'int64',
        'Census_OSInstallTypeName':                             'int64',
        'Census_OSWUAutoUpdateOptionsName':                     'int64',
        'Census_GenuineStateName':                              'int64',
        'Census_ActivationChannel':                             'int64',
        'Census_FlightRing':                                    'int64',
        'RtpStateBitfield_wasna':                               'int64',
        'DefaultBrowsersIdentifier_wasna':                      'int64',
        'AVProductStatesIdentifier_wasna':                      'int64',
        'AVProductsInstalled_wasna':                            'int64',
        'AVProductsEnabled_wasna':                              'int64',
        'CityIdentifier_wasna':                                 'int64',
        'OrganizationIdentifier_wasna':                         'int64',
        'GeoNameIdentifier_wasna':                              'int64',
        'IsProtected_wasna':                                    'int64',
        'SMode_wasna':                                          'int64',
        'IeVerIdentifier_wasna':                                'int64',
        'Firewall_wasna':                                       'int64',
        'UacLuaenable_wasna':                                   'int64',
        'Census_OEMNameIdentifier_wasna':                       'int64',
        'Census_OEMModelIdentifier_wasna':                      'int64',
        'Census_ProcessorCoreCount_wasna':                      'int64',
        'Census_ProcessorManufacturerIdentifier_wasna':         'int64',
        'Census_ProcessorModelIdentifier_wasna':                'int64',
        'Census_PrimaryDiskTotalCapacity_wasna':                'int64',
        'Census_SystemVolumeTotalCapacity_wasna':               'int64',
        'Census_TotalPhysicalRAM_wasna':                        'int64',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches_wasna': 'int64',
        'Census_InternalPrimaryDisplayResolutionHorizontal_wasna': 'int64',
        'Census_InternalPrimaryDisplayResolutionVertical_wasna': 'int64',
        'Census_InternalBatteryNumberOfCharges_wasna':          'int64',
        'Census_OSInstallLanguageIdentifier_wasna':             'int64',
        'Census_IsFlightingInternal_wasna':                     'int64',
        'Census_IsFlightsDisabled_wasna':                       'int64',
        'Census_ThresholdOptIn_wasna':                          'int64',
        'Census_FirmwareManufacturerIdentifier_wasna':          'int64',
        'Census_IsWIMBootEnabled_wasna':                        'int64',
        'Census_IsVirtualDevice_wasna':                         'int64',
        'Census_IsAlwaysOnAlwaysConnectedCapable_wasna':        'int64',
        'Wdft_IsGamer_wasna':                                   'int64',
        'Wdft_RegionIdentifier_wasna':                          'int64',
        'Census_FirmwareVersionIdentifier_wasna':               'int64',
        'OsBuildLab_platform':                                  'float64',
        'OsBuildLab_release':                                   'float64',
        'IsBeta':                                               'int8',
        'IsSxsPassiveMode':                                     'int8',
        'HasTpm':                                               'int8',
        'AutoSampleOptIn':                                      'int8',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'CountryIdentifier':                                    'float64',
        'LocaleEnglishNameIdentifier':                          'float64',
        'OsBuild':                                              'float64',
        'OsSuite':                                              'int16',
        'Census_OSBuildNumber':                                 'float64',
        'Census_OSUILocaleIdentifier':                          'float64',
        'EngineVersion_major':                                  'int16',
        'EngineVersion_minor':                                  'int16',
        'EngineVersion_build1':                                 'int16',
        'EngineVersion_build2':                                 'int16',
        'AppVersion_major':                                     'int16',
        'AppVersion_minor':                                     'int16',
        'AppVersion_build1':                                    'int16',
        'AppVersion_build2':                                    'int16',
        'AvSigVersion_major':                                   'int16',
        'AvSigVersion_minor':                                   'int16',
        'AvSigVersion_build1':                                  'int16',
        'AvSigVersion_build2':                                  'int16',
        'Census_OSVersion_major':                               'int16',
        'Census_OSVersion_minor':                               'int16',
        'Census_OSVersion_build1':                              'int16',
        'Census_OSVersion_build2':                              'int16',
        'OsVer_major':                                          'int16',
        'OsVer_minor':                                          'int16',
        'OsVer_build1':                                         'int16',
        'OsVer_build2':                                         'int16',
        'OsBuildLab_major':                                     'float64',
        'OsBuildLab_minor':                                     'float64',
        'Census_OSBuildRevision':                               'int32',
        'OsBuildLab_build1':                                    'int32',
        'OsBuildLab_build2':                                    'float64',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float64',
        'IsProtected':                                          'float16',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float64',
        'Firewall':                                             'float16',
        'Census_ProcessorCoreCount':                            'float64',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_OSInstallLanguageIdentifier':                   'float64',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float64',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float64',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float64',
        'CityIdentifier':                                       'float64',
        'Census_OEMNameIdentifier':                             'float64',
        'Census_OEMModelIdentifier':                            'float64',
        'Census_ProcessorModelIdentifier':                      'float64',
        'Census_TotalPhysicalRAM':                              'float64',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float64',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float64',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float64',
        'Census_FirmwareVersionIdentifier':                     'float64',
        'UacLuaenable':                                         'float64',
        'Census_PrimaryDiskTotalCapacity':                      'float64',
        'Census_SystemVolumeTotalCapacity':                     'float64',
        'Census_InternalBatteryNumberOfCharges':                'float64',
        'EngineVersion_combined':                               'float64',
        'AppVersion_combined':                                  'float64',
        'AvSigVersion_combined':                                'float64',
        'Census_OSVersion_combined':                            'float64',
        'OsVer_combined':                                       'float64',
        'OsBuildLab_combined':                                  'float64',
        'HasDetections':                                        'int8'
        }

    # Parse with the C engine, forcing the dtypes declared above.
    df = pd.read_csv(filename, dtype=dtypes, engine='c')
    return df

In [6]:
def reduce_mem(df, verbose=True):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes in place.

    * Integer columns (dtype name starting with ``int``) are cast to the
      narrowest of int8/int16/int32/int64 whose range contains the column's
      min and max. The bounds are inclusive, so a column whose max is exactly
      127 fits in int8.
    * ``object`` columns are converted to ``category``.
    * All other dtypes (floats, existing categoricals, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.
    verbose : bool, default True
        When True, print the memory usage before and after and the percentage
        saved. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-chaining convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate integer widths, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object and str(col_type) != 'category':
            if str(col_type)[:3] == 'int':
                # min/max are only needed for integer columns.
                c_min = df[col].min()
                c_max = df[col].max()
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    # For an empty column min/max are NaN, every comparison is
                    # False, and the column keeps its current dtype.
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame does not divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [7]:
# function to create train, dev and test sets from supplied ratios 
from sklearn.model_selection import train_test_split
def generate_train_test_dev_sets(train_ratio, test_ratio, features, labels):
    """Split features/labels into stratified train, test and dev partitions.

    The data is first split into a training part (``train_ratio`` of the rows)
    and a hold-out part. The hold-out part is then split again so that the test
    set is ``test_ratio`` of the original data and the dev set is whatever
    remains. Both splits are stratified on the labels and use ``random_state=0``.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_dev, y_train, y_test, y_dev)``
    """
    # Fraction of the full data set held back from training.
    holdout_fraction = 1 - train_ratio
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features,
        labels,
        test_size=holdout_fraction,
        stratify=labels,
        random_state=0,
    )

    # Within the hold-out part, `test_size` sizes the *second* output (dev),
    # so it is the complement of the test share.
    dev_fraction = 1 - (test_ratio / holdout_fraction)
    X_test, X_dev, y_test, y_dev = train_test_split(
        X_holdout,
        y_holdout,
        test_size=dev_fraction,
        stratify=y_holdout,
        random_state=0,
    )
    return X_train, X_test, X_dev, y_train, y_test, y_dev


In [8]:
def save_file(df, name):
    """Write ``df`` to ``data.nosync/<name>.csv`` without the index column."""
    target_path = "data.nosync/" + name + ".csv"
    df.to_csv(target_path, index=False)

In [None]:
filename = "data/train_clean.csv"
df = load_data(filename)

In [None]:
df = reduce_mem(df)
import gc
gc.collect()

In [None]:
# Code to print a nicely formatted set of dtype definitions (for use when loading the sets again, as above)
for dtype in df.dtypes.items():
    print("'{:} '{:}',".format((dtype[0] + "':").ljust(54), dtype[1]))

In [None]:
train, test, dev, train_labels, test_labels, dev_labels = \
    generate_train_test_dev_sets(.7, .15, df, df['HasDetections'])

print("shape of train set:",train.shape)
print("shape of dev set:",dev.shape) 
print("shape of test set:",test.shape)

# save cleaned files
#
print("saving train_clean ...")
save_file(train, "train_clean")
print("saving dev_clean ...")
save_file(dev, "dev_clean")
print("saving test_clean ...")
save_file(test, "test_clean")
print("done saving files")

## create minis from the same files

In [None]:
from sklearn.model_selection import train_test_split
def generate_mini(n, features, labels):
    """Draw a stratified sample of roughly ``n`` rows from ``features``/``labels``.

    Returns
    -------
    tuple
        ``(X_mini, X_rest, y_mini, y_rest)`` where the ``*_mini`` parts hold the
        sample and the ``*_rest`` parts hold the remaining rows.
    """
    # Share of rows that ends up in the sample; the complement is handed to
    # train_test_split as the "test" (i.e. remainder) size.
    sample_size = n / features.shape[0]
    X_mini, X_rest, y_mini, y_rest = train_test_split(
        features,
        labels,
        test_size=1 - sample_size,
        stratify=labels,
        random_state=0,
    )
    return X_mini, X_rest, y_mini, y_rest

In [None]:
mini_ratio = .05

mini_train, remaining, mini_train_labels, remaining_labels = \
    generate_mini(train.shape[0]*mini_ratio, train, train['HasDetections'])

mini_dev, remaining, mini_dev_labels, remaining_labels = \
    generate_mini(dev.shape[0]*mini_ratio, dev, dev['HasDetections'])

mini_test, remaining, mini_test_labels, remaining_labels = \
    generate_mini(test.shape[0]*mini_ratio, test, test['HasDetections'])


In [None]:
print("shape of mini train set:",mini_train.shape)
print("shape of mini dev set:",mini_dev.shape)
print("shape of mini test set:",mini_test.shape)

# save mini files
#
print("saving mini_train_clean ...")
save_file(mini_train, "mini_train_clean")
print("saving mini_dev_clean ...")
save_file(mini_dev, "mini_dev_clean")
print("saving mini_test_clean ...")
save_file(mini_test, "mini_test_clean")
print("done saving files")

# Work in progress steps for an analysis / encoding / feature engineering notebook. Move/replace/reuse when we are ready.

In [9]:
#run down from 

train_file = 'data/mini_train_encoded.csv'
dev_file = 'data/mini_dev_encoded.csv'
validate_file = 'data/mini_validate_encoded.csv'

In [10]:
drop_cols = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'Census_OSVersion', 'OsVer', 'OsBuildLab', 
             'EngineVersion_major','EngineVersion_minor','EngineVersion_build1','EngineVersion_build2',
             'AppVersion_major','AppVersion_minor','AppVersion_build1','AppVersion_build2', 'AvSigVersion_major',
             'AvSigVersion_minor', 'AvSigVersion_build1', 'AvSigVersion_build2', 'Census_OSVersion_major', 
             'Census_OSVersion_minor', 'Census_OSVersion_build1', 'Census_OSVersion_build2', 'OsVer_major',
             'OsVer_minor', 'OsVer_build1', 'OsVer_build2', 'OsBuildLab_major', 'OsBuildLab_minor', 'OsBuildLab_platform',
             'OsBuildLab_release', 'OsBuildLab_build1', 'OsBuildLab_build2', 'HasDetections']

In [11]:
train_data = load_data(train_file)
train_labels = train_data['HasDetections']
train_data = train_data.drop(columns='HasDetections')
train_data = train_data.drop(columns='MachineIdentifier')

In [12]:
train_data.describe()

Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,Platform,Processor,OsVer,OsPlatformSubRelease,OsBuildLab,...,UacLuaenable,Census_PrimaryDiskTotalCapacity,Census_SystemVolumeTotalCapacity,Census_InternalBatteryNumberOfCharges,EngineVersion_combined,AppVersion_combined,AvSigVersion_combined,Census_OSVersion_combined,OsVer_combined,OsBuildLab_combined
count,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,...,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0,312251.0
mean,3.969323,64.108858,52.199644,7755.423329,3.898902,0.086648,1.091462,1.32576,3.526006,257.063865,...,21.35275,-0.000577,0.001224,0.000798,101015100.0,41591660.0,10272.313818,10015830.0,98738.533295,15723660000.0
std,0.30176,5.349947,19.373868,921.431191,0.63055,0.478735,0.288432,7.345239,1.355107,86.07053,...,11376.39,8.4e-05,1.001193,1.000453,277.9171,328277.5,37.257928,1960.701,6969.152451,2182564000.0
min,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,-0.000692,-1.149985,-0.583153,101011500.0,40400300.0,0.0,10010240.0,60100.0,7600164000.0
25%,4.0,65.0,53.0,7700.0,4.0,0.0,1.0,0.0,3.0,258.0,...,1.0,-0.000638,-0.780973,-0.583153,101015100.0,41317130.0,10273.0665,10015060.0,100000.0,15063000000.0
50%,4.0,65.0,59.0,7973.0,4.0,0.0,1.0,0.0,4.0,262.0,...,1.0,-0.000584,-0.386382,-0.583153,101015100.0,41801810.0,10273.1664,10016300.0,100000.0,16299000000.0
75%,4.0,66.0,59.0,8176.0,4.0,0.0,1.0,0.0,4.0,273.0,...,1.0,-0.000477,0.309136,1.714817,101015200.0,41801810.0,10275.0845,10017130.0,100000.0,17134000000.0
max,4.0,68.0,108.0,8530.0,5.0,3.0,2.0,52.0,8.0,662.0,...,6357062.0,0.009623,33.934742,1.714817,101015300.0,41801810.0,10277.0067,10018240.0,100004.0,18242010000.0


In [13]:
dev_data = load_data(dev_file)
dev_labels = dev_data['HasDetections']
dev_data = dev_data.drop(columns='HasDetections')
dev_data = dev_data.drop(columns='MachineIdentifier')

In [14]:
validate_data = load_data(validate_file)
validate_labels = validate_data['HasDetections']
validate_data = validate_data.drop(columns='HasDetections')
validate_data = validate_data.drop(columns='MachineIdentifier')

FileNotFoundError: [Errno 2] File b'data/mini_validate_encoded.csv' does not exist: b'data/mini_validate_encoded.csv'

In [15]:
all_cols = []
for item in train_data.dtypes.items():
    print("'{:}',".format(item[0]))
    all_cols.append(item[0])

'ProductName',
'EngineVersion',
'AppVersion',
'AvSigVersion',
'RtpStateBitfield',
'Platform',
'Processor',
'OsVer',
'OsPlatformSubRelease',
'OsBuildLab',
'SkuEdition',
'PuaMode',
'SmartScreen',
'Census_MDC2FormFactor',
'Census_DeviceFamily',
'Census_ProcessorClass',
'Census_PrimaryDiskTypeName',
'Census_ChassisTypeName',
'Census_PowerPlatformRoleName',
'Census_InternalBatteryType',
'Census_OSVersion',
'Census_OSArchitecture',
'Census_OSBranch',
'Census_OSEdition',
'Census_OSSkuName',
'Census_OSInstallTypeName',
'Census_OSWUAutoUpdateOptionsName',
'Census_GenuineStateName',
'Census_ActivationChannel',
'Census_FlightRing',
'RtpStateBitfield_wasna',
'DefaultBrowsersIdentifier_wasna',
'AVProductStatesIdentifier_wasna',
'AVProductsInstalled_wasna',
'AVProductsEnabled_wasna',
'CityIdentifier_wasna',
'OrganizationIdentifier_wasna',
'GeoNameIdentifier_wasna',
'IsProtected_wasna',
'SMode_wasna',
'IeVerIdentifier_wasna',
'Firewall_wasna',
'UacLuaenable_wasna',
'Census_OEMNameIdentifier_wasna',
'

In [16]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_cols = [c for c,v in train_data.dtypes.items() if v in numerics and c in train_data.columns]

In [17]:
nominal_cols = [c for c in train_data.columns if (c not in numeric_cols)]

In [18]:
binary_cols = [c for c in train_data.columns if (train_data[c].nunique() == 2 and c not in nominal_cols)]

In [19]:
unary_cols = [c for c in train_data.columns if (train_data[c].nunique() == 1 and c not in nominal_cols)]
unary_cols

['EngineVersion_major',
 'EngineVersion_minor',
 'AppVersion_major',
 'AvSigVersion_build2',
 'Census_OSVersion_major',
 'Census_OSVersion_minor',
 'Census_IsWIMBootEnabled']

In [None]:
[c for c in dev_data.columns if (dev_data[c].nunique() == 1 and c not in nominal_cols)]

In [None]:
[c for c in validate_data.columns if (validate_data[c].nunique() == 1 and c not in nominal_cols)]

In [None]:
# Numeric columns that are neither binary nor constant are treated as continuous.
# The exclusion set is built once: the previous np.concatenate call was re-evaluated
# for every column and relied on NumPy string/float promotion when a list was empty.
non_continuous_cols = set(binary_cols) | set(unary_cols)
continuous_cols = [c for c in numeric_cols if c not in non_continuous_cols]

In [None]:
#for c in unary_cols:
#    train_data = train_data.drop(columns=c)
#    test_data = test_data.drop(columns=c)
#    dev_data = dev_data.drop(columns=c)

In [None]:
set_of_cols =['EngineVersion_combined','EngineVersion','AppVersion_combined', 'AppVersion', 'AvSigVersion_combined', 'AvSigVersion', 'Census_OSVersion_combined', 'Census_OSVersion', 'OsVer_combined', 'OsVer', 'OsBuildLab_combined', 'OsBuildLab']
train_data[set_of_cols].head()

In [None]:
# no need to run until

#train_data = reduce_mem(train_data)
#train_data[set_of_cols].describe()

In [None]:
# Code to print a nicely formatted set of dtype definitions (for use when loading the sets again, as above)
for dtype in train_data.dtypes.items():
    print("'{:} '{:}',".format((dtype[0] + "':").ljust(54), dtype[1]))

In [None]:
set_of_cols = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'Census_OSVersion', 'OsVer', 'OsBuildLab']
for col in set_of_cols:
    # Show the first 50 categories of each version column.
    # Index.get_values() was removed in pandas 1.0, so to_numpy() is used instead.
    # astype('category') is a no-op for columns that are already categorical and lets
    # the cell run when load_data delivered the column as a plain integer dtype.
    print(train_data[col].astype('category').cat.categories[:50].to_numpy())
    
    
    

In [None]:
# Overlay, per device family, the histogram of the HasDetections label.
# The labels are attached to a temporary frame so train_data itself is not mutated
# (previously the column was added and then dropped again).
labelled_train = train_data.assign(HasDetections=train_labels)

# Named mask instead of `filter`, which shadowed the Python builtin for later cells.
# NOTE(review): load_data reads 'Platform' as int64, so comparing it with the string
# 'windows10' presumably keeps every row — confirm the intended platform code.
not_windows10 = labelled_train['Platform'] != 'windows10'

groups = labelled_train[not_windows10].groupby('Census_DeviceFamily').HasDetections

fig, ax = plt.subplots()

for k, v in groups:
    v.hist(label=k, alpha=.25, ax=ax, figsize=(48,48), bins=50)

ax.legend()
plt.show()

In [None]:
# RESUME RUNNING
print("Columns to review NaN values:")
train_data.isna().any()[lambda x: x]

In [None]:
# Let's get a feeling for range of values in the nominal categories

In [None]:
train_data[nominal_cols].describe().transpose()

In [None]:
# don't run encoding

# Encode nominal variables as integer labels.
# One encoder per column is fitted on the values of all three splits together, so a
# given category maps to the same integer code in train, dev and validate. Fitting a
# fresh encoder on each split gave the same category different codes in different splits.
# The third split is `validate_data`; `test_data` is never defined in this notebook.
from sklearn.preprocessing import LabelEncoder
splits = (train_data, dev_data, validate_data)
for col in nominal_cols:
    as_text = [split[col].astype(str) for split in splits]
    encoder = LabelEncoder().fit(pd.concat(as_text))
    for split, values in zip(splits, as_text):
        split[col] = encoder.transform(values)

In [None]:
train_data[nominal_cols].hist(figsize=(48,48), bins=50)
plt.show()

In [None]:
for i, col in enumerate(nominal_cols):
    print('The number of values for {:} = {:}'.format(col, (len(list(train_data[col].unique())))))

In [None]:
# Correlate every nominal column with the label and collect those whose
# correlation magnitude exceeds 0.05.
correlated_cols = []
for col in nominal_cols:
    corr = train_labels.corr(train_data[col])
    is_flagged = abs(corr) > 0.05
    if is_flagged:
        correlated_cols.append(col)
    # '(*)' marks the flagged columns in the printed report
    star = '(*)' if is_flagged else ''
    print('The correlation of {:} with the outcome variable is {:7.6f}{:}'.format(col, corr, star))

In [None]:
correlated_cols

In [None]:
train_data[continuous_cols].describe().transpose()

In [None]:
# Andrew's temporary draft for StandardScaler.  Pasting in Kevin's variation, as well:

#inverse_cols = [i for i in train_data.columns if i not in numeric_cols]
#features = train_data[numeric_cols]
#ct = ColumnTransformer([('transform', StandardScaler(), numeric_cols)], remainder='passthrough')
#ct.fit_transform(features)
#train_data_scaled = pd.DataFrame(features, index=train_data.index, columns=numeric_cols)
#inverse_data = pd.DataFrame(train_data[inverse_cols], index=train_data.index, columns=inverse_cols)
#frames = [train_data_scaled, inverse_data]
#train_data2 = pd.concat(frames, sort=False)
#train_data2.describe()

# Kevin's version:
# Standardise the continuous columns. The scaler is fitted on the training split only
# and then reused for the other splits, so all of them are scaled with the same mean
# and variance. Fitting a fresh scaler on each split made their feature values
# incomparable to the ones the models are trained on.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_data[continuous_cols].values)
train_data[continuous_cols] = scaler.transform(train_data[continuous_cols].values)

#from sklearn.preprocessing import StandardScaler
#train_data[numeric_cols] = data[[numeric_cols].apply(
#                           lambda x: StandardScaler().fit_transform(x))


dev_data[continuous_cols] = scaler.transform(dev_data[continuous_cols].values)

# The third split is `validate_data`; `test_data` is never defined in this notebook.
validate_data[continuous_cols] = scaler.transform(validate_data[continuous_cols].values)


#for i, col in enumerate(numeric_cols):
#    standard_scaler = StandardScaler().fit(train_data[col].values)
#    train_data[col] = standard_scaler.transform(train_data[col])
#    test_data[col] = standard_scaler.transform(test_data[col])



In [None]:
train_data[continuous_cols].hist(figsize=(48,48), bins=50)
plt.show()

In [None]:
train_data[binary_cols].describe().transpose()

In [None]:
train_data[binary_cols].hist(figsize=(48,48), bins=50)
plt.show()

In [None]:
# Resume starting here.

# Collect the numeric columns whose correlation with the label exceeds 0.05 in magnitude.
correlated_cols = []
for col in numeric_cols:
    corr = train_labels.corr(train_data[col])
    is_flagged = abs(corr) > 0.05
    if is_flagged:
        correlated_cols.append(col)
    # '(*)' marks the flagged columns (only used by the report line below)
    star = '(*)' if is_flagged else ''
#    print('The correlation of {:} with the outcome variable is {:7.6f}{:}'.format(col, corr, star))

In [None]:
# Impute missing values for correlated columns

#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(strategy = 'median')

#imputer.fit(train_data[correlated_cols])
#train_data_corr = imputer.transform(train_data[correlated_cols])
#dev_data_corr = imputer.transform(dev_data[correlated_cols])
#test_data_corr = imputer.transform(test_data[correlated_cols])


In [None]:
#Display distributions for correlated variables

import matplotlib.pyplot as plt
for i, col in enumerate(correlated_cols):
    #count = train_data[col].value_counts(sort=False).sort_values(ascending=False).nlargest(50)
    plt.figure(i)
    train_data[col].value_counts(sort=False).sort_values(ascending=False).nlargest(50).plot(kind='barh', figsize=(20,10), title=col)


In [None]:
correlated_cols

In [None]:
sns.set(rc={'figure.figsize':(20,20)})
sns.heatmap(train_data[correlated_cols].corr(), cmap='RdBu_r', annot=True, center=0.0)

In [None]:
sns.pairplot(train_data[correlated_cols])#, hue=train_labels)

In [None]:
# Be sure to run this

# Prune selected columns from correlated_cols and add the combined version features.
# Written so the cell can be re-run safely: list.remove() raises ValueError once a
# name is already gone, and a plain append() would add duplicates on every re-run.
cols_to_remove = [
    'EngineVersion_build1',
    'AvSigVersion',
    'Processor',
    'Census_OSArchitecture',
    'AppVersion_build1',
]
cols_to_add = [
    'AppVersion_combined',
    'AvSigVersion_combined',
]

for col in cols_to_remove:
    if col in correlated_cols:
        correlated_cols.remove(col)

for col in cols_to_add:
    if col not in correlated_cols:
        correlated_cols.append(col)

In [None]:
sns.heatmap(train_data[correlated_cols].corr(), cmap='RdBu_r', annot=True, center=0.0)

In [None]:
labels = pd.DataFrame()
labels["HasDetections"] = train_labels
df = pd.concat([train_data[correlated_cols], labels], axis=1)
sns.heatmap(df.corr(), cmap='RdBu_r', annot=True, center=0.0)

In [None]:
# Definitely run everything past here.  This is where the models run

train_data_corr = train_data[correlated_cols]
dev_data_corr = dev_data[correlated_cols]
validate_data_corr = validate_data[correlated_cols]

X_train = train_data_corr
y_train = train_labels
X_test = dev_data_corr
y_test = dev_labels

In [None]:
classifier =  KNeighborsClassifier(n_neighbors=9, n_jobs=-1)
classifier.fit(X_train, y_train)

pred_y = classifier.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.6, max_depth=3, random_state=0)
clf.fit(X_train, y_train)

# predict and score on the dev set
pred_y = clf.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# predict and score on the dev set
pred_y = clf.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
# install this once
!pip install lightgbm

In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

est = HistGradientBoostingClassifier(learning_rate=.1,
                max_iter=100,
                max_bins=100,
                max_leaf_nodes=31,
                n_iter_no_change=None,
                random_state=0,
                verbose=0)

est.set_params(loss='binary_crossentropy')

lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
lightgbm_est.fit(X_train, y_train)

# predict and score on the dev set
pred_y = lightgbm_est.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
#Another installable

!pip install xgboost

In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

est = HistGradientBoostingClassifier(learning_rate=.1,
                max_iter=100,
                max_bins=100,
                max_leaf_nodes=31,
                n_iter_no_change=None,
                random_state=0,
                verbose=0)

est.set_params(loss='binary_crossentropy')

lightgbm_est = get_equivalent_estimator(est, lib='xgboost')
lightgbm_est.fit(X_train, y_train)

# predict and score on the dev set
pred_y = lightgbm_est.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
#Another installable

!pip install catboost

In [None]:

# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

# `est` is never fitted in this cell; it is handed to get_equivalent_estimator
# to obtain the CatBoost counterpart that is actually trained.
est = HistGradientBoostingClassifier(learning_rate=.1,
                max_iter=100,
                max_bins=100,
                max_leaf_nodes=31,
                n_iter_no_change=None,
                random_state=0,
                verbose=0)

est.set_params(loss='binary_crossentropy')

# Named after the library it wraps (this model is CatBoost, not LightGBM).
catboost_est = get_equivalent_estimator(est, lib='catboost')
catboost_est.fit(X_train, y_train)

# predict and score on the dev set
pred_y = catboost_est.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline. random_state is pinned (as in the ExtraTrees and
# DecisionTree cells) so the reported numbers are reproducible across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# predict and score on the dev set
pred_y = clf.predict(X_test)

results = confusion_matrix(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)

print(results)
print(error)
print(accuracy)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees baseline with a fixed seed; fit() returns the estimator itself.
clf = ExtraTreesClassifier(
    n_estimators=100,
    min_samples_split=2,
    random_state=0,
).fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# Single unpruned decision tree with a fixed seed.
dtc = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=0,
).fit(X_train, y_train)

# Score the fitted tree on the held-out split.
pred_y = dtc.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with a raised iteration cap (max_iter=5000).
classifier = LinearSVC(max_iter=5000).fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

# Using SelectFromModel to get features

In [None]:
# Rank features with an extra-trees ensemble and let SelectFromModel keep the
# important ones. The seed is pinned so the selected feature set (and every
# result computed from it) does not change between runs.
fsel = ske.ExtraTreesClassifier(n_estimators=100, random_state=0).fit(train_data, train_labels)
# prefit=True: reuse the already-fitted ensemble instead of refitting it.
model = SelectFromModel(fsel, prefit=True)
X_train_data_new = model.transform(train_data)
X_test_data_new = model.transform(dev_data)

In [None]:
# Number of columns in the transformed training matrix, i.e. how many
# features the SelectFromModel transform kept.
nb_features = X_train_data_new.shape[1]
nb_features

In [None]:
# The experimental flag has to be imported before the estimator itself.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

# `est` is never fitted in this cell; it is handed to get_equivalent_estimator
# to obtain the LightGBM counterpart that is actually trained.
est = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=.1,
    max_iter=100,
    max_bins=100,
    max_leaf_nodes=31,
    n_iter_no_change=None,
    random_state=0,
    verbose=0,
)

# Train on the reduced feature matrix.
lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
lightgbm_est.fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = lightgbm_est.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on the reduced feature matrix; fit() returns the estimator.
clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.6,
    max_depth=3,
    random_state=0,
).fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = clf.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# The experimental flag has to be imported before the estimator itself.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

# Larger configuration than the earlier LightGBM cells (learning rate, bins,
# leaves). `est` is never fitted here; it is only converted to LightGBM.
est = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=.2,
    max_iter=100,
    max_bins=500,
    max_depth=-1,
    max_leaf_nodes=100,
    n_iter_no_change=None,
    random_state=0,
    verbose=0,
)

# Train on the reduced feature matrix.
lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
lightgbm_est.fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = lightgbm_est.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

# PCA Stuff

In [None]:
# PCA + GMM search helper; run below against the correlated set and other feature subsets

def run_PCA_GMM_analysis(param_threshold, X_train, X_test, y_train, y_test,
                         random_state=None):
    """Search PCA + per-class Gaussian-mixture classifiers under a parameter budget.

    For each covariance type and each (PCA components, GMM components) pair
    whose parameter count is <= ``param_threshold``, the training data is
    projected with PCA, one GaussianMixture is fitted on the rows labelled 1
    and another on the rows labelled 0, and every test row is predicted as 1
    when the positive mixture scores it at least as high as the negative one.
    The best configuration and a ranking of all trials are printed.

    Args:
        param_threshold: maximum number of model parameters allowed.
        X_train, X_test: feature matrices accepted by ``PCA``.
        y_train, y_test: binary labels (0 / 1) matching the feature matrices.
        random_state: optional seed forwarded to ``PCA`` and
            ``GaussianMixture``; the default ``None`` leaves both unseeded.

    Returns:
        None. Results are printed. If no combination fits the budget a short
        message is printed instead of a ranking.
    """
    C = 2 # number of classes will always be two for this experiment
    CT = ['spherical', 'diag', 'tied', 'full']

    def _param_count(cov_type, P, G):
        # Parameter count for P PCA components and G GMM components, from
        # https://stats.stackexchange.com/questions/280893/number-of-parameters-in-multivariate-gaussian-for-different-covariance-matrices/280896#280896
        # Works on scalars as well as on the meshgrid arrays below.
        if cov_type == 'spherical':
            return (P + 1) * G * C
        if cov_type == 'diag':
            return 2 * P * G * C
        if cov_type == 'tied':
            return ((P * G) + (P * (P + 1)) / 2) * C
        # 'full'
        return ((P * G) + (P * (P + 1) * G) / 2) * C

    # Cartesian grid of candidate PCA / GMM component counts.
    P, G = np.meshgrid(np.arange(1, param_threshold), np.arange(1, param_threshold))

    # store the winning hyperparameters
    winning_accuracy = 0.0
    winning_CT = None
    winning_PCA = None
    winning_GMM = None
    # one (p, g, covariance index, accuracy) record per trial
    trials = []

    # run through each experiment - spherical, diag, tied and full
    for i, cov_type in enumerate(CT):
        # combinations that stay within the budget for this covariance type
        experiment = _param_count(cov_type, P, G) <= param_threshold
        for p, g in np.column_stack((P[experiment], G[experiment])):
            pca = PCA(n_components=p, random_state=random_state)
            projected = pca.fit_transform(X_train)
            projected_test = pca.transform(X_test)

            gm_mod_pos = GaussianMixture(n_components=g, reg_covar=.0001,
                                         covariance_type=cov_type,
                                         random_state=random_state)
            gm_mod_pos.fit(projected[y_train==1])

            gm_mod_neg = GaussianMixture(n_components=g, reg_covar=.0001,
                                         covariance_type=cov_type,
                                         random_state=random_state)
            gm_mod_neg.fit(projected[y_train==0])

            prob_pos = gm_mod_pos.score_samples(projected_test)
            prob_neg = gm_mod_neg.score_samples(projected_test)

            winning_predictions = np.where(prob_pos >= prob_neg, 1, 0)
            accuracy = metrics.accuracy_score(y_test, winning_predictions)
            trials.append((p, g, i, accuracy))
            # The first trial always becomes the provisional winner, so a
            # winner exists even when every accuracy is 0.0.
            if winning_CT is None or accuracy > winning_accuracy:
                winning_accuracy = accuracy
                winning_CT = cov_type
                winning_PCA = p
                winning_GMM = g

    if not trials:
        # Nothing fits the budget: there is no winner to report.
        print("No PCA/GMM combination fits within {} parameters".format(
            param_threshold))
        return

    # parameter count of the winning configuration
    winning_params = _param_count(winning_CT, winning_PCA, winning_GMM)
    print("Best accuracy is {:5.4f} with {} parameters".format(
        winning_accuracy, winning_params))
    print("PCA components = {}, GMM components = {}, covariance_type = {}".format(
        winning_PCA, winning_GMM, winning_CT))

    # rank all trials by accuracy, best first
    trials = np.array(trials, dtype=float)
    trials = trials[trials[:,3].argsort()[::-1], :]
    print("\n| Rank | PCA | GMM | Cov. Type | Accuracy |")
    print("+------+-----+-----+-----------+----------+")
    for rank, trial in enumerate(trials, start=1):
        print("| {:4} | {:3} | {:3} | {} | {:9.4f}|".format(
            rank, int(trial[0]), int(trial[1]),
            CT[int(trial[2])].rjust(9), trial[3]))

In [None]:
# Columns dropped by hand even when they pass the correlation filter
# (the reason is not recorded in this notebook).
excluded_cols = {
    'EngineVersion_build1',
    'EngineVersion_combined',
    'Processor',
    'Census_OSArchitecture',
}

# Keep numeric columns whose correlation with the label exceeds 0.05 in
# absolute value. Filtering instead of list.remove() means the cell no longer
# raises ValueError when an excluded column failed the threshold anyway.
correlated_cols = [
    col for col in numeric_cols
    if col not in excluded_cols
    and abs(train_labels.corr(train_data[col])) > 0.05
]

train_data_corr = train_data[correlated_cols]
dev_data_corr = dev_data[correlated_cols]
test_data_corr = test_data[correlated_cols]

# Working split used by the cells that follow.
X_train = train_data_corr
y_train = train_labels
X_test = dev_data_corr
y_test = dev_labels

In [None]:
# PCA/GMM search on the current split with a budget of 29 parameters.
run_PCA_GMM_analysis(29, X_train, X_test, y_train, y_test)

In [None]:
# Default decision tree on the current split; fit() returns the estimator.
dtc = DecisionTreeClassifier().fit(X_train, y_train)

# Score the fitted tree on the held-out split.
pred_y = dtc.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:

# Copy of all_cols without the six "*_combined" version columns.
# list.remove raises ValueError if any of them is missing.
mycols = all_cols.copy()
for _combined_col in (
    'EngineVersion_combined',
    'AppVersion_combined',
    'AvSigVersion_combined',
    'Census_OSVersion_combined',
    'OsVer_combined',
    'OsBuildLab_combined',
):
    mycols.remove(_combined_col)


In [None]:
# Working split restricted to `mycols`. The median imputation and the
# Wdft_IsGamer==0 filter below are commented out, so neither is applied.
#impute for all columns and let PCA find the best dimensions
#imputer = SimpleImputer(strategy = 'median')
#imputer.fit(train_data)
#X_train = imputer.transform(train_data)[train_data['Wdft_IsGamer']==0]
#X_test = imputer.transform(dev_data)[dev_data['Wdft_IsGamer']==0]
X_train = train_data[mycols]#[train_data['Wdft_IsGamer']==0]
y_train = train_labels#[train_data['Wdft_IsGamer']==0]
X_test = dev_data[mycols]#[dev_data['Wdft_IsGamer']==0]
y_test = dev_labels#[dev_data['Wdft_IsGamer']==0]

In [None]:
# 2-nearest-neighbour classifier using all CPU cores (n_jobs=-1).
classifier = KNeighborsClassifier(n_neighbors=2, n_jobs=-1).fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with library defaults; fit() returns the estimator.
classifier = LinearSVC().fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# Display the dimensions of the current training matrix.
X_train.shape

In [None]:
# PCA/GMM search on the current split with a budget of 120 parameters.
run_PCA_GMM_analysis(120, X_train, X_test, y_train, y_test)

In [None]:
# Project the training features onto two principal components so the two
# classes can be inspected on a 2-D scatter plot.
pca = PCA(n_components=2)
projected = pca.fit_transform(X_train)

# Legend text for each label value (previously defined but never shown:
# the legend displayed the raw 0 / 1 values).
category = {0: 'not-detected', 1: 'has-detections'}

colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']

plt.figure(figsize=(15,10))

# zip stops after the two categories, so only the first two colors are used.
for color, (cat, cat_name) in zip(colors, category.items()):
    plt.scatter(projected[y_train==cat, 0], projected[y_train==cat, 1],
                color=color, alpha=.8, lw=2, label=cat_name)

plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Training data projected onto the first two principal components')
plt.legend(loc='best', shadow=False, scatterpoints=1)

plt.show()

# Feature reduction using ExtraTreesClassifier - experimental code

In [None]:
# Hand-written list of candidate feature columns.
# NOTE(review): the next cell assigns best_cols again, so under top-to-bottom
# execution this list is overwritten before it is used.
best_cols =[
    'SmartScreen',
    'AVProductsInstalled',
    'AVProductStatesIdentifier',
    'OsBuildLab_platform',
    'AppVersion_build1',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'EngineVersion_build2',
    'Census_TotalPhysicalRAM',
    'EngineVersion_build1',
    'Census_IsVirtualDevice',
    'AvSigVersion_minor',
    'SMode_wasna',
    'SMode',
    'RtpStateBitfield_wasna',
    'Wdft_IsGamer_wasna',
    'Wdft_RegionIdentifier_wasna',
    'Census_ProcessorClass',
    'Census_PrimaryDiskTotalCapacity_wasna'
]

In [None]:
# Second hand-written list of candidate feature columns; replaces any earlier
# value of best_cols.
# NOTE(review): the split cell that follows has its [best_cols] indexing
# commented out, so this list is not applied there.
best_cols = [
    'AVProductsInstalled',
    'AVProductStatesIdentifier',
    'OsBuildLab_platform',
    'AppVersion_combined',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'EngineVersion_combined',
    'IsProtected',
    'Census_IsVirtualDevice',
    'SMode_wasna',
    'IsSxsPassiveMode',
    'Census_FirmwareVersionIdentifier_wasna',
    'Census_DeviceFamily',
    'HasTpm',
    'Wdft_IsGamer_wasna',
    'OsVer_combined'
]

In [None]:
# Working split over every column of train_data / dev_data; the best_cols
# subset is commented out and therefore not applied.
X_train = train_data#[best_cols]
y_train = train_labels
X_test = dev_data#[best_cols]
y_test = dev_labels

In [None]:
# Rank features with an extra-trees ensemble and let SelectFromModel keep the
# important ones. The seed is pinned so the selected feature set (and
# nb_features) does not change between runs.
fsel = ske.ExtraTreesClassifier(random_state=0).fit(X_train, y_train)
# prefit=True: reuse the already-fitted ensemble instead of refitting it.
model = SelectFromModel(fsel, prefit=True)
X_train_data_new = model.transform(X_train)
X_test_data_new = model.transform(X_test)
# Number of features that survived the selection.
nb_features = X_train_data_new.shape[1]
nb_features

In [None]:
# 2-nearest-neighbour classifier on the reduced feature matrix.
classifier = KNeighborsClassifier(n_neighbors=2, n_jobs=-1).fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = classifier.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with library defaults on the reduced feature matrix.
classifier = LinearSVC().fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = classifier.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# PCA/GMM search on the reduced feature matrices with a budget of 60 parameters.
run_PCA_GMM_analysis(60, X_train_data_new, X_test_data_new, y_train, y_test)

In [None]:
# Default decision tree on the reduced feature matrix.
dtc = DecisionTreeClassifier().fit(X_train_data_new, y_train)

# Score on the reduced held-out matrix.
pred_y = dtc.predict(X_test_data_new)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# Default decision tree on the unreduced X_train, for comparison.
dtc = DecisionTreeClassifier().fit(X_train, y_train)

# Score the fitted tree on the held-out split.
pred_y = dtc.predict(X_test)
error, accuracy = zero_one_loss(y_test, pred_y), metrics.accuracy_score(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

# Feature reduction using Ridge Regression (and trying Ensemble Boosting on the features)

In [None]:
# Point the working split back at the full training / dev frames.
X_train = train_data
y_train = train_labels
X_test = dev_data
y_test = dev_labels

In [None]:
# Feature reduction using Ridge regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination around an L2-regularised logistic regression,
# keeping 15 features.
model = LogisticRegression(penalty='l2', solver='liblinear',
                                multi_class='ovr', C=0.18, max_iter=1000)
# n_features_to_select must be passed by keyword: current scikit-learn
# releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=15)
# fit() returns the RFE object, so `model` is the fitted selector from here
# on (later cells read model.support_ / model.ranking_).
model = rfe.fit(X_train, y_train)

print("Num Features: %s" % (model.n_features_))
print("Selected Features: %s" % (model.support_))
print("Feature Ranking: %s" % (model.ranking_))

# Ridge regression on the same data; its coefficients are printed further down.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)


def pretty_print_coefs(coefs, names = None, sort = False):
    """Format coefficients as a readable linear expression.

    Args:
        coefs: sequence of numeric coefficients.
        names: optional sequence of labels, one per coefficient (list or
            array). When omitted, labels "X0", "X1", ... are generated.
        sort: when True, terms are ordered by decreasing absolute coefficient.

    Returns:
        A string such as "0.5 * X0 + 0.25 * X1", with each coefficient
        rounded to three decimals.
    """
    # Identity check: `names == None` is evaluated element-wise for array-like
    # names and cannot be used as a condition.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)

# Print the fitted ridge coefficients paired with the names in all_cols
# (assumes all_cols lists the columns of X_train in order - TODO confirm).
print ("Ridge model:", pretty_print_coefs(ridge.coef_, names=all_cols))


#X_train_data_new = model.transform(X_train)
#X_test_data_new = model.transform(X_test)

In [None]:
# model.support_ is a boolean mask with one entry per input column (True =
# kept by RFE). model.ranking_ holds ranks, not column positions, so it must
# not be used as an index into support_ or all_cols.
# Assumes all_cols lists the columns RFE was fitted on, in order - TODO confirm.
assert len(all_cols) == len(model.support_), "all_cols does not match the RFE input width"

new_cols = [col_name for col_name, kept in zip(all_cols, model.support_) if kept]
for selected_col in new_cols:
    print(selected_col)

In [None]:
# Restrict the train / dev features to the columns collected in new_cols.
X_train = train_data[new_cols]
y_train = train_labels
X_test = dev_data[new_cols]
y_test = dev_labels

In [None]:
# The experimental flag has to be imported before the estimator itself.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (get_equivalent_estimator)

# `est` is never fitted in this cell; it is handed to get_equivalent_estimator
# to obtain the LightGBM counterpart that is actually trained.
est = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=.1,
    max_iter=100,
    max_bins=100,
    max_leaf_nodes=31,
    n_iter_no_change=None,
    random_state=0,
    verbose=0,
)

lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
lightgbm_est.fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = lightgbm_est.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# L2-regularised logistic regression (liblinear solver) on the current split.
# Note: this rebinds `model` to the fitted classifier.
model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    multi_class='ovr',
    C=0.18,
    max_iter=1000,
).fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

# Misc

In [None]:
# 2-nearest-neighbour classifier using all CPU cores (n_jobs=-1).
classifier = KNeighborsClassifier(n_neighbors=2, n_jobs=-1).fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = classifier.predict(X_test)
error, accuracy = zero_one_loss(y_test, pred_y), metrics.accuracy_score(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with library defaults; fit() returns the estimator.
classifier = LinearSVC().fit(X_train, y_train)

# Score the fitted model on the held-out split.
pred_y = classifier.predict(X_test)
error, accuracy = zero_one_loss(y_test, pred_y), metrics.accuracy_score(y_test, pred_y)
results = confusion_matrix(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")

In [None]:
# PCA/GMM search on the current split with a budget of 33 parameters.
run_PCA_GMM_analysis(33, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.decomposition import PCA

# Use PCA to reduce dimensionality so we can visualize the dataset on a 2d plot
pca = PCA(n_components=2)
train_x_pca_cont = pca.fit_transform(X_train)

# Legend text for each label value (previously defined but never shown:
# the legend displayed the raw 0 / 1 values).
category = {0: 'not-detected', 1: 'has-detections'}

colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']

plt.figure(figsize=(15,10))

# zip stops after the two categories, so only the first two colors are used.
for color, (cat, cat_name) in zip(colors, category.items()):
    plt.scatter(train_x_pca_cont[y_train==cat, 0], train_x_pca_cont[y_train==cat, 1],
                color=color, alpha=.8, lw=2, label=cat_name)

plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Training data projected onto the first two principal components')
plt.legend(loc='best', shadow=False, scatterpoints=1)

plt.show()

In [None]:
# Default decision tree on the current split; fit() returns the estimator.
dtc = DecisionTreeClassifier().fit(X_train, y_train)

# Score the fitted tree on the held-out split.
pred_y = dtc.predict(X_test)
results = confusion_matrix(y_test, pred_y)
accuracy = metrics.accuracy_score(y_test, pred_y)
error = zero_one_loss(y_test, pred_y)

# Confusion matrix, then misclassification rate, then accuracy.
print(results, error, accuracy, sep="\n")