# ***Data Encoding***
Not all machine learning methods can handle all sorts of features (expicitly the categorical features). Therefor features are encoded to ensure a smooth processing. This file encode train and test data.

In the following features are encoded by one of two methods:

Label_Encoding: Label Encoding transform values to become numbers between  0  and  𝑛−1 , where  𝑛  is the number of different labels Frequency_Encoding: Frequency Encoding is a special case of Label Encoding. Feature values are encoded based on there frequency. Transformed values are numbers between  0  and  𝑚 , where  𝑚  is the number of values with a frequency greater or equal than 2. The resulting encoded data is stored under ../data/train_endcoding.csv and ../data/test_encoding.csv.

# **import libararies**

In [2]:
import pandas as pd
import numpy as np
%pip install dask[dataframe]
import dask.dataframe as dd
from sklearn import preprocessing
from tqdm import tqdm
import gc
import sys
import pickle
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string
if not sys.warnoptions:
    warnings.simplefilter("ignore")

Collecting partd>=0.3.10; extra == "dataframe"
  Downloading https://files.pythonhosted.org/packages/44/e1/68dbe731c9c067655bff1eca5b7d40c20ca4b23fd5ec9f3d17e201a6f36b/partd-1.1.0-py3-none-any.whl
Collecting fsspec>=0.6.0; extra == "dataframe"
[?25l  Downloading https://files.pythonhosted.org/packages/d3/66/974e01194980d9780cc09724315111f9cccba26b4351552fdb4d97eb842e/fsspec-0.8.0-py3-none-any.whl (85kB)
[K     |████████████████████████████████| 92kB 5.1MB/s 
Collecting locket
  Downloading https://files.pythonhosted.org/packages/d0/22/3c0f97614e0be8386542facb3a7dcfc2584f7b83608c02333bced641281c/locket-0.2.0.tar.gz
Building wheels for collected packages: locket
  Building wheel for locket (setup.py) ... [?25l[?25hdone
  Created wheel for locket: filename=locket-0.2.0-cp36-none-any.whl size=4040 sha256=7b48aae45735acd67b22df8ac7474f20364488355e3d40175cef1115d20ff1d9
  Stored in directory: /root/.cache/pip/wheels/26/1e/e8/4fa236ec931b1a0cdd61578e20d4934d7bf188858723b84698
Successfully

In [6]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [None]:
def convert_types(df):
    # Convert data types to reduce memory
    for c in df:
        col_type = str(df[c].dtypes)
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        
        # Convert objects to category
        if col_type == 'object':
            df[c] = df[c].astype('category')
        
        # numerics
        elif col_type in numerics:
            c_min = df[c].min()
            c_max = df[c].max()
            if col_type[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[c] = df[c].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[c] = df[c].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[c] = df[c].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[c] = df[c].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[c] = df[c].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[c] = df[c].astype(np.float32)
                else:
                    df[c] = df[c].astype(np.float64)  
        
    return df

## ***import data***

In [None]:
# target column
target = 'HasDetections'
# id from data set
data_id = 'MachineIdentifier'

In [None]:
# load train data set
# use dask to load faster
file = '/content/drive/My Drive/microsoft/train_featureengineering.csv'
ddf = dd.read_csv(file, dtype = dtypes)
train = ddf.compute()

In [None]:
 train.shape

In [None]:
train = convert_types(train)

In [None]:
# load test data set
# use dask to load faster
file = '/content/drive/My Drive/microsoft/test_featureengineering.csv'
ddf = dd.read_csv(file,dtype=dtypes)
test = ddf.compute()

In [None]:
test.shape

In [None]:
test['HasDetections']=np.nan

In [None]:
test['Census_ChassisTypeName'] = test['Census_ChassisTypeName'].fillna('unknown')

In [None]:
test.shape

In [None]:
test=convert_types(test)

In [None]:
# Combine train- and test-data
frames = [train, test]
df = pd.concat(frames)
df.shape

In [None]:
# Optimize the memory
gc.enable()
del test, train
gc.collect()

0

In [None]:
df = convert_types(df)

In [None]:
df.shape

(16774736, 78)

# **Data Encoding**

In [None]:
list_frequency_encoding = ['AppVersion',
 'AvSigVersion',
 'Census_OSVersion',
 'EngineVersion',
 'OsBuildLab']

In [None]:
correct_feature_by_hand = ['AppVersion',
 'AvSigVersion',
 'Census_ActivationChannel',
 'Census_ChassisTypeName',
 'Census_DeviceFamily',
 'Census_FlightRing',
 'Census_GenuineStateName',
 'Census_MDC2FormFactor',
 'Census_OSArchitecture',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSInstallTypeName',
 'Census_OSSkuName',
 'Census_OSVersion',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_PowerPlatformRoleName',
 'Census_PrimaryDiskTypeName',
 'EngineVersion',
 'MachineIdentifier',
 'OsBuildLab',
 'OsPlatformSubRelease',
 'OsVer',
 'Platform',
 'Processor',
 'SkuEdition',
 'SmartScreen']

In [None]:

# new features from feature engineering
new_features_labelencode = ['monitor_dims', 'SmartScreen_AVProductsInstalled']

In [None]:
list_label_encoding = list(set(correct_feature_by_hand)-set(list_frequency_encoding))
list_label_encoding.remove(data_id)
list_label_encoding.extend(new_features_labelencode)

In [None]:
list_label_encoding

['OsVer',
 'Census_GenuineStateName',
 'Census_ChassisTypeName',
 'SmartScreen',
 'Census_OSInstallTypeName',
 'Census_OSEdition',
 'Census_MDC2FormFactor',
 'Census_FlightRing',
 'Census_OSWUAutoUpdateOptionsName',
 'OsPlatformSubRelease',
 'Census_PowerPlatformRoleName',
 'Census_OSArchitecture',
 'Census_DeviceFamily',
 'Census_OSBranch',
 'Census_OSSkuName',
 'Census_PrimaryDiskTypeName',
 'Census_ActivationChannel',
 'Platform',
 'Processor',
 'SkuEdition',
 'monitor_dims',
 'SmartScreen_AVProductsInstalled']

In [None]:
def frequency_encoding(feature):
    # Count the number of values of each feature and reset the indices
    t = df[feature].value_counts().reset_index()
    # Building up a new index (old index is set by default to 'level_0')
    t = t.reset_index()
    # Set the old index 'level_0' for all values, which only occur once, to NaN
    t.loc[t[feature] == 1, 'level_0'] = np.nan
    # Reset the original index (= the value name) as index
    t.set_index('index', inplace=True)
    # return the number of values , which occur two or more times, +1
    max_label = t['level_0'].max() + 1
    # fill all nan-values to max_label
    t.fillna(max_label, inplace=True)
    return t.to_dict()['level_0']

In [None]:
# Creating a dictionary for storing the encoder
enc_dict = dict();

In [None]:
# Encode all the features in 'list_frequency_encoding'
for feature in tqdm(list_frequency_encoding):
    freq_enc_dict = frequency_encoding(feature)
    df[feature] = df[feature].map(lambda x: freq_enc_dict.get(x, np.nan))
    df[feature] = df[feature].astype('int64')    
    # Save the freq_enc_dict
    enc_dict[feature] = freq_enc_dict

100%|██████████| 5/5 [00:01<00:00,  2.54it/s]


In [None]:
for feature in tqdm(list_label_encoding):
    print(feature)
    le = preprocessing.LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    df[feature] = df[feature].astype('int64') 
    # Save the freq_enc_dict
    enc_dict[feature] = le

  0%|          | 0/22 [00:00<?, ?it/s]

OsVer


  5%|▍         | 1/22 [00:02<01:02,  2.98s/it]

Census_GenuineStateName


  9%|▉         | 2/22 [00:05<00:59,  2.98s/it]

Census_ChassisTypeName


 14%|█▎        | 3/22 [00:08<00:56,  2.95s/it]

SmartScreen


 18%|█▊        | 4/22 [00:11<00:52,  2.94s/it]

Census_OSInstallTypeName


 23%|██▎       | 5/22 [00:14<00:50,  2.94s/it]

Census_OSEdition


 27%|██▋       | 6/22 [00:17<00:46,  2.92s/it]

Census_MDC2FormFactor


 32%|███▏      | 7/22 [00:20<00:44,  2.96s/it]

Census_FlightRing


 36%|███▋      | 8/22 [00:23<00:40,  2.92s/it]

Census_OSWUAutoUpdateOptionsName


 41%|████      | 9/22 [00:26<00:37,  2.90s/it]

OsPlatformSubRelease


 45%|████▌     | 10/22 [00:29<00:34,  2.90s/it]

Census_PowerPlatformRoleName


 50%|█████     | 11/22 [00:32<00:31,  2.89s/it]

Census_OSArchitecture


 55%|█████▍    | 12/22 [00:34<00:28,  2.90s/it]

Census_DeviceFamily


 59%|█████▉    | 13/22 [00:37<00:26,  2.90s/it]

Census_OSBranch


 64%|██████▎   | 14/22 [00:40<00:23,  2.90s/it]

Census_OSSkuName


 68%|██████▊   | 15/22 [00:43<00:20,  2.91s/it]

Census_PrimaryDiskTypeName


 73%|███████▎  | 16/22 [00:46<00:17,  2.92s/it]

Census_ActivationChannel


 77%|███████▋  | 17/22 [00:49<00:14,  2.98s/it]

Platform


 82%|████████▏ | 18/22 [00:52<00:11,  2.95s/it]

Processor


 86%|████████▋ | 19/22 [00:55<00:08,  2.92s/it]

SkuEdition


 91%|█████████ | 20/22 [00:58<00:05,  2.97s/it]

monitor_dims


 95%|█████████▌| 21/22 [01:01<00:02,  3.00s/it]

SmartScreen_AVProductsInstalled


100%|██████████| 22/22 [01:04<00:00,  2.93s/it]


In [None]:
test = df[df[target].isnull()]
test.shape

(7853253, 78)

In [None]:
train=df[df[target].notnull]
train.shape

In [None]:
# Save the encoded data-files
train.to_csv('/content/drive/My Drive/microsoft/train_labelencoding_sample.csv', index = False)
test.to_csv('/content/drive/My Drive/microsoft/test_labelencoding_sample.csv', index = False)

### **Dimensionality Reduction**
Since after one hot encoding, the dimension of the design matrix becomes huge such that training our model becomes computationally expensive. We use only a set of selected features for training our model. In stage 1 we use data_vars fumction to give us an initial estimate of the importance of features. After that we threshold the importance and remove the features which lie below a certain threshold. In second step we use iterate_vif function to remove highly coorelated indepedent variables

**Taking the sample of population with equal distribution of HasDetections**

In [8]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
train_1=train[train['HasDetections']==1].sample(100000,random_state=42)
train_0=train[train['HasDetections']==0].sample(100000,random_state=42)
train_sample=pd.concat([train_1,train_0])

In [10]:
# target column
target = 'HasDetections'
# id from data set
data_id = 'MachineIdentifier'

In [11]:
y_train=train_sample[target]
X_train=train_sample.drop([target,data_id],axis=1)

In [12]:
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [13]:
final_iv, IV = data_vars(X_train, y_train)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
features = list(IV[(IV['IV'] >= 0.01) & (IV['IV'] <= 0.8)]['VAR_NAME'])
X2 = X_train[features]

In [15]:
X2.shape

(200000, 17)

In [16]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
  count = 0
  while max_vif > vif_threshold:
    count += 1
    print("Iteration # "+str(count))
    vif = pd.DataFrame()
    vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif["features"] = df.columns
    
    if vif['VIFactor'].max() > vif_threshold:
      print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], vif['VIFactor'].max()))
      df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
      max_vif = vif['VIFactor'].max()
    else:
        print('Complete')
        return df, vif.sort_values('VIFactor')
 
final_df, final_vif = iterate_vif(X2)

Iteration # 1
Removing Processor with VIF of 102.676802
Iteration # 2
Removing AVProductsEnabled with VIF of 31.212012
Iteration # 3
Removing IsProtected with VIF of 13.556412
Iteration # 4
Removing SmartScreen_AVProductsInstalled with VIF of 9.490429
Iteration # 5
Removing Census_PowerPlatformRoleName with VIF of 6.787455
Iteration # 6
Complete


In [17]:
X1 = X2._get_numeric_data()
X_train, final_vif = iterate_vif(X1)

Iteration # 1
Removing Processor with VIF of 102.676802
Iteration # 2
Removing AVProductsEnabled with VIF of 31.212012
Iteration # 3
Removing IsProtected with VIF of 13.556412
Iteration # 4
Removing SmartScreen_AVProductsInstalled with VIF of 9.490429
Iteration # 5
Removing Census_PowerPlatformRoleName with VIF of 6.787455
Iteration # 6
Complete


In [18]:
X_train.shape

(200000, 12)

In [19]:
X_test=test[X_train.columns]
X_test.shape

(7853253, 12)

In [22]:
X_train.to_csv('/content/drive/My Drive/label_encoding/X_train.csv', index = False)
y_train.to_csv('/content/drive/My Drive/label_encoding/y_train.csv', index = False)
X_test.to_csv('//content/drive/My Drive/label_encoding/X_test.csv', index = False)