## Basic Filter Methods plus LASSO pipeline
### Putting it all together!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score

In [2]:
# Load the full dataset; it holds the feature columns plus the 'target' label column.
data = pd.read_csv('dataset_1.csv')
data.shape

(50000, 301)

In [3]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

**Copy of the dataset!**

In [4]:
# Snapshot the untouched splits so the unfiltered feature set can be scored later.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

### Remove constant features

In [5]:
# A feature is treated as constant when its standard deviation on the
# training set is exactly zero; the decision is made on train data only
# and then applied to both splits.
constant_features = []
for feat in X_train.columns:
    if X_train[feat].std() == 0:
        constant_features.append(feat)

X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)
X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Remove quasi-constant features

In [6]:
# Flag quasi-constant features: those whose training-set variance falls below the threshold.
sel = VarianceThreshold(
    threshold=0.01)  # for a binary feature, a variance of ~0.01 means ~99% of observations share one value
sel.fit(X_train)  # fit computes the per-feature variances on the training set only
sum(sel.get_support()) # number of features that are NOT quasi-constant (i.e. kept)

215

In [7]:
# Names of the columns that pass the variance filter; saved now because
# sel.transform() returns a NumPy array without column labels.
features_to_keep = X_train.columns[sel.get_support()]

**Remove features!**

In [8]:
# Apply the fitted selector to both splits. From here on X_train / X_test are
# NumPy arrays (column names and row index are gone) until they are rebuilt
# as DataFrames in the next cell.
# NOTE: this cell rebinds its own inputs, so it is not safe to re-run in
# isolation -- the selector was fitted on 266 columns and would be fed 215.
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((35000, 215), (15000, 215))

**Transform the NumPy arrays to dataframes!**

In [9]:
# Rebuild DataFrames from the NumPy arrays returned by sel.transform().
# Column names come from features_to_keep. The row index is re-attached from
# the matching label Series: a bare pd.DataFrame(array) would reset it to
# 0..n-1 while y_train / y_test keep the shuffled index from the split, so any
# index-aligned pandas operation (concat, boolean masks built from y, ...)
# would silently pair features with the wrong labels.
X_train = pd.DataFrame(X_train, columns=features_to_keep, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=features_to_keep, index=y_test.index)

### Remove duplicated features

In [10]:
# Find columns that are exact copies of an earlier column (training set only).
# The first occurrence is kept; every later identical column is recorded once.
duplicated_feat = []
already_flagged = set()  # columns already known to duplicate an earlier one
all_columns = X_train.columns
for i, col_1 in enumerate(all_columns):
    if i % 10 == 0:  # coarse progress indicator for the pairwise scan
        print(i)
    if col_1 in already_flagged:
        # col_1 equals an earlier column, so every later twin of col_1 was
        # already recorded when that earlier column was scanned.
        continue
    for col_2 in all_columns[i + 1:]:
        if col_2 in already_flagged:
            continue  # avoid listing the same column twice
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
            already_flagged.add(col_2)
len(duplicated_feat)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210


10

**Remove duplicated features!**

In [11]:
# Drop the duplicated columns from both splits (the list was derived from train only).
X_train = X_train.drop(columns=duplicated_feat)
X_test = X_test.drop(columns=duplicated_feat)
X_train.shape, X_test.shape

((35000, 205), (15000, 205))

**Copy the dataset without constant, quasi-constant and duplicated variables to measure the performance of machine learning models!**

In [12]:
# Snapshot the splits after the basic filters (constant, quasi-constant, duplicated).
X_train_basic_filter, X_test_basic_filter = X_train.copy(), X_test.copy()

### Remove correlated features

In [13]:
def correlation(dataset, threshold):
    """Return the names of columns highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. A column is reported when the absolute value of its
    correlation with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of each correlated pair is
    never reported for that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose columns are compared.
    threshold : float
        Absolute-correlation cut-off (strict ``>`` comparison).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    # Strict lower triangle == "row i against every column j < i".
    # NaN correlations compare as False, so they never flag a column.
    above_cutoff = np.tril(abs_corr.to_numpy() > threshold, k=-1)
    flagged_rows = above_cutoff.any(axis=1)
    return set(abs_corr.columns[flagged_rows])
# Flag features whose absolute correlation with an earlier feature exceeds 0.8.
corr_features = correlation(X_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  93


In [14]:
# Remove the flagged correlated columns from both splits.
correlated_columns = list(corr_features)
X_train = X_train.drop(columns=correlated_columns)
X_test = X_test.drop(columns=correlated_columns)
X_train.shape, X_test.shape

((35000, 112), (15000, 112))

**Copy the dataset without correlated features!**

In [15]:
# Snapshot the splits after the correlation filter.
X_train_corr, X_test_corr = X_train.copy(), X_test.copy()

### Remove features using Lasso

In [16]:
# Fit a standard scaler on the training features only; the fitted scaler is
# reused in the next cell to scale the input of the L1-penalised selector.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

**Fit an L1-penalised (Lasso) logistic regression and select features — make sure to set `penalty='l1'`! Then remove the features with zero coefficients from the dataset and convert the result back to a dataframe!**

In [17]:
# Embedded selection: fit an L1-penalised logistic regression on the scaled
# training data and keep the features SelectFromModel marks as important.
sel_ = SelectFromModel(
    LogisticRegression(C=0.5,
                       penalty='l1',
                       solver='liblinear',
                       random_state=10))
sel_.fit(scaler.transform(X_train), y_train)

# The selector was fitted on an unnamed NumPy array, so calling
# sel_.transform() on the DataFrames triggers a feature-name mismatch warning
# in recent scikit-learn versions and returns arrays that must be re-wrapped
# (losing the row index). transform() only picks columns, so indexing with the
# support mask yields the same values while keeping column names and index.
support_mask = sel_.get_support()
X_train_lasso = X_train.loc[:, support_mask].copy()
X_test_lasso = X_test.loc[:, support_mask].copy()

In [18]:
# Check how many features survived the L1-based selection.
X_train_lasso.shape, X_test_lasso.shape

((35000, 90), (15000, 90))

### Compare the performance of Logistic Regression with the different feature subsets

**Create a function to train logistic regression and compare performance in train and test set!**

In [20]:
def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a scaled logistic regression and print train and test ROC-AUC.

    A ``StandardScaler`` is fitted on ``X_train`` and applied to both splits.
    A ``LogisticRegression`` is trained on the scaled training data, and the
    ROC-AUC of the predicted probability of the second class (column 1 of
    ``predict_proba``) is printed for the train set and then the test set.
    Nothing is returned.
    """
    scaler = StandardScaler().fit(X_train)
    logit = LogisticRegression(random_state=44, max_iter=500)
    logit.fit(scaler.transform(X_train), y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        probabilities = logit.predict_proba(scaler.transform(features))
        print('Logistic Regression roc-auc: {}'.format(
            roc_auc_score(labels, probabilities[:, 1])))

**Original!**

In [21]:
# Baseline: all original features.
run_logistic(X_train_original, X_test_original, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8028294003037697
Test set
Logistic Regression roc-auc: 0.7951005875022253


**Filter methods - basic!**

In [22]:
# After removing constant, quasi-constant and duplicated features.
run_logistic(X_train_basic_filter, X_test_basic_filter, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8022720222681473
Test set
Logistic Regression roc-auc: 0.7947452037606925


**Filter methods - correlation!**

In [23]:
# After additionally removing correlated features.
run_logistic(X_train_corr, X_test_corr, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7942736215203425
Test set
Logistic Regression roc-auc: 0.7881862599647895


**Embedded methods - Lasso!**

In [24]:
# After the L1-based (embedded) feature selection.
run_logistic(X_train_lasso, X_test_lasso, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7941637150789304
Test set
Logistic Regression roc-auc: 0.7882337348920934


As you can see, with these procedures **we reduced the feature space quite a bit (from 300 to 90 features), without losing model performance dramatically (test ROC-AUC went from about 0.795 to about 0.788).**