## Libraries used

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFpr, f_classif, \
mutual_info_classif, SelectFromModel
import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X_train = train.iloc[:,2:]
y_train = train.iloc[:,1]
X_test = test.iloc[:,1:]

## Generate output csv for submission

In [3]:
def genop(pred):
    df = pd.DataFrame()
    df['id'] = test['id']
    df['target'] = pred
    df.to_csv('output.csv', index=False)

## Check correlation of features

In [4]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(X_train, 5))

Top Absolute Correlations
32   75     0.259315
101  193    0.252825
22   28     0.246062
140  187    0.244012
79   187    0.243916
dtype: float64


As we can see the features have no correlation

## Baseline Linear Model (logistic regression)

In [5]:
cont_scale_pipeline = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())
pipe_scale = make_pipeline(cont_scale_pipeline, LogisticRegression())
scores_pipe_scale = cross_val_score(pipe_scale, X_train, y_train, cv=3, scoring='roc_auc')
model = pipe_scale.fit(X_train, y_train)
pred = model.predict(X_test)
print("The auc-roc scores are ",scores_pipe_scale)
genop(pred)

The auc-roc scores are  [0.72962963 0.71069182 0.65849057]


This model got a public score of 0.666 on Kaggle

## Baseline Linear Model (Linear SVC)

In [6]:
cont_scale_pipeline = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())
pipe_scale = make_pipeline(cont_scale_pipeline, LinearSVC())
scores_pipe_scale = cross_val_score(pipe_scale, X_train, y_train, cv=3, scoring='roc_auc')
model = pipe_scale.fit(X_train, y_train)
pred = model.predict(X_test)
print("The auc-roc scores are ",scores_pipe_scale)
genop(pred)

The auc-roc scores are  [0.74135802 0.70377358 0.62515723]


This model got a public score of 0.656 on Kaggle

## Feature Selection 

### SelectKbest technique (K=35)

In [7]:
select = SelectKBest(k=35, score_func=f_classif)
select.fit(X_train, y_train)
print(X_train.shape)
print(select.transform(X_train).shape)
up_X_train = pd.DataFrame(select.transform(X_train))
up_X_test = pd.DataFrame(select.transform(X_test))


cont_scale_pipeline = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())
pipe_scale = make_pipeline(cont_scale_pipeline, LogisticRegression(C=0.1))
scores_pipe_scale = cross_val_score(pipe_scale, X_train, y_train, cv=3, scoring='roc_auc')
model = pipe_scale.fit(up_X_train, y_train)
pred = model.predict(up_X_test)
print("The auc-roc scores are ",scores_pipe_scale)
genop(pred)


(250, 300)
(250, 35)
The auc-roc scores are  [0.7308642  0.70628931 0.67861635]


This model got a public score of 0.722 on Kaggle

### SelectKbest technique (K=75)

In [8]:
select = SelectKBest(k=75, score_func=f_classif)
select.fit(X_train, y_train)
print(X_train.shape)
print(select.transform(X_train).shape)
up_X_train = pd.DataFrame(select.transform(X_train))
up_X_test = pd.DataFrame(select.transform(X_test))


cont_scale_pipeline = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())
pipe_scale = make_pipeline(cont_scale_pipeline, LogisticRegression(C=0.1))
scores_pipe_scale = cross_val_score(pipe_scale, X_train, y_train, cv=3, scoring='roc_auc')
model = pipe_scale.fit(up_X_train, y_train)
pred = model.predict(up_X_test)
print("The auc-roc scores are ",scores_pipe_scale)
genop(pred)


(250, 300)
(250, 75)
The auc-roc scores are  [0.7308642  0.70628931 0.67861635]


This model got a public score of 0.706 on Kaggle