# Load libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as auc

# Load and prepare data

In [14]:
# Read the raw competition files from the working directory.
# train.csv carries the labelled rows, test.csv the rows to be scored.
train_raw = pd.read_csv("train.csv")
test_raw = pd.read_csv("test.csv")

In [16]:
# Split the training frame into features (X), target (y) and row ids.
# Work on a copy so train_raw stays untouched and this cell can be re-run.
X = train_raw.copy()
y = X.pop('target')
train_id = X.pop('id')

# Same for the test frame; only the 'id' column is removed here.
Xt = test_raw.copy()
test_id = Xt.pop('id')

# Stack train on top of test so both parts are encoded with one shared set
# of columns. DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; pd.concat is the supported equivalent and keeps the same row
# order (all train rows first, then all test rows).
Xall = pd.concat([X, Xt], sort=False)

print(Xall.shape, X.shape, Xt.shape)

(500000, 23) (300000, 23) (200000, 23)


# One-Hot encode data

In [12]:
# One-hot encode every column of the stacked train+test frame.
# sparse=True keeps the dummies in sparse columns; drop_first=True drops the
# first level of each original column.
Xohe = pd.get_dummies(Xall, columns=Xall.columns, sparse=True, drop_first=True)
# Convert the sparse DataFrame into a SciPy sparse matrix for scikit-learn.
Xohe = Xohe.sparse.to_coo().tocsc()

# Xall was built with the training rows first, so the first X.shape[0] rows
# are the training part and the remainder is the test part. The split point
# must come from X: `train` does not exist yet when this cell runs on a
# fresh kernel, so reading train.shape[0] here raised a NameError.
n_train = X.shape[0]
train = Xohe[:n_train, :]
test = Xohe[n_train:, :]

print(Xohe.shape, train.shape, test.shape)

(500000, 16529) (300000, 16529) (200000, 16529)


# Create and fit the model. We use logistic regression.

In [18]:
# Keep 70% of the labelled rows for fitting and hold out the rest for
# validation; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y,
    train_size=0.7,
    random_state=1234,
)

# Hyper-parameters passed to LogisticRegression: lbfgs solver, C=0.13,
# and a high iteration cap.
params = dict(solver='lbfgs', C=0.13, max_iter=5000)

model = LogisticRegression(**params)
# Left as the last expression so the fitted estimator's repr is displayed.
model.fit(X_train, y_train)


LogisticRegression(C=0.13, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Predict probabilities

In [19]:
# predict_proba returns one column per class; column 1 is the probability of
# the second entry of model.classes_.
pred_y_val = model.predict_proba(X_val)[:, 1]
# The encoded test matrix is named `test` (created in the one-hot cell);
# `X_test` was never defined, so the original line raised a NameError on a
# fresh kernel.
pred_y_test = model.predict_proba(test)[:, 1]
# ROC AUC on the held-out validation rows.
print(auc(y_val, pred_y_val))

0.8042186190842686


# Make submission

In [20]:
# Pair each test id with its predicted probability and write the result
# without the DataFrame index.
submission = pd.DataFrame({'id': test_id, 'target': pred_y_test})
submission.to_csv('submission.csv', index=False)
