In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell out to list the contents of the dataset's input directory.
! ls ../input/dont-overfit-ii

In [None]:
# Load the two CSV files from the dataset directory into DataFrames.
# `train` carries the `target` label column used below; `test` is what we
# predict on for the submission file.
train = pd.read_csv("../input/dont-overfit-ii/train.csv")
test = pd.read_csv("../input/dont-overfit-ii/test.csv")

In [None]:
# Peek at the first rows of the training frame to see the column layout.
train.head()

## EDA
* Right off the bat, we are looking at only 250 entries; with this little data, we will almost certainly overfit.
* Overfitting is the result of an overcomplicated model; hence, we'll apply some form of feature selection to reduce model complexity and combat overfitting that way. We'll also try data augmentation, if possible, to get a few more samples, since more data helps against overfitting.

In [None]:
# Summary of the training frame: row count, column dtypes and memory usage.
train.info()

In [None]:
# Per-column count of missing values.
train.isnull().sum() # doesn't seem to have any missing values

In [None]:
# Collapse the per-column check into a single boolean for the whole frame.
train.isnull().any().any() # no missing values.

In [None]:
# Histograms of 25 columns (positions 2..26, i.e. skipping the first two
# columns — presumably `id` and `target`) laid out on a 5x5 grid.
plt.figure(figsize=(24, 24))
first_features = train.columns[2:27]
for position, feature in enumerate(first_features, start=1):
    plt.subplot(5, 5, position)
    plt.hist(train[feature])
    plt.title(f'feature name:{feature}')

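From the plots above, the 25 features shown look roughly normally distributed, so it seems safe to assume the same holds for the rest of the features (the frame has 302 columns, i.e. 300 features plus `id` and `target`).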

In [None]:
import seaborn as sns
sns.set_style('whitegrid')
# Bar chart of how many training rows fall into each `target` class.
sns.countplot(x='target',data=train, palette='RdBu_r')

## Data Imbalance
From the count plot above, the two target classes are imbalanced. Apply SMOTE later?

In [None]:
# Correlation of every column with the target. Compute the correlation matrix
# once and reuse it for both the bar chart and the `corr` series that the
# following cell filters.
corr = train.corr()['target']
# Trailing semicolon keeps the Axes repr out of the cell output.
corr.plot(kind='bar', style='k--', label='Series', grid=True, figsize=(20, 4));

In [None]:
corr_df = corr.to_frame()
# Keep columns whose correlation with the target is strong in either
# direction (|r| > 0.2): a negative correlation is as informative as a
# positive one. The `target` row itself (r = 1 by definition) is dropped so
# only real features are listed.
corr_df[corr_df['target'].abs() > 0.2].drop(index='target', errors='ignore')

Features 18 and 128 seem to be relatively strongly correlated with the target label (correlation above 0.2).

## Baseline model building

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,classification_report,roc_auc_score,roc_curve,auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2 (RocCurveDisplay replaces
    # it). Nothing in this notebook calls it, so keep the cell runnable on
    # newer images instead of failing at import time.
    plot_roc_curve = None
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection
from sklearn import preprocessing

# Feature matrices: every column except the row id (and, for train, the label).
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']
X_test = test.drop(['id'], axis=1)

# Cross-validation splitters; both are driven by the same fold count.
n_fold = 20
folds = model_selection.StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
repeated_folds = model_selection.RepeatedStratifiedKFold(n_splits=n_fold, n_repeats=20, random_state=42)

from sklearn.model_selection import train_test_split
# Stratified hold-out split (train_test_split's default keeps 25% for
# validation) so both parts preserve the class ratio of y_train.
train_x,valid_x,train_y,valid_y = train_test_split(X_train, y_train, random_state = 96, stratify=y_train)

# Fit the scaler on the training split ONLY, then apply that same fitted
# transform to every other split. Leakage comes from *fitting* on validation
# or test rows, not from transforming them: any data fed to a model trained on
# train_x (valid_x here, X_test at submission time) must go through
# scaler.transform first.
scaler = preprocessing.StandardScaler()
train_x = scaler.fit_transform(train_x)
valid_x = scaler.transform(valid_x)

In [None]:
#Model train and eval function
def model_train_eval(algorithm,dtrain_X,dtrain_Y,dtest_X,dtest_Y,cols=None):
    """Fit a classifier, print hold-out metrics, and return its ROC-AUC.

    Parameters
    ----------
    algorithm : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    dtrain_X, dtrain_Y
        Features and labels used to fit ``algorithm``.
    dtest_X, dtest_Y
        Features and true labels used for evaluation.
    cols : optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    float
        ROC-AUC on the evaluation data, computed from the second column of
        ``predict_proba`` (the probability of the second class).
    """
    algorithm.fit(dtrain_X, dtrain_Y)
    predictions = algorithm.predict(dtest_X)

    # ROC-AUC is a ranking metric: score it on the predicted probabilities,
    # not on thresholded labels, which reduce the ROC curve to one point.
    prediction_probabilities = algorithm.predict_proba(dtest_X)[:,1]
    auc_score = roc_auc_score(dtest_Y, prediction_probabilities)

    print (algorithm)
    print ("ROC-AUC score : ", auc_score)
    # classification_report expects (y_true, y_pred) in that order; swapping
    # them exchanges precision and recall in the printed table.
    print ("classification report :\n",classification_report(dtest_Y, predictions))
    return auc_score

In [None]:
# A lot of people are using logreg currently, let's try
# L1 penalty with a small C means strong regularisation that can zero out
# coefficients. random_state is pinned because the liblinear solver shuffles
# the data, so without it results can differ from run to run.
model = LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear', random_state=42)
model_train_eval(model,train_x,train_y,valid_x,valid_y)

In [None]:
from sklearn.model_selection import learning_curve

def plot_training_curves(X_train, y_train, model):
    """Draw a learning curve (training vs. cross-validated score) for ``model``.

    ``learning_curve`` is run with 10 training-set fractions from 10% to 100%
    and 10-fold cross-validation. For each curve the mean score per training
    size is drawn as a line, with +/- one standard deviation across folds as a
    shaded band. The figure is shown immediately; nothing is returned.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator=model,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        n_jobs=1,
    )

    # (scores, line style) for the two curves, drawn in this order.
    curves = (
        (fit_scores, dict(color='blue', marker='o',
                          markersize=5, label='Training accuracy')),
        (cv_scores, dict(color='green', linestyle='--',
                         marker='s', markersize=5,
                         label='Validation accuracy')),
    )
    for scores, line_style in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, **line_style)
        plt.fill_between(sizes,
                         centre + spread,
                         centre - spread,
                         alpha=0.15, color=line_style['color'])

    plt.grid()
    plt.xlabel('Number of training examples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.4, 0.9])
    plt.tight_layout()
    plt.show()

In [None]:
# `model` was trained and evaluated on standardized features, so the learning
# curve must see standardized features too. Wrapping the scaler and the model
# in a pipeline lets each CV fold fit its own StandardScaler on that fold's
# training rows only, so nothing leaks into the validation folds.
from sklearn.pipeline import make_pipeline

plot_training_curves(X_train, y_train, make_pipeline(preprocessing.StandardScaler(), model))

There is still a pretty big gap between the training and validation curves, so the model is still overfitting.

In [None]:
# Peek at the first rows of the test frame before building the submission.
test.head()

In [None]:
# Print the current working directory.
! pwd

In [None]:
def save_submission_file(model, filename="submission.csv", scaler=None):
    """Predict on the global ``test`` frame and write a submission CSV.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``.
    filename : str
        File name created inside ``/kaggle/working``.
    scaler : fitted transformer, optional
        If given, ``scaler.transform`` is applied to the test features before
        predicting. Pass the scaler that was fitted on the training split
        whenever ``model`` was trained on scaled features; otherwise the model
        sees inputs on a different scale than it was trained on.

    Returns
    -------
    pandas.DataFrame
        The submission table with columns ``id`` and ``target``.
    """
    holdout_data = test.drop(['id'], axis=1)
    if scaler is not None:
        holdout_data = scaler.transform(holdout_data)
    # NOTE(review): hard 0/1 labels are submitted. If the leaderboard metric is
    # ROC-AUC, predict_proba(...)[:, 1] would likely score better — confirm.
    predictions = model.predict(holdout_data)

    submission = pd.DataFrame({"id": test["id"],
                               "target": predictions})

    submission.to_csv(os.path.join('/kaggle/working',filename),index=False)
    return submission

# `model` was fitted on features standardized by `scaler`, so the test
# features must go through the same fitted transform.
save_submission_file(model, scaler=scaler)

In [None]:
# List the contents of the current directory.
! ls