In [1]:
(pyvenv-activate "~/courses/Machine Learning/")

## EDA



Import packages

    import os
    os.chdir("utils")

0 - bfa00fdd-902b-4828-8471-e3c89b341441

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import datetime as dt
    import seaborn as sns
    %matplotlib inline
    import os
    %load_ext autoreload
    %autoreload 2
    
    from implementations import reg_logistic_regression
    from helpers import sigmoid
    from cross_validation import accuracy, f1_score

1 - 9d1c6224-2e62-460e-a899-2d6fd053ceff

Read the training data

    train = pd.read_csv("../data/train.csv")

2 - 6438e20a-a329-4cf5-8509-109980b1271f

Replace the missing-value sentinel (-999) with np.nan, and try list-wise deletion (dropping every row that has at least one missing value)

    cc = train.replace(to_replace=-999, value=np.nan).dropna()
    cc.shape

3 - 1899f13a-0b81-4177-b3d2-8fcbf3473553

We are ultimately left with a sample of 68 thousand rows, which should be an adequate sample size for prediction.
We also want to investigate the balance of the outcome. First, subset and coerce the label vector to numeric

    y = np.where(np.asarray(cc.loc[:, 'Prediction']) == 's', 1, 0)
    y

4 - 868ee9ff-1a52-4155-891e-cc7473aa7c83

Plot the outcome variable

    unique, counts = np.unique(y, return_counts=True)
    plt.bar(unique, counts)

5 - 171961ac-19ab-45ba-8d8e-768bece6f00b

We have approximately 30 thousand labels that are classified as 1, and we see that the outcome is balanced between 1s
and 0s. Hence, we are able to evaluate the model on a balanced outcome.

Now we create the feature set. Drop the Prediction and the id columns to create the feature matrix

    c = ['Id', 'Prediction']
    X = cc.drop(columns=c)

6 - b893918c-0838-4d00-ba19-c4e8a09ecd2e

Standardize the feature set for prediction

    X_standardized = (X - X.mean(axis=0)) / X.std(axis=0)

7 - acc82dc3-2c44-4429-ac78-4ae082d8df5d

Augment the dataset with 1s, for the intercept of the model.

    X_model = pd.concat([
        pd.DataFrame(
            np.ones((X_standardized.shape[0], 1)), 
            columns=['beta0'], 
            index=X_standardized.index
        ),
        X_standardized
    ], axis=1)
    X_model.iloc[:5, :5]

8 - f11e70bc-952d-4f08-9db9-c1d1f0f254e0

To conduct the same data-preprocessing on the test set, we prepare a function for these operations

    def missing_data_handling(raw_sample, method="cc", imp_percentage=None):
        """
        Handle missing data for the raw sample.

        Missing values are encoded as -999 in the raw data. They are first
        converted to np.nan and then handled according to `method`.

        Parameters
        ----------
        raw_sample: pandas DataFrame
            Sample to be handled. It is not modified; a new frame is returned.
        method: String
            Missing data handler. Must be one of 'cc' (for complete case,
            i.e. every row with at least one missing value is dropped)
            or 'si' (for simple, median imputation).
        imp_percentage: Float
            Only used when method is 'si'. Columns whose proportion of
            missing values (between 0 and 1) is at most imp_percentage are
            imputed; columns above it are removed.
            NOTE(review): the earlier description did not say on which side
            of the threshold a column is kept -- confirm this reading.
            If None (as default), all columns with missing data are handled
            using imputation.

        Returns
        -------
        sample: pandas DataFrame
            Sample with missing data handled.

        Raises
        ------
        ValueError
            If method is not one of 'cc', 'si'.
        """
        # Raise explicitly instead of asserting: asserts vanish under `python -O`.
        if method not in ('cc', 'si'):
            raise ValueError("Parameter method must be one of 'cc', 'si'")

        sample = raw_sample.replace(to_replace=-999, value=np.nan)

        if method == 'cc':
            return sample.dropna()

        # method == 'si'
        if imp_percentage is not None:
            missing_share = sample.isna().mean(axis=0)
            sample = sample.loc[:, missing_share <= imp_percentage]

        # numeric_only skips label columns such as 'Prediction', which have
        # no median; fillna then only touches the columns that got one.
        return sample.fillna(sample.median(numeric_only=True))
    
    def conduct_data_preparation(raw_sample, missing_method="cc", 
                                 include_outcome=True):
        """
        Handle missing data, then split the sample into features and outcome.

        Parameters
        ----------
        raw_sample: pandas DataFrame
            Raw sample containing the 'Id' and 'Prediction' columns next to
            the features.
        missing_method: String
            Passed on to missing_data_handling as `method`.
        include_outcome: Bool
            If True, the 'Prediction' column is coded as 1 for 's' and 0
            otherwise and returned as y. If False, y is None.

        Returns
        -------
        X: pandas DataFrame
            The sample without the 'Id' and 'Prediction' columns.
        y: Numpy array or None
            Binary outcome aligned with the rows of X, or None.
        """
        ## Drop nas
        sample = missing_data_handling(
            raw_sample,
            method=missing_method
        )
        ## Subset outcome
        # Work on `sample`, not on the notebook-level `cc` frame, so the
        # function really prepares the data it was given.
        y = None
        if include_outcome:
            y = np.where(np.asarray(sample.loc[:, 'Prediction']) == 's', 1, 0)
        X = sample.drop(columns=['Id', 'Prediction'])

        return X, y
    
    
    def prepare_features(X, include_outcome=True):
        """
        Standardize the features and prepend an intercept column.

        Parameters
        ----------
        X: pandas DataFrame
            Numeric feature matrix.
        include_outcome: Bool
            Not used by this function; kept so existing calls keep working.

        Returns
        -------
        X_model: pandas DataFrame
            A 'beta0' column of ones followed by the columns of X, each
            centered on its mean and divided by its standard deviation.
            The row index of X is preserved.
        """
        ## Standardize sample
        scale = X.std(axis=0)
        # A constant column has a standard deviation of 0 and would become
        # NaN (0 / 0); dividing by 1 instead leaves it as a column of zeros.
        scale = scale.where(scale != 0, 1.0)
        X_standardized = (X - X.mean(axis=0)) / scale
        ## Make prediction data
        intercept = pd.DataFrame(
            np.ones((X_standardized.shape[0], 1)),
            columns=['beta0'],
            index=X_standardized.index
        )

        return pd.concat([intercept, X_standardized], axis=1)

9 - 882976c7-2592-418f-8c60-7318ac961094

    X, y = conduct_data_preparation(train)

10 - aa3e6491-46f0-4e41-892c-e0e70e3d9a41



### Exploring the distributions of the variables



    f = X.hist(figsize=(15, 15), bins = 100)

11 - 6eb3381b-958e-496e-8a23-6c74b4dd13cd

    corr = X.corr()
    f, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        corr, 
        xticklabels=corr.columns.values,
        yticklabels=corr.columns.values,
        ax=ax
    )

12 - f0bad7c0-7cd1-4f17-9b8d-c31a5d9435e3



## Training



Define a function similar to that in implementations, but constructed to work with dataframes

    def split_data(x, y, ratio, shuffle=True, seed=1):
        """
        Split data into train and test set.

        Parameters
        ----------
        x: pandas DataFrame
            Feature matrix; rows are selected by position.
        y: Numpy array
            Outcome aligned with the rows of x. Must support indexing with
            an integer index array.
        ratio: Float
            Share of the rows that goes to the training set; the training
            set gets int(n_rows * ratio) rows.
        shuffle: Bool
            If True, rows are assigned at random; otherwise the first rows
            form the training set and the remaining rows the test set.
        seed: Int
            Seed passed to np.random.seed before shuffling.

        Returns
        -------
        x_train, x_test, y_train, y_test
        """
        n_rows = x.shape[0]
        split = int(n_rows * ratio)

        if shuffle:
            np.random.seed(seed)
            # Draw ONE permutation and cut it in two. Drawing a second
            # permutation for the test set makes the sets overlap and
            # leaves some rows in neither.
            shuffled_idx = np.random.permutation(np.arange(n_rows))
            train_idx, test_idx = shuffled_idx[:split], shuffled_idx[split:]
        else:
            train_idx, test_idx = slice(None, split), slice(split, None)

        x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        return x_train, x_test, y_train, y_test

13 - 42ebf20f-fc31-4bf7-98c7-6ef6065ce1eb

Now, we don't want to conduct the data preparation on the full sample, as that would
cause information leakage and a biased estimate of out-of-sample (OOS) performance. Hence, we first split the training data

    X_train, X_test, y_train, y_test =  split_data(X, y, 0.9)
    X_train = prepare_features(X_train)
    X_test = prepare_features(X_test)
    X_train.iloc[:5, :5]

14 - 0982a7ae-f023-48a7-84d2-08342072c53a

    X_train.iloc[:5, :5]

15 - a13590ab-e3e6-4c6f-9490-0ed300c716e3

    # Start from an all-zero integer column vector with one entry per
    # column of the model matrix.
    n_weights = len(X_train.columns)
    w_start = np.zeros((n_weights, 1), dtype=int)

    w, loss = reg_logistic_regression(
        y=y_train.reshape(-1, 1),
        tx=np.asarray(X_train),
        lambda_=0.2,
        reg=1,
        initial_w=w_start,
        max_iters=100,
        gamma=0.00011,
        batch_size=50
    )
    loss

16 - 894ea345-0ec1-48bf-8e4e-b453f45b18c9

Evaluate the out-of-sample (OOS) performance

    predictions = np.rint(sigmoid(X_test @ w))
    predictions.head()

17 - 07d49219-409e-414a-925e-7a3edfd50adb

    # Flatten the predictions once so both metrics score the same 1-D vector.
    y_hat = np.array(predictions).ravel()
    acc = accuracy(y_targ=y_test, y_pred=y_hat)
    f1 = f1_score(y_targ=y_test, y_pred=y_hat)
    print(f"Accuracy: {acc}, F1-score: {f1}")

18 - 80ee957b-8c5c-4001-910f-19f537ad098a



## Testing



    test = pd.read_csv("../data/test.csv")

19 - 113dd582-331a-4817-9b8d-679698d8aa4b

Create prediction data with the test set

    # `create_prediction_data` is not defined or imported in this notebook;
    # run the same two preparation steps that were used for the training data.
    # NOTE(review): the default missing_method="cc" drops every test row that
    # contains a -999, so those rows get no prediction -- confirm this is wanted.
    X_test, _ = conduct_data_preparation(test, include_outcome=False)
    X_test = prepare_features(X_test)
    X_test.shape

20 - cc39f6b6-8872-492f-ace1-bd285da307e5

    predictions = np.rint(sigmoid(X_test @ w))

21 - b27fcf80-e99e-4ab2-9fd7-0cfd7a385b5b

Save the predictions to disk

    # Suffix the output file with today's date formatted as DDMMYYYY.
    date_stamp = dt.datetime.now().strftime("%d%m%Y")
    predictions.to_csv("../predictions/predictions_" + date_stamp)

22 - c8812eec-1026-4992-b606-be470e1772ca

