In [1]:
import numpy as np
from helpers import *
from implementations import *
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

In [2]:
def get_feature_names(data_path):
    try:
        file_path = data_path + "/x_train.csv"

        data = np.genfromtxt(file_path, delimiter=',', dtype=str, max_rows=1)
        
        # Extract the feature names
        feature_names = data[1:]  
        return feature_names.tolist()  
    except FileNotFoundError:
        return None  

In [3]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data('data/dataset/')
feature_names = get_feature_names('data/dataset/')

In [45]:
best_lambda=1e-5
best_gamma=3

In [4]:
print("x_train shape: {}".format(x_train.shape))
print("x_test shape: {}".format(x_test.shape))
print("y_train shape: {}".format(y_train.shape))

x_train shape: (328135, 321)
x_test shape: (109379, 321)
y_train shape: (328135,)


In [5]:
# Removing irrelevant columns and columns few answers

COLUMNS_TO_REMOVE = [
    "_STATE", "FMONTH", "IDATE", "IMONTH", "IDAY", "IYEAR", "DISPCODE", "SEQNO", "_PSU", 
    "CTELENUM", "PVTRESD1", "COLGHOUS", "STATERES", "CELLFON3", "LADULT", 
    "NUMADULT", "NUMMEN", "NUMWOMEN", "CTELNUM1", "CELLFON2", "CADULT", 
    "PVTRESD2", "CCLGHOUS", "CSTATE", "LANDLINE", "HHADULT", "POORHLTH", 
    "ASTHNOW", "DIABAGE2", "RENTHOM1", "NUMHHOL2", "NUMPHON2", "CPDEMO1", 
    "INTERNET", "PREGNANT", "SMOKDAY2", "STOPSMK2", "LASTSMK2", "AVEDRNK2", 
    "DRNK3GE5", "MAXDRNKS", "LMTJOIN3", "ARTHDIS2", "ARTHSOCL", "JOINPAIN", 
    "SEATBELT", "FLUSHOT6", "FLSHTMY2", "IMFVPLAC", "PNEUVAC3", "HIVTSTD3", 
    "WHRTST10", "PDIABTST", "PREDIAB1", "INSULIN", "BLDSUGAR", "FEETCHK2", 
    "DOCTDIAB", "CHKHEMO3", "FEETCHK", "EYEEXAM", "DIABEYE", "DIABEDU", 
    "PAINACT2", "QLMENTL2", "QLSTRES2", "QLHLTH2", "CAREGIV1", "CRGVREL1", 
    "CRGVLNG1", "CRGVHRS1", "CRGVPRB1", "CRGVPERS", "CRGVHOUS", "CRGVMST2", 
    "CRGVEXPT", "VIDFCLT2", "VIREDIF3", "VIPRFVS2", "VINOCRE2", "VIEYEXM2", 
    "VIINSUR2", "VICTRCT4", "VIGLUMA2", "VIMACDG2", "CIMEMLOS", "CDHOUSE", 
    "CDASSIST", "CDHELP", "CDSOCIAL", "CDDISCUS", "WTCHSALT", "LONGWTCH", 
    "DRADVISE", "ASTHMAGE", "ASATTACK", "ASERVIST", "ASDRVIST", "ASRCHKUP", 
    "ASACTLIM", "ASYMPTOM", "ASNOSLEP", "ASTHMED3", "ASINHALR", "HAREHAB1", 
    "STREHAB1", "CVDASPRN", "ASPUNSAF", "RLIVPAIN", "RDUCHART", "RDUCSTRK", 
    "ARTTODAY", "ARTHWGT", "ARTHEXER", "ARTHEDU", "TETANUS", "HPVADVC2", 
    "HPVADSHT", "SHINGLE2", "HADMAM", "HOWLONG", "HADPAP2", "LASTPAP2", 
    "HPVTEST", "HPLSTTST", "HADHYST2", "PROFEXAM", "LENGEXAM", "BLDSTOOL", 
    "LSTBLDS3", "HADSIGM3", "HADSGCO1", "LASTSIG3", "PCPSAAD2", "PCPSADI1", 
    "PCPSARE1", "PSATEST1", "PSATIME", "PCPSARS1", "PCPSADE1", "PCDMDECN", 
    "SCNTMNY1", "SCNTMEL1", "SCNTPAID", "SCNTWRK1", "SCNTLPAD", "SCNTLWK1", 
    "SXORIENT", "TRNSGNDR", "RCSGENDR", "RCSRLTN2", "CASTHDX2", "CASTHNO2", 
    "EMTSUPRT", "LSATISFY", "ADPLEASR", "ADDOWN", "ADSLEEP", "ADENERGY", 
    "ADEAT1", "ADFAIL", "ADTHINK", "ADMOVE", "MISTMNT", "ADANXEV", "QSTVER", 
    "QSTLANG", "MSCODE", "_STSTR", "_STRWT", "_RAWRAKE", "_WT2RAKE", "_CHISPNC", 
    "_CRACE1", "_CPRACE", "_CLLCPWT", "_DUALUSE", "_DUALCOR", "_LLCPWT", 
    "_RFHLTH", "_HCVU651", "_RFHYPE5", "_CHOLCHK", "_RFCHOL", "_LTASTH1", 
    "_CASTHM1", "_ASTHMS1", "_DRDXAR1", "_PRACE1", "_MRACE1", "_HISPANC", 
    "_RACE", "_RACEG21", "_RACEGR3", "_RACE_G1", "_AGE65YR",
    "_MISFRTN", "_MISVEGN", "_FRTRESP", "_VEGRESP", "_FRT16", "_VEG23", 
    "_FRUITEX", "_VEGETEX", "METVL11_", "METVL21_", "ACTIN11_", "ACTIN21_", 
    "PADUR1_", "PADUR2_", "PAFREQ1_", "PAFREQ2_", "_MINAC11", "_MINAC21", 
    "PAMISS1_", "PAMIN11_", "PAMIN21_", "PA1MIN_", "PAVIG11_", "PAVIG21_", 
    "PA1VIGM_", "_LMTACT1", "_LMTWRK1", "_LMTSCL1", "_RFSEAT2", "_RFSEAT3", 
    "_FLSHOT6", "_PNEUMO2", "_AIDTST3", "EXRACT11", "EXEROFT1", "EXERHMM1", 
    "EXRACT21", "EXEROFT2", "EXERHMM2", "PHYSHLTH", "MENTHLTH", "CHILDREN", 
    "WEIGHT2", "HEIGHT3", "ALCDAY5", "FRUITJU1", "FRUIT1", "FVBEANS", 
    "FVGREEN", "FVORANG", "VEGETAB1", "STRENGTH"
]

In [6]:
# Rules to encode categorical data

ENCODING_RULES = {
    'GENHLTH': {'valid_range': (1, 5), 'invalid_values': {7, 9, np.nan}},
    'HLTHPLN1': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'PERSDOC2': {'valid_range': (1, 3), 'invalid_values': {7, 9, np.nan}, 'no_value': 3},
    'MEDCOST': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHECKUP1': {'valid_range': (1, 4), 'invalid_values': {7, 9, np.nan}, 'no_value': 8},
    'BPHIGH4': {'valid_range': (1, 4), 'invalid_values': {7, 9, np.nan}, 'no_value': 3},
    'BPMEDS': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'BLOODCHO': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHOLCHK': {'valid_range': (1, 4), 'invalid_values': {7, 9, np.nan}},
    'TOLDHI2': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CVDSTRK3': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'ASTHMA3': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHCSCNCR': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHCOCNCR': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHCCOPD1': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'HAVARTH3': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'ADDEPEV2': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'CHCKIDNY': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'DIABETE3': {'valid_range': (1, 4), 'invalid_values': {7, 9, np.nan}, 'no_value': 3},
    'SEX': {'valid_range': (1, 2), 'invalid_values': {np.nan}},
    'MARITAL': {'valid_range': (1, 6), 'invalid_values': {7, 9, np.nan}},
    'EDUCA': {'valid_range': (1, 6), 'invalid_values': {7, 9, np.nan}},
    'VETERAN3': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'EMPLOY1': {'valid_range': (1, 8), 'invalid_values': {9, np.nan}},
    'INCOME2': {'valid_range': (1, 8), 'invalid_values': {77, 99, np.nan}},
    'QLACTLM2': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'USEEQUIP': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'BLIND': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'DECIDE': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'DIFFWALK': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'DIFFDRES': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'DIFFALON': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'SMOKE100': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'USENOW3': {'valid_range': (1, 3), 'invalid_values': {7, 9, np.nan}, 'no_value': 3},
    'EXERANY2': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    'HIVTST6': {'valid_range': (1, 2), 'invalid_values': {7, 9, np.nan}, 'no_value': 2},
    '_AGEG5YR': {'valid_range': (1, 13), 'invalid_values': {14, np.nan}},
    '_AGE_G': {'valid_range': (1, 6), 'invalid_values': {np.nan}},
    '_BMI5CAT': {'valid_range': (1, 4), 'invalid_values': {np.nan}},
    '_RFBMI5': {'valid_range': (1, 2), 'no_value': 1, 'invalid_values': {9, np.nan}},
    '_CHLDCNT': {'valid_range': (1, 6), 'no_value': 1, 'invalid_values': {9, np.nan}},
    '_EDUCAG': {'valid_range': (1, 4), 'no_value': 1, 'invalid_values': {9, np.nan}},
    '_INCOMG': {'valid_range': (1, 5), 'invalid_values': {9, np.nan}},
    '_SMOKER3': {'valid_range': (1, 4), 'no_value': 4, 'invalid_values': {9, np.nan}},
    '_RFSMOK3': {'valid_range': (1, 2), 'no_value': 1, 'invalid_values': {9, np.nan}},
    'DRNKANY5': {'valid_range': (1, 2), 'no_value': 2, 'invalid_values': {7, 9, np.nan}},
    '_RFBING5': {'valid_range': (1, 2), 'no_value': 1, 'invalid_values': {9, np.nan}},
    '_RFDRHV5': {'valid_range': (1, 2), 'no_value': 1, 'invalid_values': {9, np.nan}},
    '_FRTLT1': {'valid_range': (1, 2), 'invalid_values': {9, np.nan}},
    '_VEGLT1': {'valid_range': (1, 2), 'invalid_values': {9, np.nan}},
    '_TOTINDA': {'valid_range': (1, 2), 'no_value': 2, 'invalid_values': {9, np.nan}},
    '_PACAT1': {'valid_range': (1, 4), 'invalid_values': {9, np.nan}},
    '_PAINDX1': {'valid_range': (1, 2), 'no_value': 2, 'invalid_values': {9, np.nan}},
    '_PA150R2': {'valid_range': (1, 3), 'no_value': 3, 'invalid_values': {9, np.nan}},
    '_PA300R2': {'valid_range': (1, 3), 'no_value': 3, 'invalid_values': {9, np.nan}},
    '_PA30021': {'valid_range': (1, 2), 'invalid_values': {9, np.nan}},
    '_PASTRNG': {'valid_range': (1, 2), 'no_value': 2, 'invalid_values': {9, np.nan}},
    '_PAREC1': {'valid_range': (1, 4), 'no_value': 4, 'invalid_values': {9, np.nan}},
    '_PASTAE1': {'valid_range': (1, 4), 'no_value': 4, 'invalid_values': {9, np.nan}},
}

In [7]:
# Rules for numerical data

NUMERICAL_FEATURES_RULES = {
    '_AGE80': {'invalid_values': {np.nan}},
    'HTIN4': {'invalid_values': {np.nan}},
    'HTM4': {'invalid_values': {np.nan}},
    'WTKG3': {'invalid_values': {9999, np.nan}},
    '_BMI5': {'invalid_values': {np.nan}},
    'DROCDY3_': {'invalid_values': {900, np.nan}},
    '_DRNKWEK': {'invalid_values': {99900, np.nan}},
    'FTJUDA1_': {'invalid_values': {np.nan}},
    'FRUTDA1_': {'invalid_values': {np.nan}},
    'BEANDAY_': {'invalid_values': {np.nan}},
    'GRENDAY_': {'invalid_values': {np.nan}},
    'ORNGDAY_': {'invalid_values': {np.nan}},
    'VEGEDA1_': {'invalid_values': {np.nan}},
    '_FRUTSUM': {'invalid_values': {np.nan}},
    '_VEGESUM': {'invalid_values': {np.nan}},
    'MAXVO2_': {'invalid_values': {99900, np.nan}},
    'FC60_': {'invalid_values': {99900, np.nan}},
    'STRFREQ_': {'invalid_values': {99900, np.nan}},
}

In [8]:
# Checks if every feature is either in 'COLUMNS_TO_REMOVE', 'ENCODING_RULES' or 'NUMERICAL_FEATURES_RULE'

def check_all_columns_accounted_for(all_columns, columns_to_remove, encoding_rules, numerical_features_rules):
    # Create a set of all the features that are either to be removed, encoded, or are numerical
    all_accounted_columns = set(columns_to_remove) | set(encoding_rules.keys()) | set(numerical_features_rules.keys())
    
    # Create a set of all the columns from the dataset
    all_columns_set = set(all_columns)
    
    # Find the set of columns that are not accounted for
    unaccounted_columns = all_columns_set - all_accounted_columns
    
    if unaccounted_columns:
        print(f"The following columns are not accounted for: {', '.join(unaccounted_columns)}")
        return False
    else:
        print("All columns are accounted for.")
        return True

check_all_columns_accounted_for(feature_names, COLUMNS_TO_REMOVE, ENCODING_RULES, NUMERICAL_FEATURES_RULES)

All columns are accounted for.


True

In [9]:
def clean_dataset(x_train, x_test, feature_names, nan_threshold=0.5):
    # ------- Remove specified columns -------
    # Find the indices of the columns to remove based on their names
    indices_to_remove = []
    for i, feature_name in enumerate(feature_names):
        if feature_name in COLUMNS_TO_REMOVE:
            indices_to_remove.append(i)

    # Remove the specified columns from the training and test datasets
    x_train = np.delete(x_train, indices_to_remove, axis=1)
    x_test = np.delete(x_test, indices_to_remove, axis=1)

    # Update the feature names list to exclude the removed columns
    updated_feature_names = []
    for i, feature_name in enumerate(feature_names):
        if i not in indices_to_remove:
            updated_feature_names.append(feature_name)

    # ------- Remove columns with too many NaN values -------
    # Calculate the percentage of NaN values in each column of the training data
    nan_percentages = np.mean(np.isnan(x_train), axis=0)

    # Identify columns with less than the threshold percentage of NaN values
    valid_column_indices = nan_percentages < nan_threshold

    # Keep only the valid columns in the training and test datasets
    x_train = x_train[:, valid_column_indices]
    x_test = x_test[:, valid_column_indices]

    # Update the feature names list to include only those corresponding to valid columns
    filtered_feature_names = []
    for i, feature in enumerate(updated_feature_names):
        if valid_column_indices[i]:
            filtered_feature_names.append(feature)

    return x_train, x_test, filtered_feature_names

In [10]:
def encode_data(x_train, x_test, y_train, feature_names, invalid_percentage_threshold=0.5):
    encoded_feature_names = []
    new_x_train = []
    new_x_test = []
    train_missingness_matrix = []

    for i, feature_name in enumerate(feature_names):
        rules = ENCODING_RULES.get(feature_name)

        if rules:
            valid_range = rules.get('valid_range', (np.nanmin(x_train[:, i]), np.nanmax(x_train[:, i])))
            invalid_values = rules.get('invalid_values', set())
            no_value = rules.get('no_value')
            
            # Creates arrays to store the encoded features
            encoded_train_column = []
            encoded_test_column = []
            
            # Finding and storing missing data
            invalid_non_nan = {x for x in invalid_values if not np.isnan(x)}
            is_invalid_train = np.in1d(x_train[:, i], list(invalid_non_nan))
            is_nan_train = np.isnan(x_train[:, i])
            train_missingness = np.logical_or(is_invalid_train, is_nan_train).astype(int)

            is_invalid_test = np.in1d(x_test[:, i], list(invalid_non_nan))
            is_nan_test = np.isnan(x_test[:, i])
            test_missingness = np.logical_or(is_invalid_test, is_nan_test).astype(int)
            
            train_missingness_matrix.append(train_missingness)  # Storing the missingness data
            
            # Updates the arrays storing the 'missing column' for the i-th feature 
            encoded_train_column.append(train_missingness)
            encoded_test_column.append(test_missingness)
            # Updates the new list of (encoded) features adding the '{feature_name}_missing' column
            encoded_feature_names.append(f'{feature_name}_missing')
            
            # Changing values according to the rules
            for val in range(valid_range[0], valid_range[1] + 1):
                if val not in invalid_values and val != no_value:
                    train_column = (x_train[:, i] == val).astype(int)
                    test_column = (x_test[:, i] == val).astype(int)

                    if no_value is not None:
                        train_column[x_train[:, i] == no_value] = 0
                        test_column[x_test[:, i] == no_value] = 0
                    
                    # Updates the arrays storing the val-column for the i-th feature 
                    encoded_train_column.append(train_column)
                    encoded_test_column.append(test_column)
                    # Updates the new list of (encoded) features adding the '{feature_name}_{val}' column
                    encoded_feature_names.append(f'{feature_name}_{val}')
            
            # Combine the encoding of the i-th  feature with the ones already made for previous features
            new_x_train.append(np.stack(encoded_train_column, axis=1))
            new_x_test.append(np.stack(encoded_test_column, axis=1))
        
        # If there are no rules simply attaches the normal column
        else:
            new_x_train.append(x_train[:, i][:, np.newaxis])
            new_x_test.append(x_test[:, i][:, np.newaxis])
            encoded_feature_names.append(feature_name)

    x_train_encoded = np.concatenate(new_x_train, axis=1)
    x_test_encoded = np.concatenate(new_x_test, axis=1)

    # Calculate the percentage of invalid values for each row
    invalid_counts = np.sum(np.array(train_missingness_matrix).T, axis=1)
    total_features = len(feature_names)
    invalid_percentage = invalid_counts / total_features

    # Removing rows with more than a certain percentage of invalid values
    rows_to_remove = np.where(invalid_percentage > invalid_percentage_threshold)
    x_train_encoded = np.delete(x_train_encoded, rows_to_remove, axis=0)
    y_train = np.delete(y_train, rows_to_remove, axis=0)

    return x_train_encoded, x_test_encoded, y_train, encoded_feature_names

In [11]:
def clean_and_standardize_numerical_data(x_train, x_test, y_train, feature_names):
    # Identify invalid values and replace them with NaN
    for i, feature_name in enumerate(feature_names):
        rules = NUMERICAL_FEATURES_RULES.get(feature_name)
        if rules:
            invalid_values = rules['invalid_values']
            for val in invalid_values:
                x_train[x_train[:, i] == val, i] = np.nan
                x_test[x_test[:, i] == val, i] = np.nan

    # Calculate the mean of valid values in the training set for each feature
    valid_numerical_features_indices = [
        i for i, feature_name in enumerate(feature_names) if feature_name in NUMERICAL_FEATURES_RULES
    ]
    x_train_numerical = x_train[:, valid_numerical_features_indices]
    means = np.nanmean(x_train_numerical, axis=0)

    # Replace NaNs with the calculated means
    for i in range(x_train_numerical.shape[1]):
        nan_indices_train = np.isnan(x_train_numerical[:, i])
        x_train_numerical[nan_indices_train, i] = means[i]

        nan_indices_test = np.isnan(x_test[:, valid_numerical_features_indices[i]])
        x_test[nan_indices_test, valid_numerical_features_indices[i]] = means[i]

    # Standardize the data (zero mean and unit variance)
    stds = np.nanstd(x_train_numerical, axis=0, ddof=1)  # ddof=1 is used to compute sample standard deviation
    stds[stds == 0] = 1  # Avoid division by zero
    
    x_train_numerical = (x_train_numerical - means) / stds
    x_train[:, valid_numerical_features_indices] = x_train_numerical

    # Apply the same transformation to the test set
    x_test_numerical = x_test[:, valid_numerical_features_indices]
    x_test_numerical = (x_test_numerical - means) / stds
    x_test[:, valid_numerical_features_indices] = x_test_numerical

    return x_train, x_test, y_train

In [12]:
x_train, x_test, feature_names = clean_dataset(x_train, x_test, feature_names, nan_threshold=0.5)
x_train, x_test, y_train, feature_names = encode_data(x_train, x_test, y_train, feature_names, invalid_percentage_threshold=0.2)
x_train, x_test, y_train = clean_and_standardize_numerical_data(x_train, x_test, y_train, feature_names)
#x_train, y_train = remove_nan_rows(x_train, y_train)

In [13]:
# Convert -1 labels to 0
y_train[y_train == -1] = 0

In [14]:
print("x_train shape: {}".format(x_train.shape))
print("x_test shape: {}".format(x_test.shape))
print("y_train shape: {}".format(y_train.shape))

x_train shape: (312052, 219)
x_test shape: (109379, 219)
y_train shape: (312052,)


In [15]:
# Function to predict the labels for the test data
def predict_labels(w, data):
    """Generates class predictions given weights, and a test data matrix"""
    y_pred = sigmoid(data.dot(w))
    y_pred[np.where(y_pred <= 0.45)] = -1
    y_pred[np.where(y_pred > 0.45)] = 1
    return y_pred

In [16]:
def compute_f1(x_test, y_test, w):
    y_pred = np.array(predict_labels(w, x_test))
    y_true = np.array(y_test)

    # Calculating True Positives, False Positives, and False Negatives
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == -1) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == -1))

    # Calculating Precision and Recall
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0

    # Calculating F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return f1

In [46]:
# Regularized logistic regression with optimal hyperparameters
np.random.seed(13)
initial_w = np.random.randn(x_train.shape[1]) * 0.01
max_iters = 1000
w, _ = reg_logistic_regression(y_train, x_train, best_lambda, initial_w, max_iters, best_gamma, threshold=1e-5)

In [33]:
f1 = compute_f1(x_train, y_train, w)
print(f'F1 Score: {f1}')

F1 Score: 0.0863482875060299


In [43]:
# Generate predictions and save ouput in csv format for submission:
OUTPUT_PATH = 'data/submission.csv'

y_pred = predict_labels(w, x_test)

#create_csv_submission(test_ids, y_pred, OUTPUT_PATH)
create_csv_submission(test_ids, y_pred, OUTPUT_PATH)