In [1]:
import numpy as np # math calcuations and other matrix, vector processing
import pandas as pd # dataframe organization (similar to Excel)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler #replace by MinMaxScaler
from sklearn.model_selection import train_test_split # used this to check accuracy of different solvers 
# and check for overfitting

In [2]:
# Input files, given as relative paths.
train_csv = 'noshow_train.csv'
test_csv = 'noshow_test.csv'

# Columns selected from the CSVs as model inputs, in the order they are fed to the model.
feature_names = [
    'PatientId',
    'AppointmentID',
    'Gender',
    'Age',
    'Scholarship',
    'Hipertension',
    'Diabetes',
    'Alcoholism',
    'Handcap',
    'SMS_received',
]

# Target column. Kept as a one-element list, so selecting it from a
# DataFrame yields a single-column 2-D result rather than a 1-D one.
label_name = ['No-show']

In [3]:
def read_data(csv_file):
    """Load a CSV into a pandas DataFrame.

    csv_file is passed straight to pandas.read_csv with default options;
    the resulting DataFrame is returned unchanged.
    """
    return pd.read_csv(csv_file)

def get_features_labels(df, feature_names, label_name=None):
    """Split a DataFrame into a feature array and an optional label array.

    Parameters
    ----------
    df : DataFrame holding the feature columns (and the label columns, if requested).
    feature_names : column names to extract as features.
    label_name : column name(s) to extract as labels; when falsy
        (None, empty) no labels are extracted.

    Returns
    -------
    (features, labels) where features is df[feature_names].values and
    labels is df[label_name].values, or None when label_name is falsy.
    """
    features = df[feature_names].values
    # Unlabelled data (e.g. a test set) simply gets None in the label slot.
    labels = df[label_name].values if label_name else None
    return features, labels

def feature_normalization(X_train_features, X_test_features,
                          do_normalization=True):
    """Optionally scale the train and test features with a StandardScaler.

    When do_normalization is false, both inputs are returned untouched.
    Otherwise a StandardScaler is fitted on X_train_features only and then
    applied to both inputs, so the test features are transformed with the
    scaler learned from the training features.

    Returns a (train_features, test_features) tuple.
    """
    if not do_normalization:
        return X_train_features, X_test_features

    # fit() returns the scaler itself, so construction and fitting can be chained.
    fitted_scaler = StandardScaler().fit(X_train_features)
    return (fitted_scaler.transform(X_train_features),
            fitted_scaler.transform(X_test_features))
    
def train_and_predict(X_train_features, X_test_features, y_train_labels,
                     max_iter=1000, solver='lbfgs', penalty='l2', C=1):
    """Fit a LogisticRegression model and predict labels for the test features.

    Parameters
    ----------
    X_train_features : training feature matrix passed to fit().
    X_test_features : feature matrix passed to predict().
    y_train_labels : training labels; flattened to 1-D before fitting, so a
        single-column 2-D array (or a plain list) is accepted.
    max_iter, solver, C : forwarded unchanged to LogisticRegression.
    penalty : forwarded to LogisticRegression. Both spellings of
        "no regularization" -- the string 'none' and None -- are accepted and
        translated to the one the installed scikit-learn supports.

    Returns
    -------
    The result of predict() on X_test_features.
    """
    # scikit-learn 1.2 deprecated penalty='none' in favour of penalty=None and
    # 1.4 removed the string form; releases before 1.2 only accept the string.
    if penalty is None or penalty == 'none':
        import re
        import sklearn  # local import: only needed for this version check

        version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
        if version is not None:
            accepts_none = (int(version.group(1)), int(version.group(2))) >= (1, 2)
            penalty = None if accepts_none else 'none'

    my_logreg = LogisticRegression(max_iter=max_iter, solver=solver,
                                   penalty=penalty, C=C)
    # np.ravel handles ndarrays, lists and column vectors alike.
    my_logreg.fit(X_train_features, np.ravel(y_train_labels))
    return my_logreg.predict(X_test_features)

In [4]:
# Main pipeline, step 1: load the raw training and test tables.
df_train = read_data(train_csv)
df_test = read_data(test_csv)

# Encode 'Gender' numerically as floats: 'M' -> 0.0, 'F' -> 1.0.
df_train['Gender'] = df_train['Gender'].map({'M': 0, 'F': 1}).astype(float)
df_test['Gender'] = df_test['Gender'].map({'M': 0, 'F': 1}).astype(float)

# Step 2: extract the model inputs. No label column is requested for the
# test table, so only the training call asks for labels.
X_train_features, y_train_labels = get_features_labels(
    df_train, feature_names, label_name)
X_test_features, y_test_labels = get_features_labels(df_test, feature_names)

In [5]:
# Scale the train and test features (normalization switched on).
X_train_scaled_features, X_test_scaled_features = feature_normalization(
    X_train_features,
    X_test_features,
    do_normalization=True,
)

In [6]:
# Fit the model on the scaled training features and predict the test labels.
y_test_pred = train_and_predict(
    X_train_scaled_features,
    X_test_scaled_features,
    y_train_labels,
    max_iter=1000,
    solver='lbfgs',
    penalty='none',
    C=1,
)

In [9]:
# Write the predicted labels to disk as a single 'No-show' column
# (to_csv defaults are used, so the DataFrame index is written as well).
df_test_pred = pd.DataFrame(y_test_pred, columns=['No-show'])
df_test_pred.to_csv(path_or_buf='noshow_prediction.csv')