In [1]:
# Apply preprocessing to df - drop duplicates, impute 0, cap outliers, normalize
def preprocess(df):
    """Clean and standardize a feature frame.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. In 'bmi', 'blood_pressure', 'cardio_stress_test' and 'insulin_test',
         replace zeros with the mean of that column's non-zero values.
      3. Clip every column that has an entry in ``cap_n_floor`` to its
         [floor, cap] range; columns without an entry are left untouched.
      4. Standardize every column except the last one with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four imputed columns listed in step 2.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels are
        kept) with the transformations above applied.
    """
    # Drop exact duplicate rows. The explicit copy guarantees every write
    # below lands on our own frame, so no chained-assignment warnings need
    # to be silenced globally.
    df = df.drop_duplicates().copy()

    # Impute 0's with the mean of the non-zero values of the same column.
    # Assigning the result back (instead of a chained inplace replace) is what
    # makes the imputation actually stick under pandas Copy-on-Write.
    zero_imputed_cols = ['bmi', 'blood_pressure', 'cardio_stress_test', 'insulin_test']
    for col in zero_imputed_cols:
        nonzero_mean = df.loc[df[col] != 0, col].mean()
        df[col] = df[col].replace(0, nonzero_mean)

    # Cap and floor outliers: column -> [floor, cap]
    cap_n_floor = {'age': [18.0, 66.0],
                   'weight': [117.9, 247.0],
                   'bmi': [19.49, 53.38],
                   'blood_pressure': [44.0, 110.0],
                   'insulin_test': [28.6, 543.2],
                   'liver_stress_test': [0.1584, 1.765],
                   'cardio_stress_test': [40.0, 90.3],
                   'years_smoking': [0.0, 14.0]}

    for col in df.columns:
        if col not in cap_n_floor:
            # No bounds defined for this column (e.g. a target column):
            # leave it alone instead of raising KeyError.
            continue
        floor, cap = cap_n_floor[col]
        df[col] = df[col].clip(lower=floor, upper=cap)

    # Normalize numeric columns with StandardScaler
    # (subtract the mean and divide by the standard deviation).
    # NOTE(review): the last column is deliberately left unscaled, exactly as
    # before. When the target column has already been dropped by the caller,
    # that means the last *feature* column is not standardized - confirm this
    # matches how the model was trained.
    # NOTE(review): the scaler is fitted on the frame passed in; for inference
    # one would presumably reuse the scaler fitted on the training data.
    scaler = preprocessing.StandardScaler()
    feature_cols = list(df.columns[:-1])
    # Replace the columns wholesale rather than writing through .iloc, so
    # integer columns can become float without an incompatible-dtype set.
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    return df

In [2]:
# Generate predictions for given filename and saved model's path
def predict(filename, model):
    """Predict 'zeta_disease' for the rows of an Excel workbook.

    Parameters
    ----------
    filename : path to the Excel file read with ``pd.read_excel``.
    model : path to a pickled estimator exposing ``.predict``.

    Returns
    -------
    pandas.DataFrame
        The preprocessed feature frame (as returned by ``preprocess``) with
        the predictions stored in a 'zeta_disease' column.
    """
    raw = pd.read_excel(filename)

    # The target column may be present (empty placeholder) or absent in a
    # prediction file; drop it only if it exists instead of raising KeyError.
    features = preprocess(raw.drop(columns='zeta_disease', errors='ignore'))

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(model, 'rb') as model_file:
        estimator = pickle.load(model_file)

    # Predict first, then attach the predictions, so the estimator only ever
    # sees the feature columns.
    predictions = estimator.predict(features)
    features['zeta_disease'] = predictions

    return features

In [3]:
from sklearn.impute import KNNImputer
from sklearn import preprocessing
import pandas as pd
import pickle

# Inputs for this run: the workbook to score and the pickled model to use.
INPUT_WORKBOOK = 'recruiting_zeta-disease_prediction-data_take-home-challenge.xlsx'
MODEL_PATH = 'random_forest_model.sav'

# Score the workbook, show the result in the notebook and persist it as CSV.
predicted_zeta = predict(INPUT_WORKBOOK, MODEL_PATH)
display(predicted_zeta)
predicted_zeta.to_csv("predicted_zeta.csv")

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
0,-0.958114,-1.021011,0.776978,-0.695919,-0.986618,-0.800945,-0.629119,4,0
1,-0.690733,0.007345,0.157872,0.769174,0.147891,0.678757,-2.003665,6,0
2,-0.066845,-1.167919,-1.173206,-0.183137,-0.959605,0.87493,-0.946322,2,0
3,0.022282,0.998974,-0.321935,-0.402901,-0.243784,-0.442229,-0.100448,6,1
4,2.250454,0.521523,-0.724354,-1.208702,0.634109,1.262791,0.32249,9,0
5,0.913551,-2.159547,0.312649,2.161013,-1.283751,-1.355833,-0.840588,12,0
6,-1.314621,-1.461734,0.575769,-1.281957,-0.919087,0.235968,0.639693,3,0
7,-1.047241,-1.535188,-0.507667,-0.622665,-0.973111,-0.688846,-0.311916,7,0
8,0.111409,0.594977,-0.616011,-1.428466,-0.0547,-0.503883,-0.311916,6,0
9,-1.403748,0.521523,-1.34346,0.402901,-1.067654,1.262791,-1.263525,3,0
