In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

import scipy.stats as stats
from scipy.stats import chi2_contingency

from datetime import date

In [161]:
# Read the three test-set Excel workbooks from the working directory and
# print each frame's (rows, columns) shape as a quick sanity check.
test_demo = pd.read_excel('test_demo.xlsx')
test_habits = pd.read_excel('test_habits.xlsx')
test_health = pd.read_excel('test_health.xlsx')
for loaded_frame in (test_demo, test_habits, test_health):
    print(loaded_frame.shape)

(225, 5)
(225, 6)
(225, 9)


In [162]:
# Join the three tables on their shared 'PatientID' column (pandas' default
# inner join), then display three random rows.
demo_with_habits = pd.merge(test_demo, test_habits, on='PatientID')
test_df = pd.merge(demo_with_habits, test_health, on='PatientID')
test_df.sample(3)

Unnamed: 0,PatientID,Name,Birth_Year,Region,Education,Smoking_Habit,Drinking_Habit,Exercise,Fruit_Habit,Water_Habit,Height,Weight,High_Cholesterol,Blood_Pressure,Mental_Health,Physical_Health,Checkup,Diabetes
186,1538,Mr. Antoine Lafarge,1976,London,University Complete (3 or more years),No,I consider myself a social drinker,No,Less than 1. I do not consume fruits every day.,More than half a liter but less than one liter,171,75,253,120,16,3,Not sure,Neither I nor my immediate family have diabetes.
200,1358,Mr. Nicholas Sanders,1963,East of England,University Complete (3 or more years),No,I usually consume alcohol every day,No,3 to 4 pieces of fruit in average,More than half a liter but less than one liter,178,81,208,134,20,3,More than 3 years,"I don't have diabetes, but I have direct famil..."
211,1801,Mr. Walter Lindenbaum,1959,London,High School Incomplete (10th to 11th grade),Yes,I consider myself a social drinker,No,Less than 1. I do not consume fruits every day.,More than half a liter but less than one liter,167,51,237,145,18,11,Less than 3 years but more than 1 year,"I don't have diabetes, but I have direct famil..."


In [163]:
# Use PatientID as the row index. Rows keep the order produced by the merge;
# no sort is applied.
test_df = test_df.set_index('PatientID')

In [164]:
# Derive Gender from the title at the start of Name: names beginning with
# 'Mr.' are labelled 'Male', every other name is labelled 'Female'.
has_mr_title = test_df['Name'].map(lambda full_name: full_name.startswith('Mr.'))
test_df['Gender'] = np.where(has_mr_title, 'Male', 'Female')

In [165]:
# Print column dtypes and non-null counts for the merged frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225 entries, 1343 to 1757
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              225 non-null    object
 1   Birth_Year        225 non-null    int64 
 2   Region            225 non-null    object
 3   Education         225 non-null    object
 4   Smoking_Habit     225 non-null    object
 5   Drinking_Habit    225 non-null    object
 6   Exercise          225 non-null    object
 7   Fruit_Habit       225 non-null    object
 8   Water_Habit       225 non-null    object
 9   Height            225 non-null    int64 
 10  Weight            225 non-null    int64 
 11  High_Cholesterol  225 non-null    int64 
 12  Blood_Pressure    225 non-null    int64 
 13  Mental_Health     225 non-null    int64 
 14  Physical_Health   225 non-null    int64 
 15  Checkup           225 non-null    object
 16  Diabetes          225 non-null    object
 17  Gender      

In [166]:
# Split the frame by dtype: numeric columns go to test_data_num, all other
# columns to test_data_cat. Both keep test_df's index and are independent copies.
test_data_num = test_df.select_dtypes(include='number').copy()
test_data_cat = test_df.select_dtypes(exclude='number').copy()

In [167]:
# Turn the Yes/No answers into 1/0 flags: 'Yes' becomes 1, anything else 0.
# Re-running this cell on already-encoded columns would set every value to 0.
for yes_no_column in ('Smoking_Habit', 'Exercise'):
    test_data_cat[yes_no_column] = test_data_cat[yes_no_column].eq('Yes').astype(int)

In [168]:
def age(birthdate, reference_year=None):
    """Return the age in whole years for a given birth year.

    Parameters
    ----------
    birthdate : int or array-like of int
        Birth year (a year number, not a full date).
    reference_year : int, optional
        Year the age is measured against. Defaults to the current calendar
        year, which makes the result depend on when the code is run; pass an
        explicit year to get reproducible output.

    Returns
    -------
    Same type as ``birthdate``
        ``reference_year - birthdate``.
    """
    if reference_year is None:
        reference_year = date.today().year
    return reference_year - birthdate

# Convert each birth year to an age, then relabel the column in place so that
# 'Age' stays in the position 'Birth_Year' occupied.
test_data_num['Birth_Year'] = test_data_num['Birth_Year'].map(age)
test_data_num = test_data_num.rename(columns={'Birth_Year': 'Age'})

In [169]:
# Rescale every numeric column to the [0, 1] range.
# NOTE(review): the scaler is fit on the test data itself. If the model was
# trained on features scaled with a training-set scaler, these ranges will not
# match it — confirm and reuse the training scaler if one was persisted.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(test_data_num)
test_data_scaled = pd.DataFrame(
    scaled_values,
    index=test_df.index,
    columns=test_data_num.columns,
)
test_data_scaled.head(3)

Unnamed: 0_level_0,Age,Height,Weight,High_Cholesterol,Blood_Pressure,Mental_Health,Physical_Health
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1343,0.418605,0.724138,0.690909,0.237762,0.735849,0.708333,0.066667
1727,0.511628,0.689655,0.690909,0.314685,0.339623,0.75,0.0
1828,0.232558,0.689655,0.654545,0.356643,0.339623,0.875,0.033333


In [170]:
# Remove the categorical columns that are not carried forward.
unused_columns = ['Name', 'Region', 'Education', 'Smoking_Habit', 'Water_Habit']
test_data_cat = test_data_cat.drop(columns=unused_columns)

In [171]:
# Define dicts with the key-value pairs to remap.
#
# Every remap below is written back with an explicit assignment. The previous
# `df[col].replace(..., inplace=True)` form is a chained in-place call, which
# does not reliably modify the parent frame on recent pandas (Copy-on-Write).

# Shorten the drinking-habit answers.
dict_drinking_habit = {"I consider myself a social drinker" : 'Social drinker',  'I usually consume alcohol every day' : 'Every day', "I do not consume any type of alcohol": 'Not consume'}
test_data_cat = test_data_cat.replace({"Drinking_Habit": dict_drinking_habit})

# Aggregate the fruit-habit answers into three buckets.
dict_fruit_habit = {
    'Less than 1. I do not consume fruits every day.': 'Not consume',
    '1 to 2 pieces of fruit in average': '1-4',
    '3 to 4 pieces of fruit in average': '1-4',
    '5 to 6 pieces of fruit in average': 'More than 5',
    'More than six pieces of fruit': 'More than 5',
}
test_data_cat["Fruit_Habit"] = test_data_cat["Fruit_Habit"].replace(dict_fruit_habit)

# Shorten two of the check-up answers; the remaining answers are left unchanged.
dict_checkup = {"Less than 3 years but more than 1 year" : '1-3 years',  'Less than three months' : 'Less than 3 months'}
test_data_cat = test_data_cat.replace({"Checkup": dict_checkup})

# Collapse both "I don't have diabetes" answers into 'No' and shorten the rest.
dict_diabetes_no = {
    'Neither I nor my immediate family have diabetes.': 'No',
    "I don't have diabetes, but I have direct family members who have diabetes.": 'No',
}
dict_diabetes = {"I do have diabetes" : 'Yes',  'I have/had pregnancy diabetes or borderline diabetes' : 'Pregnancy or Borderline Diabetes'}
test_data_cat["Diabetes"] = test_data_cat["Diabetes"].replace({**dict_diabetes_no, **dict_diabetes})

# Encode gender as a 0/1 flag (1 = 'Male') and rename the column to match.
test_data_cat['Gender'] = test_data_cat['Gender'].eq('Male').astype(int)
test_data_cat = test_data_cat.rename(columns={'Gender': 'Gender Male'})

In [172]:
df_ohc = test_data_cat.copy()
# One-hot encode the remaining string-valued columns ('Exercise' and
# 'Gender Male' are already 0/1 flags and are excluded), then wrap the dense
# result in a DataFrame labelled with the encoder's generated feature names.
#
# The dense-output switch is spelled `sparse_output` on newer scikit-learn
# releases and `sparse` on older ones; try the new name first and fall back.
# NOTE(review): the encoder is fit on the test data, so the generated columns
# depend on which categories occur in this file — confirm they match the
# columns the model was trained on.
try:
    ohc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohc = OneHotEncoder(sparse=False)
ohc_feat = ohc.fit_transform(df_ohc.drop(['Exercise', 'Gender Male'], axis=1))
ohc_feat_names = ohc.get_feature_names_out()
ohc_df = pd.DataFrame(ohc_feat, index=df_ohc.index, columns=ohc_feat_names)
ohc_df

Unnamed: 0_level_0,Drinking_Habit_Every day,Drinking_Habit_Not consume,Drinking_Habit_Social drinker,Fruit_Habit_1-4,Fruit_Habit_More than 5,Fruit_Habit_Not consume,Checkup_1-3 years,Checkup_Less than 3 months,Checkup_More than 3 years,Checkup_Not sure,Diabetes_No,Diabetes_Pregnancy or Borderline Diabetes,Diabetes_Yes
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1343,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1727,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1828,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1155,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1020,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1256,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1318,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1953,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [173]:
# Append the one-hot columns next to the existing flags, then remove the four
# original string columns they were derived from.
encoded_source_columns = ['Drinking_Habit', 'Fruit_Habit', 'Checkup', 'Diabetes']
test_data_cat = pd.concat([test_data_cat, ohc_df], axis=1).drop(columns=encoded_source_columns)
test_data_cat

Unnamed: 0_level_0,Exercise,Gender Male,Drinking_Habit_Every day,Drinking_Habit_Not consume,Drinking_Habit_Social drinker,Fruit_Habit_1-4,Fruit_Habit_More than 5,Fruit_Habit_Not consume,Checkup_1-3 years,Checkup_Less than 3 months,Checkup_More than 3 years,Checkup_Not sure,Diabetes_No,Diabetes_Pregnancy or Borderline Diabetes,Diabetes_Yes
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1343,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1727,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1828,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1155,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1020,0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1256,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1318,1,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1953,1,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [174]:
# Combine the scaled numeric features with the encoded categorical features,
# matching rows on PatientID.
test_data = pd.merge(test_data_scaled, test_data_cat, on='PatientID')
test_data

Unnamed: 0_level_0,Age,Height,Weight,High_Cholesterol,Blood_Pressure,Mental_Health,Physical_Health,Exercise,Gender Male,Drinking_Habit_Every day,...,Fruit_Habit_1-4,Fruit_Habit_More than 5,Fruit_Habit_Not consume,Checkup_1-3 years,Checkup_Less than 3 months,Checkup_More than 3 years,Checkup_Not sure,Diabetes_No,Diabetes_Pregnancy or Borderline Diabetes,Diabetes_Yes
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1343,0.418605,0.724138,0.690909,0.237762,0.735849,0.708333,0.066667,0,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1727,0.511628,0.689655,0.690909,0.314685,0.339623,0.750000,0.000000,0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1828,0.232558,0.689655,0.654545,0.356643,0.339623,0.875000,0.033333,1,1,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1155,0.465116,0.793103,0.818182,0.622378,0.132075,0.666667,0.000000,0,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1020,0.604651,0.724138,0.254545,0.188811,0.433962,0.666667,0.466667,0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0.418605,0.931034,0.854545,0.283217,0.292453,0.791667,0.133333,0,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1256,0.023256,0.103448,0.381818,0.181818,0.415094,0.916667,0.200000,0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1318,0.581395,0.758621,0.872727,0.160839,0.433962,0.708333,0.000000,1,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1953,0.255814,0.137931,0.054545,0.367133,0.415094,0.625000,0.000000,1,0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [1]:
import joblib

# Load the persisted objects from the working directory.
# `RF` is the name the prediction cell calls `.predict` on, so it has to be
# bound here for the notebook to run from a fresh kernel.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files that
# come from a trusted source.
RF = joblib.load('my_model.pkl')
# NOTE(review): the prediction cell's recorded error suggests a dict was once
# passed where an estimator was expected; this file may be that dict (e.g.
# cross-validation results) rather than a fitted model — confirm before use.
teste1 = joblib.load('my_model_cross_validation_teste.pkl')

In [176]:
# Display the assembled feature matrix that is passed to the model.
test_data

Unnamed: 0_level_0,Age,Height,Weight,High_Cholesterol,Blood_Pressure,Mental_Health,Physical_Health,Exercise,Gender Male,Drinking_Habit_Every day,...,Fruit_Habit_1-4,Fruit_Habit_More than 5,Fruit_Habit_Not consume,Checkup_1-3 years,Checkup_Less than 3 months,Checkup_More than 3 years,Checkup_Not sure,Diabetes_No,Diabetes_Pregnancy or Borderline Diabetes,Diabetes_Yes
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1343,0.418605,0.724138,0.690909,0.237762,0.735849,0.708333,0.066667,0,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1727,0.511628,0.689655,0.690909,0.314685,0.339623,0.750000,0.000000,0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1828,0.232558,0.689655,0.654545,0.356643,0.339623,0.875000,0.033333,1,1,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1155,0.465116,0.793103,0.818182,0.622378,0.132075,0.666667,0.000000,0,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1020,0.604651,0.724138,0.254545,0.188811,0.433962,0.666667,0.466667,0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0.418605,0.931034,0.854545,0.283217,0.292453,0.791667,0.133333,0,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1256,0.023256,0.103448,0.381818,0.181818,0.415094,0.916667,0.200000,0,0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1318,0.581395,0.758621,0.872727,0.160839,0.433962,0.708333,0.000000,1,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1953,0.255814,0.137931,0.054545,0.367133,0.415094,0.625000,0.000000,1,0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [2]:
# Predict with RF on the prepared features, index the result by patient id
# (index name 'ind') and write it to CSV.
# RF must be an object exposing .predict(); a plain dict raises AttributeError.
res = RF.predict(test_data)
predictions = (
    pd.DataFrame(res)
    .assign(ind=test_data.index)
    .set_index('ind')
)
predictions.to_csv("predictions_RF_v2.csv")



AttributeError: 'dict' object has no attribute 'predict'

In [178]:
# Display the predictions (column 0) indexed by patient id.
predictions

Unnamed: 0_level_0,0
ind,Unnamed: 1_level_1
1343,1
1727,1
1828,1
1155,1
1020,1
...,...
1000,0
1256,1
1318,0
1953,1
