# Processing Datasets

In [1]:
# Imports
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint
from scipy.stats import reciprocal
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

try:
    from sklearn.preprocessing import OrdinalEncoder
except ImportError:
    from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20

try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20


In [2]:
# Functions

def write_to_csv_file(line, mode):
    """Write one ';'-delimited row to ``ML_results.csv`` in the working directory.

    Parameters
    ----------
    line : iterable
        The cell values of the row, in column order.
    mode : str
        File mode passed to ``open`` ('w' to start a new file, 'a' to append).
    """
    # newline='' is what the csv module requires: without it the writer's own
    # '\r\n' terminator is translated again on Windows, giving blank rows.
    with open("ML_results.csv", mode, newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow(line)
        
def write_to_models_file(line, mode):
    """Write ``line`` followed by a newline to ``Best models.txt``.

    ``mode`` is handed to ``open`` unchanged ('w' truncates, 'a' appends).
    """
    with open("Best models.txt", mode) as models_file:
        record = line  + '\n'
        models_file.write(record)

def write_features_to_csv_file(blanckDict, user, algorithm, mse, importances):
    """Append one result row (run metadata plus per-feature scores) to the results CSV.

    Parameters
    ----------
    blanckDict : dict
        Template row mapping every column name to its default value. It is
        copied, never modified. The parameter keeps its original spelling so
        existing keyword callers keep working.
    user : object
        Value stored in the 'user' column.
    algorithm : str
        Value stored in the 'algorithm' column.
    mse : float
        Value stored in the 'neg_mean_squared_error' column. The callers in
        this notebook pass the test-set RMSE here.
    importances : iterable of (score, feature_name)
        Each score is stored under the key ``'feature_' + feature_name``.
    """
    # Start from the template that was passed in. The previous body read the
    # module-level ``blankDict`` and silently ignored this argument.
    dictImp = blanckDict.copy()

    dictImp['user'] = user
    dictImp['algorithm'] = algorithm
    dictImp['neg_mean_squared_error'] = mse

    # A feature name missing from the template adds a new key at the end of
    # the row instead of filling an existing column.
    for imp, feat in importances:
        dictImp["feature_"+feat]=imp

    write_to_csv_file(dictImp.values(), 'a')

## Load DF

In [3]:
# Raw blood-pressure dataset; the file uses ';' as its field separator.
df = pd.read_csv(
    "datasets/bloodpressure/general_database.csv",
    sep=";",
)

## Prepare DF for visibility (Cat to num)

In [4]:
# Distinct user ids, in order of first appearance in the raw frame.
users = df["user"].unique().tolist()
users

[1, 2, 3, 4, 5, 6, 7]

In [5]:
# Working copy of the full dataset, plus one frame per user. Each per-user
# frame has a fresh 0..n-1 index and no "user" column.
df_general = df.copy()
df_users = [
    df.loc[df["user"] == uid].drop("user", axis=1).reset_index(drop=True)
    for uid in users
]


In [6]:
# Numeric part of every frame: everything except the four categorical columns.
categorical_cols = ["birthcountry","homecountry","educationlevel","gender"]

df_users_num = [frame.drop(categorical_cols, axis=1) for frame in df_users]

df_general_num = df_general.drop(categorical_cols, axis=1)

In [7]:
# Categorical part of every frame: only the four categorical columns.
categorical_cols = ["birthcountry","homecountry","educationlevel","gender"]

df_users_cat = [frame[categorical_cols] for frame in df_users]

df_general_cat = df_general[categorical_cols]

In [8]:
# Ordinal-encode the categorical columns. The same encoder object is refitted
# on every frame, so the integer codes are assigned per frame.
cat_encoder = OrdinalEncoder()

df_users_cat_encoder = [
    cat_encoder.fit_transform(user_categories)
    for user_categories in df_users_cat
]

# Encoded categories of the first user on their own
# (not referenced again in the visible part of the notebook).
df_1_cat_encoder = cat_encoder.fit_transform(df_users_cat[0])

# Fitted last, so cat_encoder ends up fitted on the general frame.
df_general_cat_encoder = cat_encoder.fit_transform(df_general_cat)

In [9]:
# Wrap the encoded arrays back into DataFrames with the original column names.
categorical_cols = ["birthcountry","homecountry","educationlevel","gender"]

df_users_cat_transf = [
    pd.DataFrame(encoded, columns = categorical_cols)
    for encoded in df_users_cat_encoder
]

df_general_cat_transf = pd.DataFrame(df_general_cat_encoder, columns = categorical_cols)


In [10]:
# Put the numeric and encoded-categorical halves side by side again
# (numeric columns first, categorical columns last), matching rows on index.
df_users_transf = [
    pd.concat([numeric_part, categorical_part], axis=1, join='inner')
    for numeric_part, categorical_part in zip(df_users_num, df_users_cat_transf)
]

df_general_transf = pd.concat(
    [df_general_num, df_general_cat_transf],
    axis=1,
    join='inner',
)

## Create plots for users and general

In [11]:
# Disabled cell: the code is held in a string literal, so nothing is executed
# and `plots_path` is NOT defined. The quoted code would create the "plots/"
# output directory if it does not exist.
'''plots_path = "plots/"
if not os.path.exists(plots_path):
    os.makedirs(plots_path)'''

'plots_path = "plots/"\nif not os.path.exists(plots_path):\n    os.makedirs(plots_path)'

In [12]:
# Disabled cell (string literal, not executed). The quoted code would save one
# histogram PDF per user under `plots_path`.
'''for n in range(len(users)):
    df_users_transf[n].hist(bins=50, grid=False, figsize=(50,40))
    plt.savefig(plots_path + "dataset_" + str(users[n]) + ".pdf")
    plt.close()'''

'for n in range(len(users)):\n    df_users_transf[n].hist(bins=50, grid=False, figsize=(50,40))\n    plt.savefig(plots_path + "dataset_" + str(users[n]) + ".pdf")\n    plt.close()'

In [13]:
# Disabled cell (string literal, not executed). The quoted code would save the
# histogram PDF of the general dataset under `plots_path`.
'''df_general_transf.hist(bins=50, grid=False, figsize=(50,40))
plt.savefig(plots_path + "general_dataset.pdf")
plt.close()'''

'df_general_transf.hist(bins=50, grid=False, figsize=(50,40))\nplt.savefig(plots_path + "general_dataset.pdf")\nplt.close()'

## Cleaning features and view

In [14]:
# User 1: prune the feature set.
frame = df_users_transf[0]

# Whole feature families removed for this user (matched by name fragment).
for family in ('alcohol_', 'smoke_', 'diet_', 'pollution_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

# Individual questionnaire items.
frame = frame.drop(["depression_q2", "depression_q3", "depression_q4", "depression_q5", "depression_q6", "depression_q9", "depression_q10"], axis=1)
frame = frame.drop(["physical_q1", "physical_q2"], axis=1)
frame = frame.drop(["social_q1", "social_q3","social_q4"], axis=1)
frame = frame.drop(["stress_q3", "stress_q4", "stress_q5", "stress_q8"], axis=1)

# Reference list of the medication columns:
#['med_Paracetamol', 'med_Ibuprofeno', 'med_Naproxeno', 'med_Dexketoprofeno', 'med_Acido acetil salicilico', 'med_Metamizol', 'med_Morfina',  'med_Fentanil', 'med_Tramadol', 'med_Nitroglicerina', 'med_Verapamilo', 'med_Nifedipino', 'med_Porpanolol', 'med_Atenolol', 'med_Labetalol',  'med_Amiodarona', 'med_Lidocaina', 'med_Adenosina', 'med_Digoxina', 'med_Sulfato de magnesio', 'med_Insulina', 'med_Glucagon',  'med_Lercanidipino', 'med_Acenocumarol', 'med_Alprazolam', 'med_Midazolam', 'med_Diazepam', 'med_Lorazepam', 'med_Lormetazepam',  'med_Clorazepato dipotasico', 'med_Sertralina', 'med_Zolpidem', 'med_Dimenhidrinato', 'med_Doxilamina', 'med_Difenhidramina',  'med_Dimenhidrinato', 'med_Hidroxicina', 'med_Cetirizina', 'med_Hidroclorotiazida']
# Every listed medication except med_Hidroclorotiazida is removed here.
frame = frame.drop([
    'med_Paracetamol', 'med_Ibuprofeno', 'med_Naproxeno', 'med_Dexketoprofeno',
    'med_Acido acetil salicilico', 'med_Metamizol', 'med_Morfina', 'med_Fentanil',
    'med_Tramadol', 'med_Nitroglicerina', 'med_Verapamilo', 'med_Nifedipino',
    'med_Porpanolol', 'med_Atenolol', 'med_Labetalol', 'med_Amiodarona',
    'med_Lidocaina', 'med_Adenosina', 'med_Digoxina', 'med_Sulfato de magnesio',
    'med_Insulina', 'med_Glucagon', 'med_Lercanidipino', 'med_Acenocumarol',
    'med_Alprazolam', 'med_Midazolam', 'med_Diazepam', 'med_Lorazepam',
    'med_Lormetazepam', 'med_Clorazepato dipotasico', 'med_Sertralina', 'med_Zolpidem',
    'med_Dimenhidrinato', 'med_Doxilamina', 'med_Difenhidramina', 'med_Dimenhidrinato',
    'med_Hidroxicina', 'med_Cetirizina',
], axis=1)

df_users_transf[0] = frame


In [15]:
# User 2: prune the feature set.
frame = df_users_transf[1]

# Whole feature families removed for this user.
for family in ('alcohol_', 'smoke_', 'pollution_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

frame = frame.drop(["depression_q9","depression_q10"], axis=1)

# Families reduced to a single surviving question.
for family, kept in (('diet_', ["diet_q4"]), ('social_', ["social_q4"])):
    removable = frame.filter(regex=family).columns.drop(kept)
    frame = frame.drop(list(removable), axis=1)

frame = frame.drop(["stress_q4","stress_q5","stress_q6","stress_q12"], axis=1)

# Medication columns removed by name.
frame = frame.drop([
    'med_Naproxeno', 'med_Dexketoprofeno', 'med_Metamizol', 'med_Morfina',
    'med_Fentanil', 'med_Tramadol', 'med_Nitroglicerina', 'med_Verapamilo',
    'med_Nifedipino', 'med_Porpanolol', 'med_Atenolol', 'med_Labetalol',
    'med_Amiodarona', 'med_Lidocaina', 'med_Adenosina', 'med_Digoxina',
    'med_Sulfato de magnesio', 'med_Insulina', 'med_Glucagon', 'med_Lercanidipino',
    'med_Acenocumarol', 'med_Alprazolam', 'med_Midazolam', 'med_Diazepam',
    'med_Lorazepam', 'med_Lormetazepam', 'med_Clorazepato dipotasico', 'med_Sertralina',
    'med_Zolpidem', 'med_Dimenhidrinato', 'med_Doxilamina', 'med_Difenhidramina',
    'med_Dimenhidrinato', 'med_Hidroxicina', 'med_Cetirizina', 'med_Hidroclorotiazida',
], axis=1)

df_users_transf[1] = frame

In [16]:
# User 3: prune the feature set.
frame = df_users_transf[2]

# Whole feature families removed for this user.
for family in ('alcohol_', 'pollution_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

frame = frame.drop("depression_q6", axis=1)

# All diet questions go except the two listed.
removable = frame.filter(regex='diet_').columns.drop(["diet_q3","diet_q13"])
frame = frame.drop(list(removable), axis=1)

# Medication columns removed by name.
frame = frame.drop([
    'med_Paracetamol', 'med_Ibuprofeno', 'med_Dexketoprofeno', 'med_Metamizol',
    'med_Morfina', 'med_Fentanil', 'med_Tramadol', 'med_Nitroglicerina',
    'med_Verapamilo', 'med_Nifedipino', 'med_Porpanolol', 'med_Atenolol',
    'med_Labetalol', 'med_Amiodarona', 'med_Lidocaina', 'med_Adenosina',
    'med_Digoxina', 'med_Sulfato de magnesio', 'med_Insulina', 'med_Glucagon',
    'med_Lercanidipino', 'med_Acenocumarol', 'med_Alprazolam', 'med_Midazolam',
    'med_Diazepam', 'med_Lorazepam', 'med_Lormetazepam', 'med_Clorazepato dipotasico',
    'med_Sertralina', 'med_Zolpidem', 'med_Dimenhidrinato', 'med_Doxilamina',
    'med_Difenhidramina', 'med_Dimenhidrinato', 'med_Hidroxicina', 'med_Cetirizina',
], axis=1)

frame = frame.drop("smoke_other", axis=1)
frame = frame.drop(["social_q4","social_q7"], axis=1)
frame = frame.drop(["stress_q7","stress_q13"], axis=1)

df_users_transf[2] = frame

In [17]:
# User 4: prune the feature set.
frame = df_users_transf[3]

frame = frame.drop(["alcohol_destilled","alcohol_destilledmix","alcohol_other","alcohol_otherfermented"], axis=1)

# Whole feature families removed for this user.
for family in ('pollution_', 'smoke_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

frame = frame.drop(["depression_q2", "depression_q5", "depression_q7"], axis=1)
frame = frame.drop(["diet_q4", "diet_q6", "diet_q10", "diet_q11", "diet_q12", "diet_q13"], axis=1)

# Every medication column goes except med_Paracetamol.
removable = frame.filter(regex='med_').columns.drop(["med_Paracetamol"])
frame = frame.drop(list(removable), axis=1)

frame = frame.drop(["social_q5", "social_q7"], axis=1)
frame = frame.drop(["stress_q3", "stress_q6", "stress_q12"], axis=1)

df_users_transf[3] = frame

In [18]:
# User 5: prune the feature set.
frame = df_users_transf[4]

# Alcohol columns: only the two listed ones survive.
removable = frame.filter(regex='alcohol_').columns.drop(["alcohol_otherfermented", "alcohol_winemix"])
frame = frame.drop(list(removable), axis=1)

# Whole feature families removed for this user.
for family in ('pollution_', 'smoke_', 'med_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

frame = frame.drop(["depression_q2", "depression_q4", "depression_q8", "depression_q10"], axis=1)

# Diet questions: only the three listed ones survive.
removable = frame.filter(regex='diet_').columns.drop(["diet_q3", "diet_q7", "diet_q11"])
frame = frame.drop(list(removable), axis=1)

frame = frame.drop(["social_q1","social_q7"], axis=1)
frame = frame.drop(["stress_q7", "stress_q8", "stress_q9", "stress_q10", "stress_q13"], axis=1)

df_users_transf[4] = frame

In [19]:
# User 6: prune the feature set ("weight" is kept for this user).
frame = df_users_transf[5]

# Whole feature families removed for this user.
for family in ('alcohol_', 'pollution_', 'smoke_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Medication columns: only the two listed ones survive.
removable = frame.filter(regex='med_').columns.drop(["med_Naproxeno","med_Paracetamol"])
frame = frame.drop(list(removable), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height"],
    axis=1,
)

frame = frame.drop(["depression_q5","depression_q9"], axis=1)

# Families reduced to the listed surviving questions.
for family, kept in (
    ('diet_', ["diet_q3", "diet_q5", "diet_q8", "diet_q10", "diet_q14"]),
    ('social_', ["social_q4"]),
):
    removable = frame.filter(regex=family).columns.drop(kept)
    frame = frame.drop(list(removable), axis=1)

frame = frame.drop(["stress_q4"], axis=1)

df_users_transf[5] = frame


In [20]:
# User 7: prune the feature set.
frame = df_users_transf[6]

# Whole feature families removed for this user.
for family in ('alcohol_', 'pollution_', 'smoke_'):
    frame = frame.drop(list(frame.filter(regex=family)), axis=1)

# Medication columns: only the two listed ones survive.
removable = frame.filter(regex='med_').columns.drop(["med_Naproxeno","med_Paracetamol"])
frame = frame.drop(list(removable), axis=1)

# Demographic / static columns.
frame = frame.drop(
    ["birthcountry", "gender", "birthdate", "educationlevel", "homecountry", "height", "weight"],
    axis=1,
)

frame = frame.drop(["depression_q1","depression_q3","depression_q9","depression_q10"], axis=1)
frame = frame.drop(["physical_q5","physical_q7"], axis=1)

# Families reduced to the listed surviving questions.
for family, kept in (
    ('diet_', ["diet_q2", "diet_q3", "diet_q9"]),
    ('social_', ["social_q4", "social_q6"]),
):
    removable = frame.filter(regex=family).columns.drop(kept)
    frame = frame.drop(list(removable), axis=1)

frame = frame.drop(["stress_q13"], axis=1)

df_users_transf[6] = frame


In [21]:
# General dataset: prune the feature set. "educationlevel" and "gender" are kept.
frame = df_general_transf

frame = frame.drop(['alcohol_destilled', 'alcohol_destilledmix', 'alcohol_other'], axis=1)
frame = frame.drop(list(frame.filter(regex='pollution_')), axis=1)
frame = frame.drop(["birthcountry", "homecountry"], axis=1)

# Reference list of the medication columns:
#['med_Paracetamol', 'med_Ibuprofeno', 'med_Naproxeno', 'med_Dexketoprofeno', 'med_Acido acetil salicilico', 'med_Metamizol', 'med_Morfina',  'med_Fentanil', 'med_Tramadol', 'med_Nitroglicerina', 'med_Verapamilo', 'med_Nifedipino', 'med_Porpanolol', 'med_Atenolol', 'med_Labetalol',  'med_Amiodarona', 'med_Lidocaina', 'med_Adenosina', 'med_Digoxina', 'med_Sulfato de magnesio', 'med_Insulina', 'med_Glucagon',  'med_Lercanidipino', 'med_Acenocumarol', 'med_Alprazolam', 'med_Midazolam', 'med_Diazepam', 'med_Lorazepam', 'med_Lormetazepam',  'med_Clorazepato dipotasico', 'med_Sertralina', 'med_Zolpidem', 'med_Dimenhidrinato', 'med_Doxilamina', 'med_Difenhidramina',  'med_Dimenhidrinato', 'med_Hidroxicina', 'med_Cetirizina', 'med_Hidroclorotiazida']
# Medication columns removed by name.
frame = frame.drop([
    'med_Dexketoprofeno', 'med_Metamizol', 'med_Morfina', 'med_Fentanil',
    'med_Tramadol', 'med_Nitroglicerina', 'med_Verapamilo', 'med_Nifedipino',
    'med_Porpanolol', 'med_Atenolol', 'med_Labetalol', 'med_Amiodarona',
    'med_Lidocaina', 'med_Adenosina', 'med_Digoxina', 'med_Sulfato de magnesio',
    'med_Insulina', 'med_Glucagon', 'med_Acenocumarol', 'med_Alprazolam',
    'med_Midazolam', 'med_Diazepam', 'med_Clorazepato dipotasico', 'med_Dimenhidrinato',
    'med_Doxilamina', 'med_Difenhidramina', 'med_Dimenhidrinato', 'med_Hidroxicina',
    'med_Cetirizina',
], axis=1)

frame = frame.drop("smoke_other", axis=1)

df_general_transf = frame

In [22]:
# Disabled cell (string literal, not executed). The quoted code would save one
# histogram PDF per cleaned user frame under `plots_path`.
'''
for n in range(len(users)):
    df_users_transf[n].hist(bins=50, grid=False, figsize=(50,40))
    plt.savefig(plots_path + "database_" + str(n+1) + "_clear.pdf")
    plt.close()
'''

'\nfor n in range(len(users)):\n    df_users_transf[n].hist(bins=50, grid=False, figsize=(50,40))\n    plt.savefig(plots_path + "database_" + str(n+1) + "_clear.pdf")\n    plt.close()\n'

In [23]:
# Disabled cell (string literal, not executed). The quoted code would save the
# histogram PDF of the cleaned general frame under `plots_path`.
'''df_general_transf.hist(bins=50, grid=False, figsize=(50,40))
plt.savefig(plots_path + "database_general_clear.pdf")
plt.close()'''

'df_general_transf.hist(bins=50, grid=False, figsize=(50,40))\nplt.savefig(plots_path + "database_general_clear.pdf")\nplt.close()'

## Split training and test

In [24]:
# Snapshot of the general frame taken before its columns are reduced below.
df_original_general = df_general.copy(deep=True)

In [25]:
# Keep only the columns that survived the cleaning of df_general_transf
# (same column order as the cleaned frame).
kept_columns = df_general_transf.columns.values
df_general = df_general[kept_columns]

In [26]:
# 80/20 train/test split (fixed seed) for every user and for the general set.
# After this cell `df_users` and `df_general` hold TRAINING FEATURES only;
# the targets are stored separately (diastolic, systolic, and both together).
train_users_set, test_users_set = [], []

df_users = []
df_users_label_dia = []
df_users_label_sys = []
df_users_labels = []

for df_user_transf in df_users_transf:
    train, test = train_test_split(df_user_transf, test_size=0.2, random_state=42)
    train_users_set.append(train)
    test_users_set.append(test)

    # Features: everything except the two target columns.
    df_users.append(train.drop(["diastolic","systolic"], axis=1))

    # Targets, each copied out of the training split.
    df_users_label_dia.append(train["diastolic"].copy())
    df_users_label_sys.append(train["systolic"].copy())
    df_users_labels.append(train[["diastolic","systolic"]].copy())

train_general_set, test_general_set = train_test_split(
    df_general,
    test_size=0.2,
    random_state=42,
)

df_general = train_general_set.drop(["diastolic","systolic"], axis=1)

df_general_label_dia = train_general_set["diastolic"].copy()
df_general_label_sys = train_general_set["systolic"].copy()
df_general_labels = train_general_set[["diastolic","systolic"]].copy()

In [27]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [28]:
# The general frame keeps two categorical columns. They are one-hot encoded
# for modelling (the ordinal encoding above was only for viewing); every other
# column goes through the numeric pipeline.
cat_encoder_one = OneHotEncoder()

cat_attribs = ["educationlevel","gender"]

num_attribs = list(df_general)
for categorical_name in cat_attribs:
    num_attribs.remove(categorical_name)

# Output column order: all numeric columns first, then the one-hot columns.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

df_general_prepared = full_pipeline.fit_transform(df_general)

## Machine Learning algorithms

In [29]:
# Template row for the results CSV: three metadata columns followed by one
# 'feature_<name>' column per general-dataset feature, defaulting to '0.0'.
blankDict = {'user' : '', 'algorithm' : '', 'neg_mean_squared_error':  ''}
blankDict.update(
    ('feature_'+feature_name, '0.0')
    for feature_name in df_general.columns.values
)

# Start both output files from scratch: CSV header row and models-file title.
write_to_csv_file(blankDict.keys(), 'w')

write_to_models_file("BEST MODELS", 'w')

In [30]:
# Search settings: randomized-search iterations, cross-validation folds for
# the per-user and general searches, and the maximum grade value.
number_iter = 100
cv_users = cv_general = 10
max_grade = 5

### Random Forest

In [31]:
# Random forest, one model per user: a randomized hyper-parameter search is
# fitted on the user's training split and the winning estimator is scored on
# that user's held-out split.
for n, user_id in enumerate(users):

    attribs = list(df_users[n])

    # The shared numeric pipeline is refitted on this user's training features.
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Search space (scipy's randint excludes the upper bound).
    param_distribs_rf = dict(
        n_estimators=randint(low=1, high=200),
        max_features=randint(low=1, high=len(attribs)),
    )

    forest_reg = RandomForestRegressor(random_state=42)
    rnd_rf_search = RandomizedSearchCV(
        forest_reg,
        param_distributions=param_distribs_rf,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_rf_search.fit(df_user_prepared, df_users_labels[n])

    # Best model found by the search.
    model = rnd_rf_search.best_estimator_
    write_to_models_file("RF:{}:{}".format(user_id, model), 'a')

    # Held-out split, preprocessed with the pipeline fitted above.
    held_out = test_users_set[n]
    X_test = held_out.drop(["diastolic","systolic"], axis=1)
    y_test = held_out[["diastolic","systolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Test-set RMSE.
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # (importance, feature name) pairs, most important first.
    importances = model.feature_importances_
    importance_features = sorted(zip(importances, attribs), reverse=True)

    # One CSV row per user; the metric column receives the RMSE.
    write_features_to_csv_file(blankDict, user_id, 'RF', final_rmse, importance_features)



In [32]:
# Random forest on the general dataset: randomized hyper-parameter search on
# the prepared training matrix, then evaluation on the general test split.

attribs = list(df_general)

# Search space (scipy's randint excludes the upper bound).
param_distribs_rf = {
            'n_estimators': randint(low=1, high=200),
            'max_features': randint(low=1, high=len(df_general.columns)),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_rf_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs_rf,
                                n_iter=number_iter, cv=cv_general, scoring='neg_mean_squared_error', random_state=42)
rnd_rf_search.fit(df_general_prepared, df_general_labels)

# Best model
model = rnd_rf_search.best_estimator_

write_to_models_file("RF:" + 'General' + ':' + str(model), 'a')

# Prepare test_set with the already-fitted full pipeline
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic","systolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

# Metric (test-set RMSE)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# Score of the features in the model.
# The model was trained on the output of full_pipeline: the numeric columns
# first, then one block of one-hot columns per categorical attribute. Zipping
# the importances with the raw column names would therefore give the
# categorical attributes the scores of the wrong columns. Numeric importances
# are taken one-to-one; each categorical attribute gets the sum of the
# importances of its one-hot columns.
importances = rnd_rf_search.best_estimator_.feature_importances_
n_numeric = len(num_attribs)
feature_scores = list(zip(importances[:n_numeric], num_attribs))
offset = n_numeric
for cat_name, categories in zip(cat_attribs, full_pipeline.named_transformers_["cat"].categories_):
    width = len(categories)
    feature_scores.append((importances[offset:offset + width].sum(), cat_name))
    offset += width
importance_features = sorted(feature_scores, reverse=True)

# Save the score and metric in the file of results
write_features_to_csv_file(blankDict, 'General', 'RF', final_rmse, importance_features)



### Decision Tree

In [33]:
# Decision tree, one model per user: a randomized hyper-parameter search is
# fitted on the user's training split and the winning estimator is scored on
# that user's held-out split.
for n, user_id in enumerate(users):

    attribs = list(df_users[n])

    # The shared numeric pipeline is refitted on this user's training features.
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Search space (scipy's randint excludes the upper bound).
    param_distribs_dt = dict(
        max_depth=randint(low=2, high=150),
        min_samples_split=randint(low=2, high=80),
        max_features=randint(low=1, high=len(attribs)),
    )

    tree_reg = DecisionTreeRegressor(random_state=42)
    rnd_tree_search = RandomizedSearchCV(
        tree_reg,
        param_distributions=param_distribs_dt,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_tree_search.fit(df_user_prepared, df_users_labels[n])

    # Best model found by the search.
    model = rnd_tree_search.best_estimator_
    write_to_models_file("DT:{}:{}".format(user_id, model), 'a')

    # Held-out split, preprocessed with the pipeline fitted above.
    held_out = test_users_set[n]
    X_test = held_out.drop(["diastolic","systolic"], axis=1)
    y_test = held_out[["diastolic","systolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Test-set RMSE.
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # (importance, feature name) pairs, most important first.
    importances = model.feature_importances_
    importance_features = sorted(zip(importances, attribs), reverse=True)

    # One CSV row per user; the metric column receives the RMSE.
    write_features_to_csv_file(blankDict, user_id, 'DT', final_rmse, importance_features)



In [34]:
# Decision tree on the general dataset: randomized hyper-parameter search on
# the prepared training matrix, then evaluation on the general test split.

attribs = list(df_general)

# Search space (scipy's randint excludes the upper bound).
param_distribs_dt = {
        'max_depth': randint(low=2, high=150),
        'min_samples_split': randint(low=2, high=80),
        'max_features': randint(low=1, high=len(df_general.columns)),
    }


tree_reg = DecisionTreeRegressor(random_state=42)
rnd_tree_search = RandomizedSearchCV(tree_reg, param_distributions=param_distribs_dt,
                                n_iter=number_iter, cv=cv_general, scoring='neg_mean_squared_error', random_state=42)
rnd_tree_search.fit(df_general_prepared, df_general_labels)

# Best model
model = rnd_tree_search.best_estimator_

write_to_models_file("DT:" + 'General' + ':' + str(model), 'a')

# Prepare test_set with the already-fitted full pipeline
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic","systolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

# Metric (test-set RMSE)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# Score of the features in the model.
# The model was trained on the output of full_pipeline: the numeric columns
# first, then one block of one-hot columns per categorical attribute. Zipping
# the importances with the raw column names would therefore give the
# categorical attributes the scores of the wrong columns. Numeric importances
# are taken one-to-one; each categorical attribute gets the sum of the
# importances of its one-hot columns.
importances = rnd_tree_search.best_estimator_.feature_importances_
n_numeric = len(num_attribs)
feature_scores = list(zip(importances[:n_numeric], num_attribs))
offset = n_numeric
for cat_name, categories in zip(cat_attribs, full_pipeline.named_transformers_["cat"].categories_):
    width = len(categories)
    feature_scores.append((importances[offset:offset + width].sum(), cat_name))
    offset += width
importance_features = sorted(feature_scores, reverse=True)

# Save the score and metric in the file of results
write_features_to_csv_file(blankDict, 'General', 'DT', final_rmse, importance_features)



### SVR

In [None]:
# Search space shared by all SVR searches.
# `randint` is a discrete distribution over integers, so it cannot express the
# 1e-4 .. 1e+4 range that was intended for gamma (a non-integer bound is
# rejected or truncated, depending on the SciPy version). gamma is sampled
# log-uniformly over that range instead, and C keeps its integer range 1..99
# with explicit integer bounds.
# Note: scikit-learn's SVR does not use gamma when the kernel is 'linear'.
param_distribs_svr = {
            'kernel': ['linear'],
            'gamma': reciprocal(1e-4, 1e+4),
            'C': randint(low=1, high=100)
        }

In [None]:
# SVR on the diastolic target, one model per user: a randomized search is
# fitted on the user's training split and the winning estimator is scored on
# that user's held-out split.
for n, user_id in enumerate(users):

    attribs = list(df_users[n])

    # The shared numeric pipeline is refitted on this user's training features.
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    svm_reg = SVR()
    rnd_svr_search = RandomizedSearchCV(
        svm_reg,
        param_distributions=param_distribs_svr,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_svr_search.fit(df_user_prepared, df_users_label_dia[n])

    # Best model found by the search.
    model = rnd_svr_search.best_estimator_
    write_to_models_file("SVR_dia:{}:{}".format(user_id, model), 'a')

    # Held-out split, preprocessed with the pipeline fitted above.
    held_out = test_users_set[n]
    X_test = held_out.drop(["diastolic","systolic"], axis=1)
    y_test = held_out[["diastolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Test-set RMSE.
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # The first row of coef_ is used as the per-feature score; pairs are
    # sorted by signed coefficient, largest first.
    importance = model.coef_
    importance_features = sorted(zip(importance[0], attribs), reverse=True)

    # One CSV row per user; the metric column receives the RMSE.
    write_features_to_csv_file(blankDict, user_id, 'SVR_dia', final_rmse, importance_features)



In [None]:
# SVR on the diastolic target of the general dataset: randomized search on
# the prepared training matrix, then evaluation on the general test split.
attribs = list(df_general)

svm_reg = SVR()
rnd_svr_search = RandomizedSearchCV(
    svm_reg,
    param_distributions=param_distribs_svr,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_svr_search.fit(df_general_prepared, df_general_label_dia)

# Best model found by the search.
model = rnd_svr_search.best_estimator_
write_to_models_file("SVR_dia:General:{}".format(model), 'a')

# Test split, preprocessed with the already-fitted full pipeline.
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic"]].copy()
X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# Test-set RMSE.
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# The first row of coef_ is used as the per-feature score.
# NOTE(review): the model is trained on df_general_prepared, where the
# categorical attributes are one-hot expanded, so coef_ has one entry per
# prepared column while `attribs` holds the raw column names. zip() stops at
# the shorter sequence, so the coefficients paired with the categorical names
# look misattributed - confirm and decide how they should be reported.
importance = model.coef_
importance_features = sorted(zip(importance[0], attribs), reverse=True)

# One CSV row; the metric column receives the RMSE.
write_features_to_csv_file(blankDict, 'General', 'SVR_dia', final_rmse, importance_features)

In [None]:
# SVR on the systolic target, one model per user: a randomized search is
# fitted on the user's training split and the winning estimator is scored on
# that user's held-out split.
for n, user_id in enumerate(users):

    attribs = list(df_users[n])

    # The shared numeric pipeline is refitted on this user's training features.
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    svm_reg = SVR()
    rnd_svr_search = RandomizedSearchCV(
        svm_reg,
        param_distributions=param_distribs_svr,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_svr_search.fit(df_user_prepared, df_users_label_sys[n])

    # Best model found by the search.
    model = rnd_svr_search.best_estimator_
    write_to_models_file("SVR_sys:{}:{}".format(user_id, model), 'a')

    # Held-out split, preprocessed with the pipeline fitted above.
    held_out = test_users_set[n]
    X_test = held_out.drop(["diastolic","systolic"], axis=1)
    y_test = held_out[["systolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Test-set RMSE.
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # The first row of coef_ is used as the per-feature score; pairs are
    # sorted by signed coefficient, largest first.
    importance = model.coef_
    importance_features = sorted(zip(importance[0], attribs), reverse=True)

    # One CSV row per user; the metric column receives the RMSE.
    write_features_to_csv_file(blankDict, user_id, 'SVR_sys', final_rmse, importance_features)

In [None]:
# Column names of the general training frame; zipped with the coefficients below
attribs = list(df_general)

# Randomised hyper-parameter search for the systolic SVR on the general data set
svm_reg = SVR()
rnd_svr_search = RandomizedSearchCV(
    svm_reg,
    param_distributions=param_distribs_svr,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_svr_search.fit(df_general_prepared, df_general_label_sys)

# Keep the winning estimator and record its description
model = rnd_svr_search.best_estimator_
write_to_models_file(':'.join(["SVR_sys", 'General', str(model)]), 'a')

# Hold-out features and systolic target
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["systolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# RMSE on the hold-out rows
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# Coefficients of the selected SVR, largest first.
# NOTE(review): SVR exposes coef_ only for a linear kernel - confirm param_distribs_svr.
importance = model.coef_
importance_features = list(zip(importance[0], attribs))
importance_features.sort(reverse=True)

# Append the metric and the per-feature coefficients to the results file
write_features_to_csv_file(blankDict, 'General', 'SVR_sys', final_rmse, importance_features)

### Bayesian Ridge

In [40]:
# Search space for the BayesianRidge searches: n_iter is sampled from randint(50, 800).
# NOTE(review): recent scikit-learn releases renamed BayesianRidge's n_iter to max_iter -
# confirm against the installed version.
param_distribs_br = dict(n_iter=randint(low=50, high=800))

In [41]:
# Per-user diastolic Bayesian Ridge: tune, evaluate on that user's hold-out rows, log the outcome
for n in range(len(users)):

    # Column names of this user's training frame; zipped with the coefficients below
    attribs = list(df_users[n])

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Search best model
    br_reg = BayesianRidge(compute_score=True)
    rnd_br_search = RandomizedSearchCV(br_reg, param_distributions=param_distribs_br,
                                    n_iter=number_iter, cv=cv_users, scoring='neg_mean_squared_error', random_state=42)
    rnd_br_search.fit(df_user_prepared, df_users_label_dia[n])

    # Best model
    model = rnd_br_search.best_estimator_

    # Label the entry with the current user (it was hard-coded to 'General', which made
    # every per-user entry indistinguishable from the general model in the models file)
    write_to_models_file("BR_dia:" + str(users[n]) + ':' + str(model), 'a')

    # Prepare test_set
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["diastolic"]].copy()

    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Metric: RMSE on the hold-out rows
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # Coefficients of the selected model paired with the column names, largest first
    importance = model.coef_
    importance_features = sorted(zip(importance, attribs), reverse=True)

    # Save the score and metric in the file of results
    write_features_to_csv_file(blankDict, users[n], 'BR_dia', final_rmse, importance_features)



In [42]:
# Column names of the general training frame; zipped with the coefficients below
attribs = list(df_general)

# Randomised hyper-parameter search for the diastolic Bayesian Ridge on the general data set
br_reg = BayesianRidge(compute_score=True)
rnd_br_search = RandomizedSearchCV(
    br_reg,
    param_distributions=param_distribs_br,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_br_search.fit(df_general_prepared, df_general_label_dia)

# Keep the winning estimator and record its description
model = rnd_br_search.best_estimator_
write_to_models_file(':'.join(["BR_dia", 'General', str(model)]), 'a')

# Hold-out features and diastolic target
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# RMSE on the hold-out rows
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# Coefficients of the selected model paired with the column names, largest first
importance = model.coef_
importance_features = list(zip(importance, attribs))
importance_features.sort(reverse=True)

# Append the metric and the per-feature coefficients to the results file
write_features_to_csv_file(blankDict, 'General', 'BR_dia', final_rmse, importance_features)

In [43]:
# Per-user systolic Bayesian Ridge: tune, evaluate on that user's hold-out rows, log the outcome
for n in range(len(users)):

    # Column names of this user's training frame; zipped with the coefficients below
    attribs = list(df_users[n])

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Search best model
    br_reg = BayesianRidge(compute_score=True)
    rnd_br_search = RandomizedSearchCV(br_reg, param_distributions=param_distribs_br,
                                    n_iter=number_iter, cv=cv_users, scoring='neg_mean_squared_error', random_state=42)
    rnd_br_search.fit(df_user_prepared, df_users_label_sys[n])

    # Best model
    model = rnd_br_search.best_estimator_

    # Label the entry with the current user (it was hard-coded to 'General', which made
    # every per-user entry indistinguishable from the general model in the models file)
    write_to_models_file("BR_sys:" + str(users[n]) + ':' + str(model), 'a')

    # Prepare test_set
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["systolic"]].copy()

    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # Metric: RMSE on the hold-out rows
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # Coefficients of the selected model paired with the column names, largest first
    importance = model.coef_
    importance_features = sorted(zip(importance, attribs), reverse=True)

    # Save the score and metric in the file of results
    write_features_to_csv_file(blankDict, users[n], 'BR_sys', final_rmse, importance_features)



In [44]:
# Column names of the general training frame; zipped with the coefficients below
attribs = list(df_general)

# Randomised hyper-parameter search for the systolic Bayesian Ridge on the general data set
br_reg = BayesianRidge(compute_score=True)
rnd_br_search = RandomizedSearchCV(
    br_reg,
    param_distributions=param_distribs_br,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_br_search.fit(df_general_prepared, df_general_label_sys)

# Keep the winning estimator and record its description
model = rnd_br_search.best_estimator_
write_to_models_file(':'.join(["BR_sys", 'General', str(model)]), 'a')

# Hold-out features and systolic target
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["systolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# RMSE on the hold-out rows
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# Coefficients of the selected model paired with the column names, largest first
importance = model.coef_
importance_features = list(zip(importance, attribs))
importance_features.sort(reverse=True)

# Append the metric and the per-feature coefficients to the results file
write_features_to_csv_file(blankDict, 'General', 'BR_sys', final_rmse, importance_features)



### Linear regression

In [45]:
# Search space for the LinearRegression searches: n_jobs is sampled from randint(20, 800).
# NOTE(review): n_jobs looks like a parallelism setting rather than a model hyperparameter,
# so every sampled candidate presumably yields the same fit - confirm this search is intended.
param_distribs_lr = dict(n_jobs=randint(low=20, high=800))

In [46]:
# Per-user diastolic linear regression: tune, evaluate on that user's hold-out rows, log the outcome
for n in range(len(users)):

    # Column names of this user's training frame; zipped with the coefficients below
    attribs = list(df_users[n])

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Randomised search over param_distribs_lr
    lin_reg = LinearRegression()
    rnd_lr_search = RandomizedSearchCV(
        lin_reg,
        param_distributions=param_distribs_lr,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_lr_search.fit(df_user_prepared, df_users_label_dia[n])

    # Keep the winning estimator and record its description
    model = rnd_lr_search.best_estimator_
    write_to_models_file(':'.join(["LR_dia", str(users[n]), str(model)]), 'a')

    # Hold-out features and diastolic target for this user
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["diastolic"]].copy()

    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # RMSE on the hold-out rows
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # Coefficients of the selected model paired with the column names, largest first
    importance = model.coef_
    importance_features = list(zip(importance, attribs))
    importance_features.sort(reverse=True)

    # Append the metric and the per-feature coefficients to the results file
    write_features_to_csv_file(blankDict, users[n], 'LR_dia', final_rmse, importance_features)



In [47]:
# Column names of the general training frame; zipped with the coefficients below
attribs = list(df_general)

# Randomised search for the diastolic linear regression on the general data set
lin_reg = LinearRegression()
rnd_lr_search = RandomizedSearchCV(
    lin_reg,
    param_distributions=param_distribs_lr,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_lr_search.fit(df_general_prepared, df_general_label_dia)

# Keep the winning estimator and record its description
model = rnd_lr_search.best_estimator_
write_to_models_file(':'.join(["LR_dia", 'General', str(model)]), 'a')

# Hold-out features and diastolic target
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# RMSE on the hold-out rows
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# Coefficients of the selected model paired with the column names, largest first
importance = model.coef_
importance_features = list(zip(importance, attribs))
importance_features.sort(reverse=True)

# Append the metric and the per-feature coefficients to the results file
write_features_to_csv_file(blankDict, 'General', 'LR_dia', final_rmse, importance_features)



In [48]:
# Per-user systolic linear regression: tune, evaluate on that user's hold-out rows, log the outcome
for n in range(len(users)):

    # Column names of this user's training frame; zipped with the coefficients below
    attribs = list(df_users[n])

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    # Randomised search over param_distribs_lr
    lin_reg = LinearRegression()
    rnd_lr_search = RandomizedSearchCV(
        lin_reg,
        param_distributions=param_distribs_lr,
        n_iter=number_iter,
        cv=cv_users,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    rnd_lr_search.fit(df_user_prepared, df_users_label_sys[n])

    # Keep the winning estimator and record its description
    model = rnd_lr_search.best_estimator_
    write_to_models_file(':'.join(["LR_sys", str(users[n]), str(model)]), 'a')

    # Hold-out features and systolic target for this user
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["systolic"]].copy()

    X_test_prepared = num_pipeline.transform(X_test)
    predictions = model.predict(X_test_prepared)

    # RMSE on the hold-out rows
    final_mse = mean_squared_error(y_test, predictions)
    final_rmse = np.sqrt(final_mse)

    # Coefficients of the selected model paired with the column names, largest first
    importance = model.coef_
    importance_features = list(zip(importance, attribs))
    importance_features.sort(reverse=True)

    # Append the metric and the per-feature coefficients to the results file
    write_features_to_csv_file(blankDict, users[n], 'LR_sys', final_rmse, importance_features)



In [49]:
# Column names of the general training frame; zipped with the coefficients below
attribs = list(df_general)

# Randomised search for the systolic linear regression on the general data set
lin_reg = LinearRegression()
rnd_lr_search = RandomizedSearchCV(
    lin_reg,
    param_distributions=param_distribs_lr,
    n_iter=number_iter,
    cv=cv_general,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_lr_search.fit(df_general_prepared, df_general_label_sys)

# Keep the winning estimator and record its description
model = rnd_lr_search.best_estimator_
write_to_models_file(':'.join(["LR_sys", 'General', str(model)]), 'a')

# Hold-out features and systolic target
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["systolic"]].copy()

X_test_prepared = full_pipeline.transform(X_test)
predictions = model.predict(X_test_prepared)

# RMSE on the hold-out rows
final_mse = mean_squared_error(y_test, predictions)
final_rmse = np.sqrt(final_mse)

# Coefficients of the selected model paired with the column names, largest first
importance = model.coef_
importance_features = list(zip(importance, attribs))
importance_features.sort(reverse=True)

# Append the metric and the per-feature coefficients to the results file
write_features_to_csv_file(blankDict, 'General', 'LR_sys', final_rmse, importance_features)



### Polynomial regression

In [50]:
# For each user, fit one polynomial regression per degree on the diastolic target
# and keep the degree with the lowest RMSE on that user's test rows
for n in range(len(users)):

    rmses = []
    degrees = np.arange(2, max_grade)
    models = list()
    # Start from +inf so any finite RMSE is accepted as the first best
    min_rmse, min_deg = np.inf, 0
    model = None

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    X = df_user_prepared.copy()
    y = df_users_label_dia[n].copy()

    attribs = list(df_users[n])

    # The test data does not depend on the degree: prepare it once, outside the degree loop
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["diastolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)

    for deg in degrees:

        lin_reg = LinearRegression()

        poly_reg = make_pipeline(PolynomialFeatures(deg),lin_reg)

        poly_reg.fit(X, y)

        final_predictions = poly_reg.predict(X_test_prepared)

        final_mse = mean_squared_error(y_test, final_predictions)
        final_rmse = np.sqrt(final_mse)

        models.append(lin_reg)

        # Degree selection on the test set (not cross-validation).
        # NOTE(review): because the same rows pick the degree and report the score,
        # min_rmse is an optimistic estimate.
        if min_rmse > final_rmse:
            min_rmse = final_rmse
            min_deg = deg
            # Remember the fitted regressor directly instead of re-deriving its list index
            model = lin_reg

    # Best model
    if model is None:
        raise ValueError("No polynomial degree could be selected; check that max_grade > 2")

    write_to_models_file("PR_dia:" + str(users[n]) + ':' + "Polynomial grade = "+ str(min_deg), 'a')

    # Score of the features in the model.
    # PolynomialFeatures emits the constant (bias) column first, followed by the degree-1
    # terms in input order, so skip the bias to pair each attribute with its own linear
    # coefficient (higher-order terms are not reported).
    importance = np.ravel(model.coef_)[1:len(attribs) + 1]
    importance_features = sorted(zip(importance, attribs), reverse=True)

    # Save the score and metric in the file of results
    write_features_to_csv_file(blankDict, users[n], 'PR_dia', min_rmse, importance_features)

In [51]:
# General data set: fit one polynomial regression per degree on the diastolic target
# and keep the degree with the lowest RMSE on the general test rows

rmses = []
degrees = np.arange(2, max_grade)
models = list()
# Start from +inf so any finite RMSE is accepted as the first best
min_rmse, min_deg = np.inf, 0
model = None

X = df_general_prepared.copy()
y = df_general_label_dia.copy()

attribs = list(df_general)

# The test data does not depend on the degree: prepare it once, outside the degree loop
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["diastolic"]].copy()
X_test_prepared = full_pipeline.transform(X_test)

for deg in degrees:

    lin_reg = LinearRegression()

    poly_reg = make_pipeline(PolynomialFeatures(deg),lin_reg)

    poly_reg.fit(X, y)

    final_predictions = poly_reg.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    models.append(lin_reg)

    # Degree selection on the test set (not cross-validation).
    # NOTE(review): because the same rows pick the degree and report the score,
    # min_rmse is an optimistic estimate.
    if min_rmse > final_rmse:
        min_rmse = final_rmse
        min_deg = deg
        # Remember the fitted regressor directly instead of re-deriving its list index
        model = lin_reg

# Best model
if model is None:
    raise ValueError("No polynomial degree could be selected; check that max_grade > 2")

write_to_models_file("PR_dia:" + 'General' + ':' + "Polynomial grade = "+ str(min_deg), 'a')

# Score of the features in the model.
# PolynomialFeatures emits the constant (bias) column first, followed by the degree-1
# terms in input order, so skip the bias to pair each attribute with its own linear
# coefficient (higher-order terms are not reported).
# NOTE(review): assumes the columns of df_general_prepared follow the order of df_general - confirm.
importance = np.ravel(model.coef_)[1:len(attribs) + 1]
importance_features = sorted(zip(importance, attribs), reverse=True)

# Save the score and metric in the file of results
write_features_to_csv_file(blankDict, 'General', 'PR_dia', min_rmse, importance_features)

In [52]:
# For each user, fit one polynomial regression per degree on the systolic target
# and keep the degree with the lowest RMSE on that user's test rows
for n in range(len(users)):

    rmses = []
    degrees = np.arange(2, max_grade)
    models = list()
    # Start from +inf so any finite RMSE is accepted as the first best
    min_rmse, min_deg = np.inf, 0
    model = None

    # The numeric pipeline is re-fitted on the current user before it is reused on the test rows
    df_user_prepared = num_pipeline.fit_transform(df_users[n])

    X = df_user_prepared.copy()
    y = df_users_label_sys[n].copy()

    attribs = list(df_users[n])

    # The test data does not depend on the degree: prepare it once, outside the degree loop
    X_test = test_users_set[n].drop(["diastolic","systolic"], axis=1)
    y_test = test_users_set[n][["systolic"]].copy()
    X_test_prepared = num_pipeline.transform(X_test)

    for deg in degrees:

        lin_reg = LinearRegression()

        poly_reg = make_pipeline(PolynomialFeatures(deg),lin_reg)

        poly_reg.fit(X, y)

        final_predictions = poly_reg.predict(X_test_prepared)

        final_mse = mean_squared_error(y_test, final_predictions)
        final_rmse = np.sqrt(final_mse)

        models.append(lin_reg)

        # Degree selection on the test set (not cross-validation).
        # NOTE(review): because the same rows pick the degree and report the score,
        # min_rmse is an optimistic estimate.
        if min_rmse > final_rmse:
            min_rmse = final_rmse
            min_deg = deg
            # Remember the fitted regressor directly instead of re-deriving its list index
            model = lin_reg

    # Best model
    if model is None:
        raise ValueError("No polynomial degree could be selected; check that max_grade > 2")

    write_to_models_file("PR_sys:" + str(users[n]) + ':' + "Polynomial grade = "+ str(min_deg), 'a')

    # Score of the features in the model.
    # PolynomialFeatures emits the constant (bias) column first, followed by the degree-1
    # terms in input order, so skip the bias to pair each attribute with its own linear
    # coefficient (higher-order terms are not reported).
    importance = np.ravel(model.coef_)[1:len(attribs) + 1]
    importance_features = sorted(zip(importance, attribs), reverse=True)

    # Save the score and metric in the file of results
    write_features_to_csv_file(blankDict, users[n], 'PR_sys', min_rmse, importance_features)

In [53]:
# General data set: fit one polynomial regression per degree on the systolic target
# and keep the degree with the lowest RMSE on the general test rows

rmses = []
degrees = np.arange(2, max_grade)
models = list()
# Start from +inf so any finite RMSE is accepted as the first best
min_rmse, min_deg = np.inf, 0
model = None

X = df_general_prepared.copy()
y = df_general_label_sys.copy()

attribs = list(df_general)

# The test data does not depend on the degree: prepare it once, outside the degree loop
X_test = test_general_set.drop(["diastolic","systolic"], axis=1)
y_test = test_general_set[["systolic"]].copy()
X_test_prepared = full_pipeline.transform(X_test)

for deg in degrees:

    lin_reg = LinearRegression()

    poly_reg = make_pipeline(PolynomialFeatures(deg),lin_reg)

    poly_reg.fit(X, y)

    final_predictions = poly_reg.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    models.append(lin_reg)

    # Degree selection on the test set (not cross-validation).
    # NOTE(review): because the same rows pick the degree and report the score,
    # min_rmse is an optimistic estimate.
    if min_rmse > final_rmse:
        min_rmse = final_rmse
        min_deg = deg
        # Remember the fitted regressor directly instead of re-deriving its list index
        model = lin_reg

# Best model
if model is None:
    raise ValueError("No polynomial degree could be selected; check that max_grade > 2")

write_to_models_file("PR_sys:" + 'General' + ':' + "Polynomial grade = "+ str(min_deg), 'a')

# Score of the features in the model.
# PolynomialFeatures emits the constant (bias) column first, followed by the degree-1
# terms in input order, so skip the bias to pair each attribute with its own linear
# coefficient (higher-order terms are not reported).
# NOTE(review): assumes the columns of df_general_prepared follow the order of df_general - confirm.
importance = np.ravel(model.coef_)[1:len(attribs) + 1]
importance_features = sorted(zip(importance, attribs), reverse=True)

# Save the score and metric in the file of results
write_features_to_csv_file(blankDict, 'General', 'PR_sys', min_rmse, importance_features)