# TFM - Cristian Leguisamon - Universidad Castilla-La Mancha 
## Máster en ciencia de datos e ingeniería de datos en la nube
### TFM - Parte 01

In [31]:
import pandas as pd
import numpy as np
import json
import sklearn
from sklearn.linear_model import LinearRegression

# --- Simulation parameters ---------------------------------------------------
# Each limit_* pair is [min, max]; the midpoint of limit_height / limit_bmi is
# used further down as the mean of the normal distribution that simulates
# missing heights and BMIs, with std_dev_* as the spread.
limit_height = [140, 200]
limit_bmi = [20, 40]
limit_weight = [50, 150]
std_dev_height = 10
std_dev_bmi = 5

# --- Input folders and raw tables --------------------------------------------
data_folder = "Data FitBit"
data_folder_nutrition = "Data Nutrition"
df_dailyActivity = pd.read_csv(f"{data_folder}/dailyActivity_merged.csv")
df_weightLogInfo = pd.read_csv(f"{data_folder}/weightLogInfo_merged.csv")

Se realizan transformaciones sobre el dataframe weightLogInfo

In [32]:
# Missing values are replaced with 0 before any further processing.
df_weightLogInfo = df_weightLogInfo.fillna(0)
df_weightLogInfo['Date'] = pd.to_datetime(df_weightLogInfo['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Keep only the most recent weight log of every user. idxmax already returns the
# label of the latest 'Date' per 'Id', so no prior sort is needed; .loc then
# yields one row per user ordered by 'Id'.
idx_most_recent = df_weightLogInfo.groupby('Id')['Date'].idxmax()
df_weightLogInfo = df_weightLogInfo.loc[idx_most_recent].reset_index(drop=True)

# Users that appear in the daily-activity table but have no weight log at all.
# A set gives constant-time membership checks while the comprehension preserves
# the order in which the ids first appear in df_dailyActivity.
id_user_weight = df_weightLogInfo['Id'].unique()
ids_with_weight = set(id_user_weight)
id_user_list_da = [
    id_user for id_user in df_dailyActivity['Id'].unique()
    if id_user not in ids_with_weight
]

df_weightLogInfo = df_weightLogInfo.drop(columns=['WeightPounds', 'IsManualReport', 'LogId', 'Fat'])
df_weightLogInfo[['WeightKg', 'BMI']] = df_weightLogInfo[['WeightKg', 'BMI']].round(2)
# 'Date' is already datetime64 here, so it can be truncated to a plain date directly.
df_weightLogInfo['Date'] = df_weightLogInfo['Date'].dt.date
# BMI = kg / m^2  =>  height_m = sqrt(kg / BMI); stored in centimetres.
df_weightLogInfo['Height'] = (np.sqrt(df_weightLogInfo['WeightKg'] / df_weightLogInfo['BMI']) * 100).round(2)

columns_df = df_weightLogInfo.columns.tolist()
max_date = df_weightLogInfo['Date'].max()

In [33]:
# Placeholder frame: one all-zero row (same columns as the weight log) for each
# user that has activity data but no weight log; only 'Id' carries real values.
missing_count = len(id_user_list_da)
data = {}
for column in columns_df:
    data[column] = [0] * missing_count
data['Id'] = id_user_list_da
df_result = pd.DataFrame(data)

In [34]:
def calculate_coeficients(df_weightLogInfo):
    """Fit a linear model of WeightKg on BMI and Height.

    Parameters
    ----------
    df_weightLogInfo : DataFrame
        Must provide the columns 'BMI', 'Height' and 'WeightKg'.

    Returns
    -------
    tuple
        (coefficient for BMI, coefficient for Height, intercept) of the fitted
        LinearRegression, in that order.
    """
    features = df_weightLogInfo[['BMI', 'Height']]
    target = df_weightLogInfo['WeightKg']
    model = LinearRegression().fit(features, target)
    # coef_ follows the column order of `features`: BMI first, Height second.
    bmi_slope, height_slope = model.coef_[0], model.coef_[1]
    return bmi_slope, height_slope, model.intercept_

# Simulate BMI and height for every user without a weight log and append the
# resulting rows to df_weightLogInfo.
#
# The regression coefficients used to be recomputed on every iteration but were
# never read, and the fit ran on a frame that already contained the synthetic
# rows (WeightKg == 0). That call has been removed. The rows are also collected
# first and concatenated once instead of growing the frame inside the loop.
mean_height = np.mean(limit_height)
mean_bmi = np.mean(limit_bmi)

new_rows = []
for user_id, placeholder_date in zip(df_result['Id'], df_result['Date']):
    # Draw order (height first, then BMI) is kept so that a seeded run produces
    # the same values as the per-row implementation did.
    height_value = np.random.normal(mean_height, std_dev_height)
    bmi = np.random.normal(mean_bmi, std_dev_bmi)
    new_rows.append({
        'Id': user_id,
        'Date': placeholder_date,  # placeholder value taken from df_result; back-filled afterwards
        'WeightKg': 0,             # derived from BMI and height afterwards
        'BMI': bmi,
        'Height': height_value,
    })

# Skip the concat entirely when every user already has a weight log.
if new_rows:
    df_tmp = pd.DataFrame(new_rows)
    df_weightLogInfo = pd.concat([df_weightLogInfo, df_tmp], ignore_index=True)

# Rows whose 'Date' is the placeholder 0 receive max_date instead.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['Date'] is chained assignment, which newer pandas versions warn about
# and which does not modify the frame once Copy-on-Write is enabled.
df_weightLogInfo['Date'] = df_weightLogInfo['Date'].replace(0, np.nan).fillna(max_date)

# Round the inputs, then derive the weight from them: kg = BMI * height_m^2.
# WeightKg itself is not rounded beforehand because it is overwritten here.
df_weightLogInfo[['BMI', 'Height']] = df_weightLogInfo[['BMI', 'Height']].round(2)
df_weightLogInfo['WeightKg'] = df_weightLogInfo['BMI'] * (df_weightLogInfo['Height'] / 100) ** 2

In [35]:
# Plain-text preview of the first five rows of the completed weight table.
print(df_weightLogInfo.head(n=5))

           Id        Date    WeightKg    BMI  Height
0  1503960366  2016-05-03   52.599443  22.65  152.39
1  1927972279  2016-04-13  133.506850  47.54  167.58
2  2873212765  2016-05-12   57.303357  21.69  162.54
3  4319703577  2016-05-04   72.300313  27.38  162.50
4  4558609924  2016-05-09   69.102721  27.00  159.98


Combinamos los dataframes de información diaria con la información del peso de cada persona

In [36]:
# Inner join on 'Id': every daily-activity row is paired with the single weight
# row of the same user. Activity columns that are not used afterwards are dropped.
unused_activity_columns = [
    'TrackerDistance',
    'LoggedActivitiesDistance',
    'SedentaryActiveDistance',
    'ActivityDate',
]
merged_df = (
    df_weightLogInfo
    .merge(df_dailyActivity, on='Id', how='inner')
    .drop(columns=unused_activity_columns)
)

In [37]:
# 'Date' holds datetime.date objects at this point (it was truncated with
# .dt.date earlier), not '%m/%d/%Y %I:%M:%S %p' strings, so no strptime format
# applies. Passing that format only worked on pandas versions that bypass it
# for date objects.
merged_df['Date'] = pd.to_datetime(merged_df['Date'])

# Collapse to one row per user. idxmax picks the first row holding the latest
# 'Date' inside each 'Id' group, so sorting beforehand is unnecessary.
# NOTE(review): 'Date' comes from the weight log, so all rows of a user share
# the same value and the row kept is simply that user's first activity record,
# not the most recent day of activity - confirm this is the intended selection.
idx_most_recent = merged_df.groupby('Id')['Date'].idxmax()
merged_df = merged_df.loc[idx_most_recent].reset_index(drop=True)

# Weights below 45 kg are treated as implausible and replaced by the cohort mean
# (the mean is computed before the replacement and includes those values).
avg_weight = merged_df['WeightKg'].mean()
merged_df.loc[merged_df['WeightKg'] < 45, 'WeightKg'] = avg_weight

Agregamos la columna 'age', simulando datos con una distribución triangular. También calculamos un estimador del nivel de actividad que luego se utiliza para el cálculo de los nutrientes necesarios. Para calcular la edad se utiliza una distribución triangular porque los valores centrales del rango 20-85 deberían tener más probabilidad de aparecer en el listado. Los datos del rango de edad de los usuarios de esta pulsera fueron consultados en distintas fuentes de internet.

In [38]:
# Simulated age: triangular distribution on [20, 85] with its mode at 45.
# The draws are floats; casting to int truncates them to whole years.
num_samples = len(merged_df)
ages = np.random.triangular(20, 45, 85, size=num_samples)
merged_df['age'] = pd.Series(ages, index=merged_df.index).astype(int)

# Total active minutes / distance = very + fairly (moderately) + lightly active.
merged_df['ActiveMinutes'] = (
    merged_df['VeryActiveMinutes']
    .add(merged_df['FairlyActiveMinutes'])
    .add(merged_df['LightlyActiveMinutes'])
)
merged_df['ActiveDistance'] = (
    merged_df['VeryActiveDistance']
    .add(merged_df['ModeratelyActiveDistance'])
    .add(merged_df['LightActiveDistance'])
)

In [39]:
# Cohort-wide means of the combined activity columns. calculate_status_activity
# and calculate_status_distance compare each user against these baselines.
# NOTE(review): despite "very_active" in the names, 'ActiveMinutes' and
# 'ActiveDistance' are the very + fairly/moderately + lightly active totals.
average_very_active_minutes = merged_df['ActiveMinutes'].mean()
average_very_active_distance = merged_df['ActiveDistance'].mean()
def calculate_status_activity(row):
    """Grade a user's active minutes against the cohort average.

    Reads row['ActiveMinutes'] and the module-level
    average_very_active_minutes (avg) and returns:

    * 1     when minutes < avg - 10%
    * 1.25  when minutes lie within avg +/- 10% (bounds included)
    * 1.5   when minutes are above avg + 10% and at most avg + 50%
    * 1.75  otherwise
    """
    baseline = average_very_active_minutes
    minutes = row['ActiveMinutes']
    lower = baseline - 0.1 * baseline
    upper = baseline + 0.1 * baseline
    ceiling = baseline + 0.5 * baseline

    if minutes < lower:
        return 1
    if minutes <= upper:
        return 1.25
    if minutes <= ceiling:
        return 1.5
    return 1.75

In [40]:
def calculate_status_distance(row):
    """Grade a user's active distance against the cohort average.

    Reads row['ActiveDistance'] and the module-level
    average_very_active_distance (avg) and returns:

    * 1     when distance < avg - 10%
    * 1.25  when distance lies within avg +/- 10% (bounds included)
    * 1.5   when distance is above avg + 10% and at most avg + 50%
    * 1.75  otherwise
    """
    baseline = average_very_active_distance
    distance = row['ActiveDistance']
    lower = baseline - 0.1 * baseline
    upper = baseline + 0.1 * baseline
    ceiling = baseline + 0.5 * baseline

    if distance < lower:
        return 1
    if distance <= upper:
        return 1.25
    if distance <= ceiling:
        return 1.5
    return 1.75

In [41]:
# Per-user activity grades (row-wise) and their plain average.
merged_df['status_activity'] = merged_df.apply(calculate_status_activity, axis=1)
merged_df['status_distance'] = merged_df.apply(calculate_status_distance, axis=1)
merged_df['avg_activity'] = merged_df['status_activity'].add(merged_df['status_distance']).div(2)

# Resting energy estimate: linear in weight, height and age, rounded to 2 decimals.
resting_energy = (
    66.5
    + (13.7 * merged_df['WeightKg'])
    + (5.0 * merged_df['Height'])
    - (6.8 * merged_df['age'])
)
merged_df['eerest'] = resting_energy.round(2)

In [42]:
def calculate_EErest(row):
    """Scale row['eerest'] by a surcharge chosen from row['avg_activity'].

    * avg_activity == 1            -> +20 %
    * 1 < avg_activity < 1.25      -> +30 %
    * avg_activity >= 1.25         -> +50 %
    * anything else (below 1, NaN) -> returned unchanged
    """
    level = row['avg_activity']
    base = row['eerest']

    if level == 1:
        return base + base * 0.2
    if 1 < level < 1.25:
        return base + base * 0.3
    if level >= 1.25:
        return base + base * 0.5
    return base

In [44]:
def calculate_liquid(row):
    """Return row['WeightKg'] times an age-dependent factor.

    Factor by row['age']: 40 up to 30, 35 up to 55, 30 up to 75, and 25 for
    anything else (older ages, or an age that fails every comparison).
    """
    age = row['age']
    # (inclusive upper age bound, factor), checked from youngest to oldest.
    age_bands = ((30, 40), (55, 35), (75, 30))
    for upper_age, factor in age_bands:
        if age <= upper_age:
            return factor * row['WeightKg']
    return 25 * row['WeightKg']

In [45]:
# Overwrite 'eerest' with its activity-adjusted value and add the liquid
# estimate, both rounded to 2 decimals.
# NOTE(review): 'eerest' is replaced in place, so running this cell a second
# time applies the activity surcharge again.
adjusted_energy = merged_df.apply(calculate_EErest, axis=1)
merged_df['eerest'] = adjusted_energy.round(2)
liquid_estimate = merged_df.apply(calculate_liquid, axis=1)
merged_df['liquid'] = liquid_estimate.round(2)
print(merged_df.head())

           Id       Date    WeightKg    BMI  Height  TotalSteps  \
0  1503960366 2016-05-03   52.599443  22.65  152.39       13162   
1  1624580081 2016-05-12   61.619125  24.34  159.11        8163   
2  1644430081 2016-05-12   77.056403  25.95  172.32       10694   
3  1844505072 2016-05-12   83.720222  30.12  166.72        6697   
4  1927972279 2016-04-13  133.506850  47.54  167.58         678   

   TotalDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0           8.50                1.88                      0.55   
1           5.31                0.00                      0.00   
2           7.77                0.14                      2.30   
3           4.43                0.00                      0.00   
4           0.47                0.00                      0.00   

   LightActiveDistance  ...  SedentaryMinutes  Calories  age  ActiveMinutes  \
0                 6.06  ...               728      1985   38            366   
1                 5.31  ...              1

Tratamiento de los datos relacionados con los nutrientes

In [46]:
# Disabled export of the per-user table:
#merged_df.to_csv(f"{data_folder}/final_usr_dataset.csv", index=False)

# Load the vegetables nutrition table and preview its first rows.
vegetables_csv = f"{data_folder_nutrition}/vegetables_dataset.csv"
df_vegetables = pd.read_csv(vegetables_csv)
df_vegetables.head()

Unnamed: 0,name,energy (kcal/kJ),water (g),protein (g),total fat (g),carbohydrates (g),fiber (g),sugars (g),calcium (mg),iron (mg),...,potassium (mg),sodium (g),vitamin A (IU),vitamin C (mg),vitamin B1 (mg),vitamin B2 (mg),viatmin B3 (mg),vitamin B5 (mg),vitamin B6 (mg),vitamin E (mg)
0,Apple nutrition facts,48/200,86.7,0.27,0.13,12.7,1.3,10.1,5,0.07,...,90,0,38,4.0,19.0,28.0,91.0,71.0,37,0.05
1,Apricot nutrition facts,48/201,86.4,1.4,0.39,11.12,2.0,9.24,13,0.39,...,259,1,1926,10.0,0.03,0.04,0.6,0.24,54,0.89
2,"Artichokes, cooked",53/220,84.08,2.89,0.34,11.95,8.6,0.99,21,0.61,...,286,60,13,7.4,0.05,89.0,1.11,0.24,81,0.19
3,"Asparagus, cooked",22/94,92.63,2.4,0.22,4.11,2.0,1.3,23,0.91,...,224,14,1006,7.7,162.0,139.0,1084.0,225.0,79,1.5
4,Avocado nutrition,160/670,73.23,2.0,14.7,8.53,6.7,0.66,12,0.55,...,485,7,146,10.0,67.0,0.13,1738.0,1389.0,257,2.07


In [47]:
# Conversion constants (calories contributed by one gram of each macronutrient).
calories_per_gram_fat = 9
calories_per_gram_protein = 4
calories_per_gram_carbohydrates = 4
conversion_factor = 1000

# Derive new columns so that all macronutrients share one unit of measure.
df_vegetables['Fats'] = df_vegetables['total fat (g)'] * calories_per_gram_fat
df_vegetables['Proteins'] = df_vegetables['protein (g)'] * calories_per_gram_protein
# 'energy (kcal/kJ)' packs both values in one string such as "48/200".
energy_parts = df_vegetables['energy (kcal/kJ)'].str.split('/', expand=True)
df_vegetables[['energy_kcal', 'energy_kJ']] = energy_parts
df_vegetables['Calories'] = pd.to_numeric(df_vegetables['energy_kcal'])
df_vegetables['Carbohydrates'] = df_vegetables['carbohydrates (g)'] * calories_per_gram_carbohydrates

# Short column names for the nutrients that are kept as-is.
df_vegetables = df_vegetables.rename(columns={
    'fiber (g)': 'Fiber',
    'vitamin C (mg)': 'Vitamin C',
    'sugars (g)': 'Sugar',
    'calcium (mg)': 'Calcium',
})
columns_names = df_vegetables.columns.values

# Drop every column that is not part of the final schema, then tag the rows.
columns_keep = ['name','Fiber','Vitamin C', 'Fats', 'Sugar','Calcium','Proteins', 'Calories', 'Carbohydrates']
columns_to_delete = df_vegetables.columns.difference(columns_keep)
df_vegetables = df_vegetables.drop(columns=columns_to_delete)
df_vegetables['Category'] = 'vegetables'

In [48]:
# Load the meals table from the same nutrition folder as the vegetables table;
# the folder comes from data_folder_nutrition instead of a second hard-coded copy.
meals_dataset = pd.read_csv(f"{data_folder_nutrition}/meals_dataset.csv")

In [49]:
# Target schema shared by the meals and vegetables tables.
new_column_names = ['name','Fiber','Vitamin C','Fats','Proteins','Calories','Sugar','Calcium','Carbohydrates','Category']

# Rename the meals columns positionally: the i-th existing column receives the
# i-th target name (zip stops at the shorter of the two sequences).
positional_mapping = dict(zip(meals_dataset.columns, new_column_names))
meals_dataset = meals_dataset.rename(columns=positional_mapping)

# Give the vegetables table the same column order as the meals table.
new_column_order = meals_dataset.columns
df_vegetables = df_vegetables.reindex(columns=new_column_order)

In [None]:
# Disabled export step: when uncommented, these lines write the vegetables table
# to CSV, stack it on top of meals_dataset and write the combined table as well.
#df_vegetables.to_csv("final_vegetables.csv", index=False)
#concatenated_df = pd.concat([df_vegetables, meals_dataset], axis=0)
#concatenated_df.to_csv("complete_nutrition_dataset.csv", index=False)

* El dataset de alimentos fue ampliado para incorporar alimentos más variados
* La conversión entre unidades de medida fue realizada siguiendo fuentes de internet
* Cálculos sobre BMI realizados conforme a fuentes de internet y sitios web especializados
* La estimación del nivel de actividad de una persona es un ajuste de este proyecto y sin verificación científica.