In [152]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset and model files under
# that path can be read by the cells below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [153]:
import pandas as pd
import numpy as np
import joblib

In [154]:
# Project folder on the mounted drive; every other path is appended to it.
ROOT = '/content/drive/Othercomputers/Mi MacBook Pro/Google Drive/tp2/'
# Location of the delivery dataset, relative to ROOT.
dataSetPath = './data/smoking_prediction_entrega.csv'

# Read the dataset and keep an untouched copy (taken before any column is
# removed or added) so the predictions can later be attached to the raw rows.
df = pd.read_csv(f'{ROOT}{dataSetPath}')
df_copy = df.copy(deep=True)

In [155]:
# Set the ID column aside and take it out of the feature frame; it is only
# used again to label the predictions.
ids = df['ID']
del df['ID']

# **Feature engineering**

Categorizamos por grupo de edades

In [156]:
def categorize_age(age):
  """Return the age-bracket label for a numeric age.

  The brackets are checked in ascending order and each one is exclusive on
  its upper bound: below 30 -> "18-29", below 40 -> "30-39", below 50 ->
  "40-49", below 60 -> "50-59". Any value that is not below 60 (including
  one for which every comparison is false, such as NaN) maps to "60+".
  """
  upper_bounds = ((30, "18-29"), (40, "30-39"), (50, "40-49"), (60, "50-59"))
  for limit, label in upper_bounds:
    if age < limit:
      return label
  return "60+"

# Swap the raw age for its bracket label: pop() removes 'age' from the frame,
# and the derived 'age_group' column is appended as the last column.
df["age_group"] = df.pop("age").apply(categorize_age)

# **Índices Antropométricos Derivados**

In [157]:
# Height expressed in metres (the source column is 'height(cm)').
height_m = df['height(cm)'] / 100

# Body mass index: 'weight(kg)' divided by the squared height in metres.
df['BMI'] = df['weight(kg)'] / height_m ** 2

# Waist-to-height ratio, computed from the two centimetre columns.
df['WHtR'] = df['waist(cm)'].div(df['height(cm)'])

# **Índices Cardiovasculares**

In [158]:
# Column labelled 'pulse pressure'. As written, the value is the arithmetic
# mean of the 'systolic' and 'relaxation' columns.
# NOTE(review): clinical pulse pressure is usually defined as systolic minus
# diastolic, which is not what this computes. The code is left untouched
# because the loaded pipeline presumably was trained on this exact formula --
# confirm against the training notebook before changing it.
df['pulse pressure'] = (df['systolic'] + df['relaxation']) / 2

# Column labelled 'mean arterial pressure': 'relaxation' plus one third of the
# 'pulse pressure' column computed above (so it inherits that column's
# definition -- see the review note above).
df['mean arterial pressure'] = df['relaxation'] + (df['pulse pressure'] / 3)

# **Perfil Lipídico Avanzado**

In [159]:
# HDL is the denominator (or subtrahend) of every lipid feature below.
hdl = df['HDL']

# Atherogenic index (Castelli I): total cholesterol over HDL.
df['atherogenic index'] = df['Cholesterol'] / hdl

# Triglyceride-to-HDL ratio (used by the author as an insulin-resistance proxy).
df['TG HDL ratio'] = df['triglyceride'] / hdl

# Non-HDL cholesterol: total cholesterol minus HDL.
df['no-HDL'] = df['Cholesterol'] - hdl

# LDL-to-HDL ratio.
df['LDL-HDL ratio'] = df['LDL'] / hdl

# **Salud Sensorial**

In [160]:
# Each averaged feature and the left/right pair of columns it is built from.
sensory_pairs = {
    'eyesight avg': ('eyesight(left)', 'eyesight(right)'),
    'hearing avg': ('hearing(left)', 'hearing(right)'),
}

# Average of the left and right measurements (visual acuity, then hearing).
for avg_col, (left_col, right_col) in sensory_pairs.items():
    df[avg_col] = (df[left_col] + df[right_col]) / 2

# The per-side columns are no longer needed once the averages exist.
df.drop(
    columns=[side_col for pair in sensory_pairs.values() for side_col in pair],
    inplace=True,
)

# **Marcadores Metabólicos**

In [161]:
# AST-to-ALT ratio (used by the author as a liver-damage marker).
df['AST-ALT ratio'] = df['AST'].div(df['ALT'])

# **Salud Oral**

In [162]:
# "Severe periodontal disease" flag: 1 when the row has dental caries == 1
# and tartar == 'Y', 0 otherwise.
has_caries = df['dental caries'] == 1
has_tartar = df['tartar'] == 'Y'
df['severe_periodontal'] = np.where(has_caries & has_tartar, 1, 0)

# The two source columns are replaced by the combined flag.
df.drop(columns=['dental caries', 'tartar'], inplace=True)

Eliminamos la variable `oral`

In [163]:
# Columns removed from the feature frame without being used.
drop_columns = ['oral']
df = df.drop(columns=drop_columns)

# **Preparación de los datos**

Convertimos los valores de la variable `gender` a códigos numéricos

In [164]:
# Replace each gender label with its integer category code.
# NOTE(review): the codes are derived from the labels present in this file,
# so they only match the training encoding if the same set of labels occurs
# here -- confirm against the training notebook.
df['gender'] = df['gender'].astype('category').cat.codes

Generamos dummies para la variable age_group

In [165]:
# One-hot encode age_group against the fixed list of labels that
# categorize_age can emit, instead of against whichever labels happen to occur
# in this particular file. Without the fixed categories, a file with no rows in
# some bracket would yield fewer dummy columns, and the positional feature
# matrix handed to the pipeline would be narrower / shifted.
# The list is in the same (sorted) order get_dummies would produce on its own,
# so the output is unchanged whenever all five brackets are present.
age_group_levels = ['18-29', '30-39', '40-49', '50-59', '60+']
df['age_group'] = pd.Categorical(df['age_group'], categories=age_group_levels)
df = pd.get_dummies(df, columns=['age_group'])

In [166]:
# Feature matrix as a plain array; columns are passed to the model by
# position, so the column order of df matters from here on.
X = df.to_numpy()

In [167]:
# Load the previously saved pipeline from the project's models folder.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
pipe = joblib.load(f'{ROOT}models/pipeline.joblib')

In [168]:
# Run the loaded pipeline on the feature matrix; one prediction per row of X.
prediction = pipe.predict(X)

In [169]:
# Two-column table pairing every ID with its prediction (indexed like ids).
prediction_df = ids.to_frame(name='ID').assign(Prediction=prediction)

In [170]:
# Attach the predictions to the untouched copy of the original dataset.
df_copy = df_copy.assign(Prediction=prediction)

In [171]:
# Show the ID / Prediction table as the cell output.
prediction_df

Unnamed: 0,ID,Prediction
0,27358,0
1,27364,1
2,27368,1
3,27378,1
4,27381,1
...,...,...
5687,55676,0
5688,55681,0
5689,55683,0
5690,55684,0


In [172]:
# All three output files go to the same predictions folder under ROOT.
predictions_dir = ROOT + './predictions/'

# Original dataset with the added Prediction column.
# NOTE(review): to_csv is called without index=..., so pandas' default for
# writing the row index applies to both CSV files -- confirm that is wanted.
df_copy.to_csv(predictions_dir + 'smoking_prediction_resolved.csv')

# ID / Prediction table as CSV.
prediction_df.to_csv(predictions_dir + 'predictions.csv')

# Same table cast to integers and written as plain text (both columns).
np.savetxt(predictions_dir + 'predictions.txt', prediction_df.astype(int), fmt="%d")