In [None]:
# --- Celda 1: Importación de librerías ---
import io
from datetime import datetime

import boto3
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# --- Cell 2: S3 connection settings and data load ---
# Finds the first key ending in '.pkl' under `ruta` in `bucket_name` and loads
# it into `df`. Raises ValueError when no such key exists.
bucket_name = 'tu-bucket-s3'
ruta = 'ruta/al/archivo/limpio'

# S3 client
s3_client = boto3.client('s3')

# List every key under the prefix. A single list_objects_v2 call returns at
# most 1000 keys, so paginate to avoid missing a .pkl that sits on a later page.
paginator = s3_client.get_paginator('list_objects_v2')
files = []
for response in paginator.paginate(Bucket=bucket_name, Prefix=ruta):
    files.extend(content['Key'] for content in response.get('Contents', []))

# Pick the first .pkl key (listing order), or None when there is none
pkl_file = next((file for file in files if file.endswith('.pkl')), None)

if not pkl_file:
    raise ValueError("No se encontró un archivo .pkl en la ruta especificada.")

# Download the object and unpickle it. The streaming body is read fully into
# an in-memory, seekable buffer before handing it to pandas.
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load objects from a bucket you trust.
obj = s3_client.get_object(Bucket=bucket_name, Key=pkl_file)
df = pd.read_pickle(io.BytesIO(obj['Body'].read()))

In [None]:
# --- Cell 3: Drop specific observations ---
# Rows whose `results` value is one of these labels are removed.
exclude_results = ['Business Not Located', 'No Entry', 'Out of Business']
# Take an explicit copy: later cells assign new columns to `df`, and doing so
# on a filtered slice triggers pandas' SettingWithCopyWarning.
df = df.loc[~df['results'].isin(exclude_results)].copy()

In [None]:
# --- Cell 4: Collapse inspection results into pass / fail ---
# 'Pass' and 'Pass w/ Conditions' (plus an already-normalised 'pass') are
# treated as passing; every other value, including missing ones, is 'fail'.
passing_labels = ['Pass', 'Pass w/ Conditions', 'pass']
passed = df['results'].isin(passing_labels)
df['results'] = passed.map({True: 'pass', False: 'fail'})

In [None]:
# --- Cell 5: Normalise the risk variable ---
# Exact-match relabelling; values not listed in the mapping are left untouched.
risk_labels = {
    'Risk 1 (High)': 'high',
    'Risk 2 (Medium)': 'medium',
    'Risk 3 (Low)': 'low',
    'All': 'all',
}
df['risk'] = df['risk'].replace(to_replace=risk_labels)

In [None]:
# --- Cell 6: Normalise the facility_type variable ---
# Each key is a regular expression; a value matching it is replaced by the
# corresponding canonical label. Matching is case-sensitive.
facility_patterns = {
    '.*Daycare.*': 'daycare',
    '.*Restaurant.*': 'restaurant',
    '.*Mobile Food.*': 'mobile food',
}
df['facility_type'] = df['facility_type'].replace(
    to_replace=facility_patterns,
    regex=True,
)

In [None]:
# Relabel unpopular facility_type values as "other"
# `top_20` holds the 20 most frequent facility types by row count.
top_20 = df['facility_type'].value_counts().nlargest(20).index


def _keep_if_popular(label):
    """Return `label` when it is one of the top-20 types, otherwise 'other'."""
    if label in top_20:
        return label
    return 'other'


df['facility_type'] = df['facility_type'].apply(_keep_if_popular)

In [None]:
# --- Cell 7: Derive additional date features ---
# Parse the inspection date once and reuse its datetime accessor for every
# calendar component.
inspected_on = pd.to_datetime(df['inspection_date'])
df['inspection_date'] = inspected_on

calendar = inspected_on.dt
df['month'] = calendar.month
df['year'] = calendar.year
df['day_of_month'] = calendar.day
df['week_of_year'] = calendar.isocalendar().week  # ISO week number
df['day_of_week'] = calendar.dayofweek  # Monday=0 ... Sunday=6

# Binary flags: days 0-4 are weekdays, days 5-6 are the weekend.
weekday_number = df['day_of_week']
df['week_day'] = (weekday_number < 5).astype('int64')
df['weekend'] = (weekday_number >= 5).astype('int64')

In [None]:
# --- Cell 8: Build the design matrix ---
# One-hot encodes `facility_type` and `risk`; every other selected column is
# passed through unchanged. The result is wrapped in a DataFrame whose column
# names come from the fitted transformer.
# NOTE(review): 'results' looks like the target label and is kept in the
# matrix as a string passthrough column - confirm this is intended downstream.
features = ['facility_type', 'risk', 'latitude', 'longitude', 'results',
            'month', 'year', 'day_of_month', 'week_of_year', 'week_day',
            'weekend', 'day_of_week']

df_features = df[features]
column_transformer = ColumnTransformer(
    transformers=[
        ('facility_type', OneHotEncoder(), ['facility_type']),
        ('risk', OneHotEncoder(), ['risk']),
    ],
    remainder='passthrough',
    # Always return a dense array. OneHotEncoder emits sparse blocks, and with
    # the default threshold the stacked output may be sparse depending on data
    # density; a sparse result cannot hold the string 'results' column and
    # cannot be wrapped by pd.DataFrame(..., columns=...) below.
    sparse_threshold=0,
)

feature_matrix = column_transformer.fit_transform(df_features)

# Convert to a DataFrame with the transformer's output column names
feature_matrix_df = pd.DataFrame(
    feature_matrix,
    columns=column_transformer.get_feature_names_out(),
)

In [None]:
# --- Cell 9: Save the design matrix to S3 ---
# The object key embeds today's date (local time) as YYYY-MM-DD.
today_date = datetime.today().strftime('%Y-%m-%d')
file_name = f"feature-matrix/feature-matrix.{today_date}.pkl"

# Stage the pickle on local disk first
temp_path = '/tmp/temp_feature_matrix.pkl'
with open(temp_path, 'wb') as temp_file:
    feature_matrix_df.to_pickle(temp_file)

# Upload only after the `with` block has closed (and therefore flushed) the
# file; uploading while the handle is still open can send a truncated or
# empty object.
s3_client.upload_file(temp_path, bucket_name, file_name)

print("Matriz de diseño creada y guardada exitosamente.")