# Proyecto 1 - MLops

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo


### Cargar el dataset Covertype

In [None]:
# Download the Covertype dataset (UCI repository id 31) through ucimlrepo.
covertype = fetch_ucirepo(id=31)

# Pull out the feature matrix (X) and the target labels (y).
X, y = covertype.data.features, covertype.data.targets

# Join features and labels column-wise into a single DataFrame.
df = pd.concat((X, y), axis="columns")

### Verificar la forma del Dataset

In [None]:
# Report the (rows, columns) shape of the DataFrame, then preview its first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

## Preprocesamiento

### Definición de listas de tipos de suelo y áreas silvestres

In [None]:
# Soil catalogue, one row per soil type: (ID, soil code, soil type description).
_SOIL_ROWS = (
    ("1", "C2702", "Cathedral family - Rock outcrop complex, extremely stony."),
    ("2", "C2703", "Vanet - Ratake families complex, very stony."),
    ("3", "C2704", "Haploborolis - Rock outcrop complex, rubbly."),
    ("4", "C2705", "Ratake family - Rock outcrop complex, rubbly."),
    ("5", "C2706", "Vanet family - Rock outcrop complex complex, rubbly."),
    ("6", "C2717", "Vanet - Wetmore families - Rock outcrop complex, stony."),
    ("7", "C3501", "Gothic family."),
    ("8", "C3502", "Supervisor - Limber families complex."),
    ("9", "C4201", "Troutville family, very stony."),
    ("10", "C4703", "Bullwark - Catamount families - Rock outcrop complex, rubbly."),
    ("11", "C4704", "Bullwark - Catamount families - Rock land complex, rubbly."),
    ("12", "C4744", "Legault family - Rock land complex, stony."),
    ("13", "C4758", "Catamount family - Rock land - Bullwark family complex, rubbly."),
    ("14", "C5101", "Pachic Argiborolis - Aquolis complex."),
    ("15", "C5151", "unspecified in the USFS Soil and ELU Survey."),
    ("16", "C6101", "Cryaquolis - Cryoborolis complex."),
    ("17", "C6102", "Gateview family - Cryaquolis complex."),
    ("18", "C6731", "Rogert family, very stony."),
    ("19", "C7101", "Typic Cryaquolis - Borohemists complex."),
    ("20", "C7102", "Typic Cryaquepts - Typic Cryaquolls complex."),
    ("21", "C7103", "Typic Cryaquolls - Leighcan family, till substratum complex."),
    ("22", "C7201", "Leighcan family, till substratum, extremely bouldery."),
    ("23", "C7202", "Leighcan family, till substratum - Typic Cryaquolls complex."),
    ("24", "C7700", "Leighcan family, extremely stony."),
    ("25", "C7701", "Leighcan family, warm, extremely stony."),
    ("26", "C7702", "Granile - Catamount families complex, very stony."),
    ("27", "C7709", "Leighcan family, warm - Rock outcrop complex, extremely stony."),
    ("28", "C7710", "Leighcan family - Rock outcrop complex, extremely stony."),
    ("29", "C7745", "Como - Legault families complex, extremely stony."),
    ("30", "C7746", "Como family - Rock land - Legault family complex, extremely stony."),
    ("31", "C7755", "Leighcan - Catamount families complex, extremely stony."),
    ("32", "C7756", "Catamount family - Rock outcrop - Leighcan family complex, extremely stony."),
    ("33", "C7757", "Leighcan - Catamount families - Rock outcrop complex, extremely stony."),
    ("34", "C7790", "Cryorthents - Rock land complex, extremely stony."),
    ("35", "C8703", "Cryumbrepts - Rock outcrop - Cryaquepts complex."),
    ("36", "C8707", "Bross family - Rock land - Cryumbrepts complex, extremely stony."),
    ("37", "C8708", "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony."),
    ("38", "C8771", "Leighcan - Moran families - Cryaquolls complex, extremely stony."),
    ("39", "C8772", "Moran family - Cryorthents - Leighcan family complex, extremely stony."),
    ("40", "C8776", "Moran family - Cryorthents - Rock land complex, extremely stony."),
)
# Flat layout [ID, code, description, ID, code, description, ...]:
# a stride-3 slice such as soil_type[1::3] yields one field for every soil type.
soil_type = [field for row in _SOIL_ROWS for field in row]

# Wilderness areas, one row per area: (abbreviation, full name).
_WILDERNESS_ROWS = (
    ("Rawah", "Rawah Wilderness Area"),
    ("Neota", "Neota Wilderness Area"),
    ("Commanche", "Comanche Peak Wilderness Area"),
    ("Cache", "Cache la Poudre Wilderness Area"),
)
# Flat layout [abbreviation, full name, abbreviation, full name, ...].
wilderness_area = [field for row in _WILDERNESS_ROWS for field in row]

### Decodificación de los valores one-hot a dominios categóricos

In [None]:
# The soil one-hot flags are read from columns 14 through 53 (40 columns).
# Each row is decoded to the soil code at the position of its first non-zero flag.
soil_codes = soil_type[1::3]  # every third entry starting at 1: the soil codes
soil_flags = df.iloc[:, 14:54]
soil = soil_flags.apply(lambda row: soil_codes[row.to_numpy().nonzero()[0][0]], axis=1)
soil

0         C7745
1         C7745
2         C4744
3         C7746
4         C7745
          ...  
581007    C2703
581008    C2703
581009    C2703
581010    C2703
581011    C2703
Length: 581012, dtype: object

In [None]:
# The four wilderness-area one-hot flags occupy columns 10 through 13, so the
# slice has to be 10:14 (the end of an iloc slice is exclusive). The earlier
# 10:13 slice dropped the fourth flag, so every row belonging to the "Cache"
# area decoded to None instead of its abbreviation.
wilderness_labels = wilderness_area[0::2]  # abbreviations only, sliced once instead of per row
wilderness_flags = df.iloc[:, 10:14]


def _decode_wilderness(row):
    """Return the abbreviation of the first active flag in `row`, or None when no flag is set."""
    active = row.to_numpy().nonzero()[0]
    return wilderness_labels[active[0]] if active.size > 0 else None


wilderness = wilderness_flags.apply(_decode_wilderness, axis=1)
wilderness

0             Rawah
1             Rawah
2             Rawah
3             Rawah
4             Rawah
            ...    
581007    Commanche
581008    Commanche
581009    Commanche
581010    Commanche
581011    Commanche
Length: 581012, dtype: object

### Creación del dataset con las características del tipo de suelo y áreas silvestres

In [None]:
# Column names of the decoded dataset: ten leading feature columns, the two
# decoded categorical columns, and the target column.
COLUMN_NAMES = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# Columns 0 through 9 are taken as-is; column 54 is kept as the last column.
leading_features = df.iloc[:, 0:10]
cover_type = df.iloc[:, 54]

# ignore_index=True discards the pieces' own column labels, so the names
# are assigned explicitly right after the concatenation.
df_full = pd.concat(
    [leading_features, wilderness, soil, cover_type],
    axis=1,
    ignore_index=True,
)
df_full.columns = COLUMN_NAMES
df_full

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,C2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,C2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,C2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,C2703,3


### Ajustar los rangos del tipo de cubierta

In [None]:
# Shift the cover-type labels down by one so they start at 0.
# NOTE(review): the UCI Covertype labels are documented as 1-7, which would give 0-6 here — confirm.
# The value is derived from the untouched source column (df column 54, the same
# column df_full['Cover_Type'] was built from) rather than from df_full itself,
# so re-running this cell does not subtract 1 a second time.
df_full['Cover_Type'] = df.iloc[:, 54] - 1
# Make sure the soil type is handled as a string.
df_full['Soil_Type'] = df_full['Soil_Type'].astype(str)

### Exploración de tipos de suelo

In [None]:
# Number of rows for each soil type.
df_full['Soil_Type'].value_counts()

In [None]:
# Soil code C5151 is the "unspecified" soil type, so those rows are set apart
# from the rest of the dataset.
is_unspecified_soil = df_full['Soil_Type'] == 'C5151'
df_5151 = df_full[is_unspecified_soil]
df_no_5151 = df_full[~is_unspecified_soil]

In [None]:
# Rows whose soil type is C5151 (the UNSPECIFIED soil type)
df_5151

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
241543,2078,34,10,0,0,212,219,218,134,484,Cache,C5151,5
241544,2080,13,19,30,0,192,198,197,132,499,Cache,C5151,5
241545,2076,27,24,30,5,175,201,180,105,516,Cache,C5151,5


In [None]:
# Rows whose soil type is anything other than C5151 (no UNSPECIFIED soil types)
df_no_5151

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,4
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,4
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,1
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,1
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,C2703,2
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,C2703,2
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,C2703,2
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,C2703,2


## Preparación del Conjunto de Datos para Modelado

### División Estratificada del Conjunto de Datos para Entrenamiento y Evaluación

In [None]:
# Only rows with a specified soil type (df_no_5151) are split here.
# df_small is a 100000-row sample of df_no_5151, stratified on Cover_Type;
# df_other receives the rows that did not go into df_small.
# random_state is fixed so the same sample is drawn on every run; without it
# the split changes each time the notebook is executed.
df_small, df_other = train_test_split(
    df_no_5151,
    train_size=100000,
    stratify=df_no_5151.Cover_Type,
    random_state=42,
)

In [None]:
# Split df_no_5151 into df_train (431,009 rows) and df_other (the remaining rows),
# keeping the Cover_Type proportions in both parts (stratified split).
# random_state is fixed on both splits so the partitions are identical on every
# run; without it each execution yields different train/evaluate/serving rows.
df_train, df_other = train_test_split(
    df_no_5151,
    train_size=431009,
    stratify=df_no_5151.Cover_Type,
    random_state=42,
)

# Split df_other into df_evaluate (75,000 rows) and df_serving (the remaining rows),
# again stratified on Cover_Type.
df_evaluate, df_serving = train_test_split(
    df_other,
    train_size=75000,
    stratify=df_other.Cover_Type,
    random_state=42,
)

# Drop the Cover_Type column from df_serving: this dataset is meant for an
# inference setting, where the target variable is not available.
df_serving = df_serving.drop(columns=['Cover_Type'])

# Print the dimensions of the resulting DataFrames.
print(df_train.shape)
print(df_evaluate.shape)
print(df_serving.shape)

(431009, 13)
(75000, 13)
(75000, 12)


### Simulación de Datos Faltantes en la Columna Horizontal_Distance_To_Hydrology

In [None]:
# Work on a copy of df_train with a fresh 0..n-1 index (the old index is discarded).
df_train_missing = df_train.reset_index(drop=True)

# Simulate missing data: blank out 'Horizontal_Distance_To_Hydrology' for
# index labels 0 through 8999 (.loc slices include the end label, so 9,000 rows).
missing_column = 'Horizontal_Distance_To_Hydrology'
last_missing_label = 8999
df_train_missing.loc[:last_missing_label, missing_column] = None
df_train_missing

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3212,191,10,,15,5455,220,248,161,1126,Rawah,C7745,0
1,3205,3,14,,16,4230,200,213,149,2065,Commanche,C7201,0
2,2605,74,16,,-14,1195,237,208,98,295,Rawah,C7201,1
3,2768,73,31,,42,268,238,164,34,2049,Rawah,C7746,4
4,3230,45,13,,20,2809,223,211,120,1075,Commanche,C7201,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
431004,2950,36,4,108.0,1,2037,219,230,148,306,Commanche,C7756,1
431005,2837,278,10,30.0,6,604,193,242,189,1664,Commanche,C4758,1
431006,3101,152,9,150.0,-1,1018,232,240,139,1655,Rawah,C7201,0
431007,3228,136,14,216.0,41,2797,241,234,119,997,Commanche,C7202,0


### Generación de Datos de Evaluación con Anomalías

In [None]:
# Start from the evaluation set with a fresh 0..n-1 index so label-based
# slicing below addresses the first rows.
evaluate_with_bad_slope = df_evaluate.reset_index(drop=True)

# Inject an anomalous 'Slope' value of 110 into index labels 0 through 4
# (five rows; .loc slices include the end label).
evaluate_with_bad_slope.loc[:4, 'Slope'] = 110

# Append the rows with soil type 'C5151' (UNSPECIFIED) to the evaluation data.
df_evaluate_anomalies = pd.concat((evaluate_with_bad_slope, df_5151))
df_evaluate_anomalies

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3001,96,110,534,16,5234,231,231,133,5454,Rawah,C4744,1
1,3005,139,110,175,33,2405,236,237,131,612,Rawah,C7745,0
2,2768,91,110,242,119,256,248,193,59,1890,Rawah,C7746,4
3,3153,346,110,277,-32,1328,203,227,162,902,Neota,C7700,1
4,3379,68,110,150,-12,3609,236,192,77,2658,Rawah,C8708,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74998,2958,61,24,234,99,5432,231,182,69,834,Rawah,C7746,1
74999,3159,132,14,150,17,3353,241,232,118,633,Commanche,C7201,1
241543,2078,34,10,0,0,212,219,218,134,484,Cache,C5151,5
241544,2080,13,19,30,0,192,198,197,132,499,Cache,C5151,5


In [None]:
# Number of rows for each soil type in the anomalies DataFrame.
df_evaluate_anomalies['Soil_Type'].value_counts()

C7745    14939
C7202     7512
C7756     6731
C7757     5740
C7201     4249
C4703     4167
C4744     3905
C7746     3851
C7755     3408
C7700     2838
C4758     2231
C8771     1992
C8772     1841
C4704     1598
C2705     1592
C7102     1091
C8776     1078
C2703      968
C2717      871
C2704      636
C7101      489
C6102      452
C2702      420
C6101      383
C7702      316
C8703      257
C6731      242
C7790      214
C2706      211
C4201      147
C7709      145
C7710      135
C7103      109
C5101       74
C7701       63
C8708       52
C3502       21
C3501       16
C8707       16
C5151        3
Name: Soil_Type, dtype: int64

### Lista final con descripción de los datasets de entrenamiento, validación, anomalías y faltantes

In [None]:

# Summary of the datasets produced above. These are bare expressions;
# NOTE(review): a notebook cell normally renders only its last expression, so
# presumably just df_serving is displayed here — confirm.
# Regular training dataset
df_train
# Evaluation dataset
df_evaluate
# Small training dataset (100000 rows, stratified)
df_small
# Training dataset with missing values
df_train_missing
# Evaluation dataset with anomalies
df_evaluate_anomalies
# Dataset without the Cover_Type target column, for testing inference
df_serving