# PROYECTO 1

**Curso:** Operaciones de Aprendizaje de Máquina

**Estudiantes:**
- Juan José García
- Ruben Dario Hoyos
- José Rafael Peña

## Setup

### Importación de librerías

In [None]:
# General modules
from pathlib import Path
import os
import requests
from typing import List
from dataclasses import dataclass
import pandas as pd

In [None]:
# Sklearn modules
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [None]:
# Tensorflow module
import tensorflow as tf

# TFX components
from tfx.components import CsvExampleGen
from tfx.components import ExampleValidator
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Transform
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from google.protobuf.json_format import MessageToDict

# TFDV modules
import tensorflow_data_validation as tfdv
from tensorflow_metadata.proto.v0 import schema_pb2

### Definición de carpetas

In [None]:
# Raw data: the folder and the training CSV stored inside it
data_root = Path('./data/covertype')
data_filepath = data_root / 'covertype_train.csv'

# Folder for the preprocessed data files
data_root_prepro = Path('./data/covertype_prepro')

# Folder for the pipeline metadata store
pipeline_root = Path('./pipeline/')

# Create every working directory up front; existing ones are left untouched
for _directory in (data_root, data_root_prepro, pipeline_root):
    _directory.mkdir(parents=True, exist_ok=True)

### Carga de datos

In [None]:
# Download data if it doesn't exist
if not data_filepath.is_file():
    # URL for the dataset
    # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'

    # The whole body is read at once below, so streaming is not requested;
    # the timeout keeps the cell from hanging on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=60)

    # Fail loudly on an HTTP error. Otherwise the error page would be saved
    # as the CSV and, because the file then exists, never downloaded again.
    r.raise_for_status()

    data_filepath.write_bytes(r.content)

## Pasos proyecto

### **2.1** Carga el dataset

In [None]:
# Load the raw training CSV into a DataFrame
df = pd.read_csv(data_filepath)

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

### **3** Selección de características

In [None]:
@dataclass
class DataConfig:
    """Settings shared by the feature-selection cells."""

    # Name of the label column
    target_col: str
    # Columns dropped before feature selection because they are not numeric
    non_numeric_cols: List[str]
    # Destination CSV for the preprocessed dataset
    final_df_path: Path

# Build the configuration used by the feature-selection cells
config = DataConfig(
    target_col="Cover_Type",
    # Columns stored with the `object` dtype are treated as non-numeric
    non_numeric_cols=df.select_dtypes(include='object').columns.tolist(),
    final_df_path=data_root_prepro / "covertype_preprocessed.csv",
)

La ejecución de la siguiente celda se omite mediante el comando `%%script false --no-raise-error`, ya que contiene la normalización de los datos, un proceso que ya se realizó previamente según lo indicado en el documento:  

> **"Recuerde que, primero, debe preparar las características de entrada y de destino:"**  

Sin embargo, más adelante en el documento se asume que los datos conservan sus valores originales, por lo que la normalización se aplica posteriormente utilizando las herramientas de TFX.

In [None]:
%%script false --no-raise-error
# The cell magic on the first line makes Jupyter skip this cell; it is kept for reference only.

# Drop non-numeric columns
df_1 = df.drop(columns=config.non_numeric_cols)

# Separate features and label
X = df_1.drop(columns=[config.target_col])
y = df_1[config.target_col].astype('category')

# Standardize the features, keeping the original column names
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Score the features with f_classif and select the 8 best columns.
# Fit on the scaled matrix so the scaling step above is actually used.
selector = SelectKBest(score_func=f_classif, k=8)
selector.fit(X_scaled, y)

# Create and display a df listing each column and whether it is retained
selected_columns_df = pd.DataFrame({
    'Column': X_scaled.columns,
    'Retain': selector.get_support()
})
selected_columns_df

In [None]:
# Keep only the numeric columns
df_1 = df.drop(columns=config.non_numeric_cols)

# Split into categorical label and feature matrix
y = df_1[config.target_col].astype('category')
X = df_1.drop(columns=[config.target_col])

# Score the features with f_classif and keep the 8 best ones
selector = SelectKBest(score_func=f_classif, k=8).fit(X, y)

# Narrow X down to the retained columns (boolean mask over the columns)
X_selected = X.loc[:, selector.get_support()]

# Summary table: one row per column and whether it was retained
selected_columns_df = pd.DataFrame({
    'Column': X.columns,
    'Retain': selector.get_support(),
})
selected_columns_df

In [None]:
# Rebuild the dataset: a copy of the selected features plus the target column
final_df = X_selected.assign(**{config.target_col: y.values})

# Write the preprocessed dataset to CSV without the index column
final_df.to_csv(config.final_df_path, index=False)

### **4.1** Configurar el contexto interactivo

In [None]:
# Create the interactive TFX context, using the pipeline directory as its root
context = InteractiveContext(pipeline_root=str(pipeline_root))

### **4.2** Generando ejemplos

In [None]:
# Instantiate ExampleGen, pointing it at the directory that holds the
# preprocessed CSV written by the feature-selection step
example_gen = CsvExampleGen(input_base=str(data_root_prepro))

# Execute the component
context.run(example_gen)

print("CsvExampleGen ok")

### **4.3** Estadísticas

In [None]:
# Take the first 'examples' artifact produced by ExampleGen
artifact = example_gen.outputs['examples'].get()[0]

# Report its split names and storage location, one per line
print(
    f'split names: {artifact.split_names}',
    f'artifact uri: {artifact.uri}',
    sep='\n',
)

In [None]:
# Build StatisticsGen on top of the examples ingested by ExampleGen
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

# Run the component inside the interactive context
context.run(statistics_gen)

print('StatisticsGen OK')

In [None]:
# Render the statistics produced by StatisticsGen
context.show(statistics_gen.outputs['statistics'])

### **4.4** Inferir Esquema

In [None]:
# Build SchemaGen on top of the statistics produced by StatisticsGen
schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])

# Run the component inside the interactive context
context.run(schema_gen)

print('SchemaGen OK')

In [None]:
# Render the schema produced by SchemaGen
context.show(schema_gen.outputs['schema'])

### **4.5** Curando Esquema

In [None]:
# Locate the schema file inside the SchemaGen output artifact
schema_path = f"{schema_gen.outputs['schema'].get()[0].uri}/schema.pbtxt"

# Parse it into a tensorflow_metadata.proto.v0.schema_pb2 message
schema = tfdv.load_schema_text(schema_path)
type(schema)

In [None]:
# Restrict the value ranges of Hillshade_9am, Hillshade_Noon and Slope
tfdv.set_domain(schema, 'Hillshade_9am', schema_pb2.IntDomain(min=0, max=255))
tfdv.set_domain(schema, 'Hillshade_Noon', schema_pb2.IntDomain(min=0, max=255))
tfdv.set_domain(schema, 'Slope', schema_pb2.IntDomain(min=0, max=99))

# Cover_Type is written to the preprocessed CSV from a numeric DataFrame
# column, so the inferred feature is expected to be numeric and a StringDomain
# would not match its type. Describe the seven classes 0..6 as a categorical
# integer domain instead.
tfdv.set_domain(
    schema,
    'Cover_Type',
    schema_pb2.IntDomain(min=0, max=6, is_categorical=True),
)

En los siguientes bloques se muestra que el esquema ha cambiado solo en memoria. Sin embargo, aún es necesario guardarlo en la metadata de SchemaGen para que los cambios sean persistentes y reconocidos por el pipeline.

In [None]:
# Display the in-memory schema, which now carries the domains
tfdv.display_schema(schema)

In [None]:
# Display the schema stored by SchemaGen, which still has no domains
context.show(schema_gen.outputs['schema'])

In [None]:
# Overwrite the SchemaGen schema file with the in-memory schema so the
# domain changes are persisted
tfdv.write_schema_text(schema, schema_path)

# Display the schema through the SchemaGen output again to see the update
context.show(schema_gen.outputs['schema'])