# PROYECTO 1

**Curso:** Operaciones de Aprendizaje de Máquina

**Estudiantes:**
- Juan José García
- Ruben Dario Hoyos
- José Rafael Peña

## Setup

### Importación de librerías

In [None]:
from pathlib import Path
import os
import requests

import pandas as pd
from dataclasses import dataclass
from typing import List

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

### Carga de datos

In [None]:
# Directory of the raw data files
data_root = Path('./data/covertype')

# Path to the raw training data
data_filepath = data_root / 'covertype_train.csv'

# Ensure the directory exists
data_root.mkdir(parents=True, exist_ok=True)

# Download the data only when no local copy exists yet
if not data_filepath.is_file():
    # URL for the dataset
    # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    # NOTE(review): '{{VALUE}}' looks like an unfilled template placeholder for
    # the `confirm` query parameter -- confirm the server accepts it as-is.
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'

    # The whole body is read through `r.content` below, so streaming is not
    # needed. The timeout keeps the cell from hanging on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=60)

    # Fail loudly on an HTTP error instead of saving the error page as the
    # CSV, which would also make every later run skip the download.
    r.raise_for_status()

    data_filepath.write_bytes(r.content)

## Pasos del proyecto

### **2.1** Carga el dataset

In [None]:
# Read the raw training CSV at `data_filepath` into a pandas DataFrame
df = pd.read_csv(data_filepath)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

### **3** Selección de características

In [None]:
@dataclass
class DataConfig:
    """Column-level settings used when preparing the dataset.

    Attributes:
        target_col: Name of the column that holds the label.
        non_numeric_cols: Names of the columns that are not numeric.
    """
    # Name of the label column
    target_col: str
    # Names of the non-numeric columns
    non_numeric_cols: List[str]

# Columns stored with the generic `object` dtype are the non-numeric ones
_object_cols = df.select_dtypes(include=['object']).columns

# Bundle the label name and the non-numeric column names in one config object
config = DataConfig(
    non_numeric_cols=list(_object_cols),
    target_col="Cover_Type",
)

In [None]:
# Keep only the numeric columns
df_1 = df.drop(columns=config.non_numeric_cols)

# Split into the label (as a categorical series) and the feature matrix
y = df_1[config.target_col].astype('category')
X = df_1.drop(columns=[config.target_col])

# Standardize the features and wrap the result in a DataFrame that keeps
# the original column names
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Score the features with f_classif (ANOVA F-value) and keep the 8 best
selector = SelectKBest(score_func=f_classif, k=8)
X_selected = selector.fit_transform(X_scaled, y)

# Build a table with one row per feature and a flag telling whether the
# selector retained it, then display it
selected_columns_df = pd.DataFrame({'Column': X_scaled.columns}).assign(
    Retain=selector.get_support()
)
selected_columns_df

### **4.1** Configurar el contexto interactivo