# Data processing

# Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../data')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.manifold import TSNE

from ian_constants import RENAME_DICT, DROP_COLUMNS

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from category_encoders import BinaryEncoder
import optuna
from optuna.samplers import TPESampler

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Constants

In [2]:
NUM_FEATURES = [
    'ORE_TONNAGE', 'INITIAL_COST', 'COPPER_GRADE', 'GOLD_DENSITY', 
    'SILVER_DENSITY', 'PRECIOUS_ORE_DENSITY',
    'GOLD_TONNAGE',
    'SILVER_TONNAGE', 'PRECIOUS_TONNAGE', 'COPPER_TONNAGE', 
    'ECONOMIC_TONNAGE',
    'LOG_10_ORE_TONNAGE',
    'LOG_10_INITIAL_COST', 'LOG_10_COPPER_GRADE',
    'LOG_10_GOLD_DENSITY', 'LOG_10_SILVER_DENSITY',
    'LOG_10_PRECIOUS_ORE_DENSITY', 'LOG_10_GOLD_TONNAGE',
    'LOG_10_SILVER_TONNAGE', 'LOG_10_PRECIOUS_TONNAGE', 'LOG_10_COPPER_TONNAGE',
    'LOG_10_ECONOMIC_TONNAGE'
    ]

CAT_FEATURES = [
    'GLOBAL_REGION', 
    'GEOLOGIC_ORE_BODY_TYPE'
    ]

TO_LOG = [
    'ORE_TONNAGE', 'INITIAL_COST', 'COPPER_GRADE', 'GOLD_DENSITY', 
    'SILVER_DENSITY', 'PRECIOUS_ORE_DENSITY',
    'GOLD_TONNAGE',
    'SILVER_TONNAGE', 'PRECIOUS_TONNAGE', 'COPPER_TONNAGE', 
    'ECONOMIC_TONNAGE'
]

FEATURES = NUM_FEATURES + CAT_FEATURES

FILL_COLS = ['COPPER_GRADE', 'LEAD_GRADE', 'ZINC_GRADE',
       'GOLD_DENSITY', 'SILVER_DENSITY']

# Data Loading

In [13]:
raw_copper_mines_df = pd.read_excel('../data/raw/Cu_v2.xls', decimal=',', thousands='.')
raw_copper_mines_df.rename(columns=RENAME_DICT, inplace=True)

# Preprocessamento

In [14]:
raw_copper_mines_df.rename(columns=RENAME_DICT, inplace=True)

# Fill specified columns with NaN where 'y' is 0
raw_copper_mines_df.loc[raw_copper_mines_df['ORE_TONNAGE'] == 0, FILL_COLS] = np.nan

raw_copper_mines_df['ORE_TONNAGE'].replace(0, np.nan, inplace=True)
raw_copper_mines_df.drop(columns=DROP_COLUMNS, inplace=True)
raw_copper_mines_df.dropna(how='all', inplace=True)

# Feature Engineering

In [15]:
raw_copper_mines_df['LEAD_GRADE'] = raw_copper_mines_df['LEAD_GRADE']/100
raw_copper_mines_df['ZINC_GRADE'] = raw_copper_mines_df['ZINC_GRADE']/100
raw_copper_mines_df['COPPER_GRADE'] = raw_copper_mines_df['COPPER_GRADE']/100

raw_copper_mines_df['PRECIOUS_ORE_DENSITY'] = raw_copper_mines_df['GOLD_DENSITY'] + raw_copper_mines_df['SILVER_DENSITY']
raw_copper_mines_df['GOLD_TONNAGE'] = raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['GOLD_DENSITY']
raw_copper_mines_df['SILVER_TONNAGE'] = raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['SILVER_DENSITY']
raw_copper_mines_df['PRECIOUS_TONNAGE'] = raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['PRECIOUS_ORE_DENSITY']
raw_copper_mines_df['COPPER_TONNAGE'] = raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['COPPER_GRADE']
raw_copper_mines_df['ECONOMIC_TONNAGE'] = (raw_copper_mines_df['COPPER_TONNAGE'] + raw_copper_mines_df['PRECIOUS_TONNAGE'])
raw_copper_mines_df['GLOBAL_REGION'] = raw_copper_mines_df['GLOBAL_REGION'].apply(lambda x: "Africa-Europe-Middle East" if x in ["Africa", "Europe", "Middle East"] else x)
raw_copper_mines_df['GEOLOGIC_ORE_BODY_TYPE'] = raw_copper_mines_df['GEOLOGIC_ORE_BODY_TYPE'].apply(lambda x: "SKARN-SHD" if x in ["SKARN", "SHD"] else x)

for col in TO_LOG:
    raw_copper_mines_df['LOG_10_' + col] = np.log10(raw_copper_mines_df[col])
    raw_copper_mines_df['LOG_10_' + col].replace(-np.inf, -100, inplace=True)

# Separação de treino e teste

In [20]:
raw_copper_mines_df = raw_copper_mines_df.dropna(subset=['TIR'])
X = raw_copper_mines_df[FEATURES]
y = raw_copper_mines_df['TIR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=94, random_state=42)

X_train = X_train[y_train < 125]
y_train = y_train[y_train < 125]

traind_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

y_train_cat = y_train < 15
y_test_cat = y_test < 15

del y_train
del y_test

print(X_train.shape)
print(X_test.shape)

(100, 24)
(94, 24)


# Otimização

In [59]:
categorical_transformer = Pipeline(steps=[
    ('binary_encoder', BinaryEncoder())
])

def objective(trial):
    
    numeric_transformer_pca = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('imputer', KNNImputer(n_neighbors=5)),
        #('pca', PCA())  # PCA will be tuned
    ])

    NUM_COLS_DICT = {col: trial.suggest_int(col, 0, 1) for col in NUM_FEATURES}
    CAT_COLS_DICT = {col: trial.suggest_int(col, 0, 1) for col in CAT_FEATURES}

    SEL_NUM_COLS = [col for col, value in NUM_COLS_DICT.items() if value == 1]
    SEL_CAT_COLS = [col for col, value in CAT_COLS_DICT.items() if value == 1]

    #n_components = trial.suggest_int('n_components', 1, len(SEL_NUM_COLS))

    if len(SEL_NUM_COLS) == 0 and len(SEL_CAT_COLS) == 0:
        return 0

    if len(SEL_CAT_COLS) > 0 and len(SEL_NUM_COLS) > 0:
        numeric_transformer_pca.set_params(pca__n_components=n_components)

        preprocessor = ColumnTransformer(
                transformers=[
                    ('num_pca', numeric_transformer_pca, SEL_NUM_COLS),
                    ('cat', categorical_transformer, SEL_CAT_COLS)
                ]
            )
    elif len(SEL_CAT_COLS) > 0:

        preprocessor = ColumnTransformer(
                transformers=[
                    ('cat', categorical_transformer, SEL_CAT_COLS)
                ]
            )

    else:
        numeric_transformer_pca.set_params(pca__n_components=n_components)

        preprocessor = ColumnTransformer(
                transformers=[
                    ('num_pca', numeric_transformer_pca, SEL_NUM_COLS)
                ]
            )    
    
    # Suggest parameters for RandomForest
    n_estimators = trial.suggest_int('n_estimators', 1, 10)
    max_depth = trial.suggest_int('max_depth', 2, 7)
    #max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    # Define SMOTE and RandomUnderSampler
    over = SMOTE(sampling_strategy="auto")
    under = RandomUnderSampler(sampling_strategy="auto")

    # Create a pipeline with PCA and RandomForest
    model = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('over', over),
        ('under', under),
        ('classifier', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            #max_features=max_features,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
            n_jobs=-1,
            class_weight='balanced'
        ))
    ])

    # Cross-validation
    rkf= RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)
    scores = cross_val_score(model, X_train, y_train_cat, cv=rkf, scoring=make_scorer(roc_auc_score))
    return np.mean(scores), np.std(scores)

study = optuna.create_study(directions=['maximize', 'minimize'], sampler=TPESampler())
study.optimize(objective, n_trials=1000)

# Best parameters
#print("Best parameters:", study.best_params)
#print("Best ROC AUC score:", study.best_value)


[I 2024-10-11 14:15:03,694] A new study created in memory with name: no-name-76624cf2-31ae-46d0-bf68-8fda9f5ce56b


[I 2024-10-11 14:15:05,238] Trial 0 finished with values: [0.578898259161417, 0.09251812499434882] and parameters: {'ORE_TONNAGE': 0, 'INITIAL_COST': 0, 'COPPER_GRADE': 1, 'GOLD_DENSITY': 0, 'SILVER_DENSITY': 1, 'PRECIOUS_ORE_DENSITY': 1, 'GOLD_TONNAGE': 1, 'SILVER_TONNAGE': 0, 'PRECIOUS_TONNAGE': 1, 'COPPER_TONNAGE': 1, 'ECONOMIC_TONNAGE': 0, 'LOG_10_ORE_TONNAGE': 1, 'LOG_10_INITIAL_COST': 1, 'LOG_10_COPPER_GRADE': 1, 'LOG_10_GOLD_DENSITY': 0, 'LOG_10_SILVER_DENSITY': 0, 'LOG_10_PRECIOUS_ORE_DENSITY': 0, 'LOG_10_GOLD_TONNAGE': 0, 'LOG_10_SILVER_TONNAGE': 1, 'LOG_10_PRECIOUS_TONNAGE': 0, 'LOG_10_COPPER_TONNAGE': 0, 'LOG_10_ECONOMIC_TONNAGE': 1, 'GLOBAL_REGION': 0, 'GEOLOGIC_ORE_BODY_TYPE': 1, 'n_components': 1, 'n_estimators': 2, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 1}.
[I 2024-10-11 14:15:06,493] Trial 1 finished with values: [0.5366671182460656, 0.09023904851115669] and parameters: {'ORE_TONNAGE': 1, 'INITIAL_COST': 1, 'COPPER_GRADE': 1, 'GOLD_DENSITY': 1, 'SIL

In [60]:
optuna.visualization.plot_pareto_front(study, target_names=["mean ROC-AUC", "std ROC-AUC"])

In [62]:
study = study.trials_dataframe()

In [77]:
sample = study[(study['values_0'] > 0.63) & (study['values_0'] < 0.64) & (study['values_1'] < 0.06)].T
sample[6:-1][sample[6:-1]>0].dropna()

Unnamed: 0,799
params_COPPER_TONNAGE,1
params_GEOLOGIC_ORE_BODY_TYPE,1
params_GLOBAL_REGION,1
params_GOLD_DENSITY,1
params_INITIAL_COST,1
params_LOG_10_COPPER_GRADE,1
params_LOG_10_COPPER_TONNAGE,1
params_LOG_10_ECONOMIC_TONNAGE,1
params_LOG_10_GOLD_TONNAGE,1
params_LOG_10_INITIAL_COST,1


In [53]:
study_df[study_df['number'] == 453]

Unnamed: 0,number,values_0,values_1,datetime_start,datetime_complete,duration,params_COPPER_GRADE,params_COPPER_TONNAGE,params_ECONOMIC_TONNAGE,params_GEOLOGIC_ORE_BODY_TYPE,...,params_PRECIOUS_ORE_DENSITY,params_PRECIOUS_TONNAGE,params_SILVER_DENSITY,params_SILVER_TONNAGE,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_components,params_n_estimators,state
453,453,0.651159,0.060823,2024-10-11 14:04:36.508834,2024-10-11 14:04:38.768477,0 days 00:00:02.259643,0,0,0,0,...,1,0,1,0,6,5,7,2,9,COMPLETE
