# Copper Data Processing

# Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from constants import (
    RENAME_DICT, TO_LOG10, FEATURES, MIN_TIR,
    FILL_COLS, LOG_INF_REPL, UPPER_LIMIT_TIR
)

# Data Loading

In [2]:
# Read the raw copper workbook; numbers are parsed with ',' as the decimal
# mark and '.' as the thousands separator.
raw_copper_mines_df = pd.read_excel(
    '../../../data/raw/Cu_v2.xls',
    thousands='.',
    decimal=',',
)

# Preprocessing

In [3]:
# Rename the raw columns according to RENAME_DICT.
raw_copper_mines_df = raw_copper_mines_df.rename(columns=RENAME_DICT)

# An ore tonnage of 0 is treated as "missing". Blank out the FILL_COLS columns
# on those rows first: the mask must be computed while the zeros are still there.
zero_tonnage_rows = raw_copper_mines_df['ORE_TONNAGE'] == 0
raw_copper_mines_df.loc[zero_tonnage_rows, FILL_COLS] = np.nan

# Then blank out the zero tonnage itself. Assign the result back instead of
# calling `.replace(..., inplace=True)` on the column selection: with pandas
# Copy-on-Write the chained in-place call only modifies a temporary object and
# leaves the DataFrame unchanged.
raw_copper_mines_df['ORE_TONNAGE'] = raw_copper_mines_df['ORE_TONNAGE'].replace(0, np.nan)

# Discard rows in which every column is missing.
raw_copper_mines_df = raw_copper_mines_df.dropna(how='all')

# Feature Engineering

In [4]:
# Rescale the copper grade by 1/100 (presumably percent -> fraction; confirm
# against the source workbook). NOTE: re-running this cell without reloading
# the data divides the grade again.
raw_copper_mines_df['COPPER_GRADE'] = raw_copper_mines_df['COPPER_GRADE'] / 100

# Combined precious-metal density and the per-commodity tonnages derived from
# the ore tonnage.
raw_copper_mines_df['PRECIOUS_ORE_DENSITY'] = (
    raw_copper_mines_df['GOLD_DENSITY'] + raw_copper_mines_df['SILVER_DENSITY']
)
raw_copper_mines_df['GOLD_TONNAGE'] = (
    raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['GOLD_DENSITY']
)
raw_copper_mines_df['SILVER_TONNAGE'] = (
    raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['SILVER_DENSITY']
)
raw_copper_mines_df['PRECIOUS_TONNAGE'] = (
    raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['PRECIOUS_ORE_DENSITY']
)
raw_copper_mines_df['COPPER_TONNAGE'] = (
    raw_copper_mines_df['ORE_TONNAGE'] * raw_copper_mines_df['COPPER_GRADE']
)
raw_copper_mines_df['ECONOMIC_TONNAGE'] = (
    raw_copper_mines_df['COPPER_TONNAGE'] + raw_copper_mines_df['PRECIOUS_TONNAGE']
)

# Merge the "SKARN" and "SHD" ore-body types into one "SKARN-SHD" category;
# every other value (including missing ones) is kept as is.
ore_body_type = raw_copper_mines_df['GEOLOGIC_ORE_BODY_TYPE']
raw_copper_mines_df['GEOLOGIC_ORE_BODY_TYPE'] = ore_body_type.mask(
    ore_body_type.isin(["SKARN", "SHD"]), "SKARN-SHD"
)

raw_copper_mines_df['INITIAL_COST_PER_TONNE'] = (
    raw_copper_mines_df['INITIAL_COST'] / raw_copper_mines_df['ECONOMIC_TONNAGE']
)

# Missing prices are encoded with the sentinel value -1.
raw_copper_mines_df['PRICE_PER_TONNE_MAIN_ORE'] = (
    raw_copper_mines_df['PRICE_PER_TONNE_MAIN_ORE'].fillna(-1)
)

# Fold the "In-Situ Leach" and "Tailings" mine types into "Open Pit".
mine_type = raw_copper_mines_df['MINE_TYPE']
raw_copper_mines_df['MINE_TYPE'] = mine_type.mask(
    mine_type.isin(["In-Situ Leach", "Tailings"]), "Open Pit"
)

# Add a LOG_10_<col> column for every column listed in TO_LOG10. log10(0) is
# -inf, which is swapped for LOG_INF_REPL. The replacement is applied before
# the assignment rather than via a chained `.replace(..., inplace=True)` on the
# column selection, because under pandas Copy-on-Write the chained in-place
# call never reaches the DataFrame.
for col in TO_LOG10:
    raw_copper_mines_df['LOG_10_' + col] = (
        np.log10(raw_copper_mines_df[col]).replace(-np.inf, LOG_INF_REPL)
    )

# Train/Test Split

In [10]:
# Keep only the rows that have a TIR value; TIR is the target column.
raw_copper_mines_df = raw_copper_mines_df.dropna(subset=['TIR'])
X, y = raw_copper_mines_df[FEATURES], raw_copper_mines_df['TIR']

# test_size is an absolute number of rows (94), not a fraction; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=94
)

# Only the training split is restricted to TIR values below UPPER_LIMIT_TIR;
# the test split is left unfiltered.
within_tir_limit = y_train < UPPER_LIMIT_TIR
X_train, y_train = X_train[within_tir_limit], y_train[within_tir_limit]

# Features and target side by side, one frame per split.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Boolean targets (True where TIR is below MIN_TIR), then every target is
# turned into a single-column DataFrame.
y_train_cat = (y_train < MIN_TIR).to_frame()
y_test_cat = (y_test < MIN_TIR).to_frame()
y_train = y_train.to_frame()
y_test = y_test.to_frame()

print(X_train.shape, X_test.shape, sep="\n")

(100, 27)
(94, 27)


# Saving

In [13]:
# Write every artefact to parquet: the full frame and the combined train/test
# frames under data/processed, the individual X/y splits under data/interim.
for frame, parquet_path in (
    (raw_copper_mines_df, "../../../data/processed/copper/copper_data.parquet"),
    (train_df, "../../../data/processed/copper/train_data.parquet"),
    (test_df, "../../../data/processed/copper/test_data.parquet"),
    (X_train, "../../../data/interim/copper/X_train.parquet"),
    (X_test, "../../../data/interim/copper/X_test.parquet"),
    (y_train, "../../../data/interim/copper/y_train.parquet"),
    (y_test, "../../../data/interim/copper/y_test.parquet"),
    (y_train_cat, "../../../data/interim/copper/y_train_cat.parquet"),
    (y_test_cat, "../../../data/interim/copper/y_test_cat.parquet"),
):
    frame.to_parquet(parquet_path)