# Zyfra Gold Mining

## Introduction

### Prepare the Data

#### Open the files and look into the data

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the data
try:
    train = pd.read_csv('./datasets/gold_recovery_train.csv')
    test = pd.read_csv('./datasets/gold_recovery_test.csv')
    full = pd.read_csv('./datasets/gold_recovery_full.csv')
except:
    train = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/gold_recovery_train.csv')
    test = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/gold_recovery_test.csv')
    full = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/gold_recovery_full.csv')

#### Define functions that will be used throughout the notebook

In [3]:
def explore_data(df):
    display(df.sample(5))
    print('--------------------------------')
    print(df.isna().sum())
    print('--------------------------------')
    print(df.info())
    print('--------------------------------')
    print(df.describe())

In [4]:
# Explore train data
explore_data(train)

Unnamed: 0,date,final.output.concentrate_ag,final.output.concentrate_pb,final.output.concentrate_sol,final.output.concentrate_au,final.output.recovery,final.output.tail_ag,final.output.tail_pb,final.output.tail_sol,final.output.tail_au,...,secondary_cleaner.state.floatbank4_a_air,secondary_cleaner.state.floatbank4_a_level,secondary_cleaner.state.floatbank4_b_air,secondary_cleaner.state.floatbank4_b_level,secondary_cleaner.state.floatbank5_a_air,secondary_cleaner.state.floatbank5_a_level,secondary_cleaner.state.floatbank5_b_air,secondary_cleaner.state.floatbank5_b_level,secondary_cleaner.state.floatbank6_a_air,secondary_cleaner.state.floatbank6_a_level
7414,2017-03-20 21:59:59,5.558176,11.539416,9.947735,43.794312,68.285968,8.474038,2.59299,12.663216,2.842776,...,25.036494,-397.100974,23.02056,-399.182318,25.141723,-447.792236,19.926155,-448.768719,25.0306,-499.215137
6839,2017-02-24 22:59:59,4.536247,9.906146,17.243469,46.710641,70.211569,12.088308,5.016743,9.716461,4.405981,...,25.036894,-398.759142,23.013718,-399.619306,22.987744,-449.947755,20.009104,-450.117964,25.01306,-500.206743
7401,2017-03-20 08:59:59,4.69516,11.98739,9.653173,44.694202,62.592998,8.425074,2.826068,12.1091,3.05584,...,24.99692,-399.438923,22.936806,-399.544365,25.604646,-450.193656,19.976553,-448.555867,25.00293,-500.163334
8403,2017-05-01 02:59:59,4.173962,12.408987,10.892748,44.150162,67.468182,7.640183,3.462145,12.136147,2.925332,...,25.004152,-398.463186,23.036681,-399.644556,26.006801,-449.73188,23.980274,-449.914963,30.024204,-500.371625
2472,2016-04-26 23:59:59,8.912745,10.614082,13.012948,39.524567,68.848043,12.447715,3.556821,6.665528,4.580331,...,11.954949,-500.120401,10.100053,-499.371359,9.006842,-500.78875,8.982027,-500.380252,16.01025,-500.82746


--------------------------------
date                                            0
final.output.concentrate_ag                    72
final.output.concentrate_pb                    72
final.output.concentrate_sol                  370
final.output.concentrate_au                    71
                                             ... 
secondary_cleaner.state.floatbank5_a_level     85
secondary_cleaner.state.floatbank5_b_air       85
secondary_cleaner.state.floatbank5_b_level     84
secondary_cleaner.state.floatbank6_a_air      103
secondary_cleaner.state.floatbank6_a_level     85
Length: 87, dtype: int64
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16860 entries, 0 to 16859
Data columns (total 87 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   date                                                16860 non-null  object 
 1   fi

In [5]:
# Explore the test data
explore_data(test)

Unnamed: 0,date,primary_cleaner.input.sulfate,primary_cleaner.input.depressant,primary_cleaner.input.feed_size,primary_cleaner.input.xanthate,primary_cleaner.state.floatbank8_a_air,primary_cleaner.state.floatbank8_a_level,primary_cleaner.state.floatbank8_b_air,primary_cleaner.state.floatbank8_b_level,primary_cleaner.state.floatbank8_c_air,...,secondary_cleaner.state.floatbank4_a_air,secondary_cleaner.state.floatbank4_a_level,secondary_cleaner.state.floatbank4_b_air,secondary_cleaner.state.floatbank4_b_level,secondary_cleaner.state.floatbank5_a_air,secondary_cleaner.state.floatbank5_a_level,secondary_cleaner.state.floatbank5_b_air,secondary_cleaner.state.floatbank5_b_level,secondary_cleaner.state.floatbank6_a_air,secondary_cleaner.state.floatbank6_a_level
5008,2017-11-26 16:59:59,204.030058,10.966694,7.5,1.967878,1600.924018,-501.251033,1597.588919,-497.492985,1599.934816,...,16.985971,-499.685566,15.020728,-499.207852,10.948015,-501.963483,9.008584,-500.800376,16.019234,-500.901752
3078,2017-09-07 06:59:59,,,6.44,,0.0,-797.408631,0.0,-799.709651,0.0,...,0.0,-799.734266,0.0,-799.861995,0.501476,-798.06907,0.657183,-800.035928,0.291482,-809.551876
3896,2017-10-11 08:59:59,178.472862,5.678455,6.51,2.201921,1398.347111,-499.778317,1398.811253,-501.164622,1398.167055,...,18.015913,-500.152755,16.004237,-399.71355,12.939741,-500.302646,10.027145,-499.946975,14.000252,-500.510923
3079,2017-09-07 07:59:59,,,6.44,,0.0,-797.596816,0.0,-799.189274,0.0,...,0.0,-799.730764,0.0,-799.864521,0.507308,-798.682121,0.650173,-800.038423,0.295164,-809.087251
4362,2017-10-30 18:59:59,159.676959,8.49408,5.86,1.807445,1700.728047,-500.553772,1701.721354,-501.404399,1574.939385,...,17.97677,-500.935508,15.985427,-500.551176,15.004584,-500.717948,11.02791,-500.281167,15.99774,-500.467464


--------------------------------
date                                            0
primary_cleaner.input.sulfate                 302
primary_cleaner.input.depressant              284
primary_cleaner.input.feed_size                 0
primary_cleaner.input.xanthate                166
primary_cleaner.state.floatbank8_a_air         16
primary_cleaner.state.floatbank8_a_level       16
primary_cleaner.state.floatbank8_b_air         16
primary_cleaner.state.floatbank8_b_level       16
primary_cleaner.state.floatbank8_c_air         16
primary_cleaner.state.floatbank8_c_level       16
primary_cleaner.state.floatbank8_d_air         16
primary_cleaner.state.floatbank8_d_level       16
rougher.input.feed_ag                          16
rougher.input.feed_pb                          16
rougher.input.feed_rate                        40
rougher.input.feed_size                        22
rougher.input.feed_sol                         67
rougher.input.feed_au                          16
rougher.input.flo

In [6]:
# Explore the full data
explore_data(full)

Unnamed: 0,date,final.output.concentrate_ag,final.output.concentrate_pb,final.output.concentrate_sol,final.output.concentrate_au,final.output.recovery,final.output.tail_ag,final.output.tail_pb,final.output.tail_sol,final.output.tail_au,...,secondary_cleaner.state.floatbank4_a_air,secondary_cleaner.state.floatbank4_a_level,secondary_cleaner.state.floatbank4_b_air,secondary_cleaner.state.floatbank4_b_level,secondary_cleaner.state.floatbank5_a_air,secondary_cleaner.state.floatbank5_a_level,secondary_cleaner.state.floatbank5_b_air,secondary_cleaner.state.floatbank5_b_level,secondary_cleaner.state.floatbank6_a_air,secondary_cleaner.state.floatbank6_a_level
15196,2017-10-09 03:59:59,3.968567,10.146832,7.078702,48.765178,65.577395,6.728662,2.826512,13.271684,2.363352,...,18.030177,-499.995578,15.866969,-400.280523,12.961143,-499.583905,10.011219,-500.272605,13.962048,-505.081366
11845,2017-05-22 12:59:59,5.424818,10.342266,9.839766,44.815139,,7.565142,3.17732,9.999742,3.348208,...,24.97544,-484.88506,23.018985,-499.135656,25.991191,-496.597004,23.976154,-499.785681,17.992983,-500.529834
21994,2018-07-19 09:59:59,4.221066,12.014648,6.163379,42.41769,100.0,0.0,0.0,0.0,0.0,...,17.011931,-511.215151,14.218637,-291.036007,11.282656,-759.375195,10.415785,-800.085449,3.144339,-761.185028
21390,2018-06-24 05:59:59,5.511955,9.786088,6.989073,43.30551,89.503222,6.656381,1.646867,3.42113,1.629016,...,30.029484,-499.539503,22.960678,-498.436761,19.968942,-500.150825,14.989911,-499.887152,15.996422,-500.577994
2166,2016-04-14 06:00:00,6.389894,11.091212,8.112045,42.642192,63.842948,8.066798,2.275514,9.547084,2.704609,...,10.087016,-799.567146,10.049865,-799.733015,18.972284,-501.362727,18.996633,-500.645361,25.051744,-504.170456


--------------------------------
date                                            0
final.output.concentrate_ag                    89
final.output.concentrate_pb                    87
final.output.concentrate_sol                  385
final.output.concentrate_au                    86
                                             ... 
secondary_cleaner.state.floatbank5_a_level    101
secondary_cleaner.state.floatbank5_b_air      101
secondary_cleaner.state.floatbank5_b_level    100
secondary_cleaner.state.floatbank6_a_air      119
secondary_cleaner.state.floatbank6_a_level    101
Length: 87, dtype: int64
--------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22716 entries, 0 to 22715
Data columns (total 87 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   date                                                22716 non-null  object 
 1   fi

In [7]:
# Calculate recovery for the rougher.output.recovery feature
C = train['rougher.output.concentrate_au']
F = train['rougher.input.feed_au']
T = train['rougher.output.tail_au']

train['recovery_calculated'] = ((C * (F - T)) / (F * (C - T))) * 100

# Drop the rows with NaN values in 'rougher.output.recovery' and 'recovery_calculated'
train = train.dropna(subset=['rougher.output.recovery', 'recovery_calculated'])

# Now calculate the MAE
mae = mean_absolute_error(train['rougher.output.recovery'], train['recovery_calculated'])
print('Mean Absolute Error:', mae)

0        87.107763
1        86.843261
2        86.842308
3        87.226430
4        86.688794
           ...    
16855    89.574376
16856    87.724007
16857    88.890579
16858    89.858126
16859    89.514960
Name: recovery_calculated, Length: 16860, dtype: float64
Mean Absolute Error: 9.303415616264301e-15


In [48]:
# Analyze the features not available in the test set
train_features = set(train.columns)
test_features = set(test.columns)

missing_features = train_features - test_features
print('Missing features:', missing_features)

# Check their types
for feature in missing_features:
    print('Type of', feature, ':', train[feature].dtype)

Missing features: {'primary_cleaner.output.tail_pb', 'final.output.tail_au', 'primary_cleaner.output.tail_au', 'secondary_cleaner.output.tail_au', 'recovery_calculated', 'rougher.calculation.sulfate_to_au_concentrate', 'final.output.recovery', 'secondary_cleaner.output.tail_pb', 'primary_cleaner.output.concentrate_au', 'rougher.output.tail_pb', 'final.output.tail_sol', 'secondary_cleaner.output.tail_sol', 'rougher.output.concentrate_ag', 'rougher.output.tail_au', 'primary_cleaner.output.concentrate_sol', 'rougher.calculation.floatbank11_sulfate_to_au_feed', 'rougher.calculation.au_pb_ratio', 'rougher.output.recovery', 'primary_cleaner.output.tail_sol', 'final.output.concentrate_ag', 'primary_cleaner.output.concentrate_ag', 'rougher.output.tail_ag', 'rougher.output.concentrate_pb', 'rougher.output.concentrate_sol', 'final.output.concentrate_au', 'secondary_cleaner.output.tail_ag', 'rougher.calculation.floatbank10_sulfate_to_au_feed', 'primary_cleaner.output.concentrate_pb', 'primary_cle

In [24]:
# Perform data analysis
# This could involve visualizing the data, checking for correlations between features, etc.

In [25]:
# Define the target and features for your model
target = train['final.output.recovery']
features = train.drop(['final.output.recovery', 'rougher.output.recovery'], axis=1)


In [26]:
# Split the data into training and validation sets
train_features, val_features, train_target, val_target = train_test_split(
    features, target, test_size=0.2, random_state=12345)

In [27]:
# Initialize the model
model = RandomForestRegressor(random_state=12345)

In [52]:
# Train the model
model.fit(train_features, train_target)

TypeError: float() argument must be a string or a real number, not 'set'

In [None]:
# Make predictions
train_pred = model.predict(train_features)
val_pred = model.predict(val_features)

In [None]:
# Calculate the mean absolute error
train_mae = mean_absolute_error(train_target, train_pred)
val_mae = mean_absolute_error(val_target, val_pred)

In [None]:
print('Training MAE:', train_mae)
print('Validation MAE:', val_mae)