# HAICK 2023<br/>
### Sonatrach ROP Prediction

# Utils

In [None]:
!kaggle competitions download -c sonatrach-rop-prediction

In [None]:
!unzip sonatrach-rop-prediction.zip

In [None]:
! pip install optuna

In [None]:
! pip install featurewiz

# Importations

In [2]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import seaborn as sns
import matplotlib.pyplot as plt

import optuna
from featurewiz import featurewiz

Imported 0.2.04 version. Select nrows to a small number when running on huge datasets.
output = featurewiz(dataname, target, corr_limit=0.90, verbose=2, sep=',', 
		header=0, test_data='',feature_engg='', category_encoders='',
		dask_xgboost_flag=False, nrows=None, skip_sulov=False)
Create new features via 'feature_engg' flag : ['interactions','groupby','target']



In [3]:
train_df = pd.read_csv('./train_dataset.csv')
test_df = pd.read_csv('./test_dataset.csv')

# Data preparation

### Feature engineering and selection

Searching for Uncorrelated List Of Variables (SULOV) and using recurcive xgboost feature selection.

In [None]:
target = 'ROP (Time)'
 
features, train = featurewiz(train_df, target, corr_limit=0.7, verbose=2, sep=",",
header=0,test_data="", feature_engg="", category_encoders="")

In [None]:
train_df = train

In [5]:
# Extract the features and target variable
X = train_df.drop('ROP (Time)', axis=1)
y = train_df['ROP (Time)']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a scaler object and fit it on the training data
scaler = StandardScaler()
scaler.fit(X)

# Transform the training and validation data using the scaler
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Modeling

### Simple ML Models

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

In [8]:
# Define models to fit and evaluate
models = {
    'Linear Regression' : LinearRegression(),
    'Decision Tree' : DecisionTreeRegressor(),
    'Random Forest' : RandomForestRegressor(n_estimators=100),
    'LGBM' :  LGBMRegressor(),
    'XGBoost' : XGBRegressor()
}

# Fit and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_val_scaled)
    
    # Print results
    print(f'{name}:')
    r2 = r2_score(y_val, y_pred)
    rmse = mean_squared_error(y_val, y_pred , squared=False)

    print("Root Mean squared error:", rmse)
    print('R-squared:', r2)

Linear Regression:
Root Mean squared error: 13.27957244621788
R-squared: 0.6684060977630195
Decision Tree:
Root Mean squared error: 5.096187572860498
R-squared: 0.9511652285590678
Random Forest:
Root Mean squared error: 3.7082631722888415
R-squared: 0.974142921933382
LGBM:
Root Mean squared error: 6.001762270522202
R-squared: 0.9322676835783047
XGBoost:
Root Mean squared error: 5.05779506433026
R-squared: 0.9518982577276551


### Hyperparameters tuning

We use Optuna to automize the hyperparameters tuning for Random Forest Regressor.

In [9]:
def objective(trial):
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    
    # Create a model with the optimized hyperparameters
    model = RandomForestRegressor( n_estimators=n_estimators)
    
    # Fit the model on the training data
    model.fit(X_train_scaled, y_train)
    
    # Make predictions on the validation data
    y_pred = model.predict(X_val_scaled)
    
    # Calculate the mean squared error
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    # Return the mean squared error as the objective value
    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

In [None]:
print('Best hyperparameters: ', study.best_params)
print('Best MSE: ', study.best_value)

In [None]:
best_params = {'n_estimators': 200}

final_model = RandomForestRegressor(n_estimators=best_params['n_estimators'])

final_model.fit(X_train_scaled, y_train)

RandomForestRegressor(n_estimators=200)

In [None]:
y_pred = final_model.predict(X_val_scaled)

rmse = mean_squared_error(y_val, y_pred , squared=False)
print("Root Mean squared error:", rmse)

Root Mean squared error: 3.684890897962144


### Boosting

Train Ada Boost Regressor for the entire dataset.

In [None]:
from sklearn.ensemble import AdaBoostRegressor

adaboost = AdaBoostRegressor(base_estimator=RandomForestRegressor(), n_estimators=200)

# fit the model
adaboost.fit(X, y)

# make predictions
y_pred = adaboost.predict(X_val)

In [72]:
y_test_pred = adaboost.predict(X_test)

In [70]:
r2 = r2_score(y_val, y_pred)
rmse = mean_squared_error(y_val, y_pred , squared=False)
print("Root Mean squared error:", rmse)
print('R-squared:', r2)

Root Mean squared error: 0.17900668671557662
R-squared: 0.9999397472520198


# Submission

In [10]:
X_test = pd.read_csv('./test_dataset.csv')

In [11]:
X_test = X_test[features]

In [None]:
X_test_scaled = scaler.transform(X_test)

In [55]:
y_test_pred = final_model.predict(X_test_scaled)

In [None]:
y_test_pred

array([0., 0., 0., ..., 0., 0., 0.])

Create a submission DataFrame

In [73]:
submission_df = pd.DataFrame(y_test_pred, columns=['ROP (Time)'])

In [74]:
submission_df['ID'] = submission_df.reset_index().index
submission_df = submission_df[['ID','ROP (Time)']]
submission_df

Unnamed: 0,ID,ROP (Time)
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
...,...,...
14389,14389,0.0
14390,14390,0.0
14391,14391,0.0
14392,14392,0.0


In [78]:
submission_df.to_csv('submission.csv', index=False)