In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score

# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/train_dataset.csv')
df

Unnamed: 0,DTronc,LTronc,DChar,LChar,DChar1,LChar1,DChar2,LChar2,DChar3,LChar3,...,LRam,PFr1,PFr2,PFr3,PFr4,PFr5,PFr6,year,variety,Total_Poids
0,75.0,34.0,34.5,190.5,106.30,50.0,76.10,7.5,,,...,27.0,170.0,,,,,,2022,2,170.0
1,75.0,34.0,34.5,188.0,66.80,0.0,,,,,...,24.0,,,,,,,2022,2,0.0
2,64.5,34.0,32.5,66.0,55.50,0.0,35.00,36.0,,,...,17.0,194.0,152.0,148.0,200.0,,,2019,2,694.0
3,70.0,55.0,37.0,203.0,71.30,14.0,31.20,35.0,,,...,47.0,156.0,,,,,,2022,1,156.0
4,71.0,62.0,35.0,110.0,53.03,43.0,21.66,32.0,,,...,25.0,,,,,,,2021,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4407,71.5,31.0,40.3,63.0,83.50,139.0,12.14,10.0,8.8,30.0,...,11.5,93.0,,,,,,2021,2,93.0
4408,81.0,14.0,37.0,263.0,54.80,3.0,27.00,4.0,,,...,26.0,140.0,140.0,,,,,2022,2,280.0
4409,81.0,19.0,40.0,199.5,39.00,34.0,11.50,2.3,,,...,50.0,80.0,120.0,,,,,2022,2,200.0
4410,66.0,22.5,34.0,127.0,40.00,79.2,,,,,...,57.5,130.0,90.0,,,,,2022,2,220.0


## Classes for Pipline of preprocessing :

In [18]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(self.columns, axis=1)


class RemoveRTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.column] = X_transformed[self.column].str.replace('R', '').astype(float)
        return X_transformed
        
class ConvertToNaNTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()

        def convert_to_nan(x):
            try:
                return pd.to_numeric(x)
            except:
                return np.nan

        for cl in self.columns:
            X_transformed[cl] = X_transformed[cl].apply(convert_to_nan)
        return X_transformed

## Preprocessing_pipeline :

In [19]:
preprocessing_pipeline = Pipeline([
    ('drop_columns', DropColumnsTransformer(['DChar5', 'LChar5', 'DChar4', 'LChar4', 'PFr1', 'PFr2', 'PFr3', 'PFr4', 'PFr5', 'PFr6'])),
    ('convert_to_nan', ConvertToNaNTransformer(['DChar2', 'DChar1', 'LChar1', 'DRam'])),
    ('remove_R', RemoveRTransformer('NumRam')),
    ('imputer', SimpleImputer(strategy='median'))
])

## Pipline of Model using StandardScaler() and SVR() : 

In [20]:
# Model pipeline
model_pipeline_svr = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

param_grid_svr = {
    'svr__C': [0.1, 1, 10],
    'svr__gamma': [0.01, 0.1, 1],
    'svr__kernel': [ 'rbf']
}

# Grid search with cross-validation
grid_search_svr = GridSearchCV(model_pipeline_svr, param_grid_svr, cv=5)

# Preprocessing
df = pd.read_csv("/content/drive/MyDrive/train_dataset.csv")
y = df['Total_Poids']
scaler = StandardScaler()
y =  scaler.fit_transform(y.values.reshape(-1, 1)).ravel()
X=df.drop('Total_Poids', axis=1)
X_preprocessed = preprocessing_pipeline.fit_transform(X)
print(X_preprocessed.shape)

# Model training 
grid_search_svr.fit(X_preprocessed, y)

(4412, 15)


In [21]:
print("Best parameters: ", grid_search_svr.best_params_)
print("Best score: ", grid_search_svr.best_score_)

Best parameters:  {'svr__C': 1, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
Best score:  0.38105134236074145


## Using linear Regression Model:

In [22]:
from sklearn.linear_model import LinearRegression

# Model pipeline with LinearRegression
model_pipeline_rgl = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression())
])


param_grid_rgl = {}

grid_search_rgl = GridSearchCV(model_pipeline_rgl, param_grid_rgl, cv=5)


df = pd.read_csv("/content/drive/MyDrive/train_dataset.csv")
y = df['Total_Poids']
scaler = StandardScaler()
y =  scaler.fit_transform(y.values.reshape(-1, 1)).ravel()

X=df.drop('Total_Poids', axis=1)
X_preprocessed = preprocessing_pipeline.fit_transform(X)

 
grid_search_rgl.fit(X_preprocessed, y)

print("Best parameters: ", grid_search_rgl.best_params_)
print("Best score: ", grid_search_rgl.best_score_)


Best parameters:  {}
Best score:  0.33060288533329957


## Random Forest Model

In [23]:
from sklearn.ensemble import RandomForestRegressor


model_pipeline_forest = Pipeline([
    ('scaler', StandardScaler()),
    ('random_forest', RandomForestRegressor())
])


param_grid_forest = {
    'random_forest__n_estimators': [100, 200, 300], 
    'random_forest__max_depth': [None, 5, 10]  
}


grid_search_forest = GridSearchCV(model_pipeline_forest, param_grid_forest, cv=5)


df = pd.read_csv("/content/drive/MyDrive/train_dataset.csv")
y = df['Total_Poids']
X=df.drop('Total_Poids', axis=1)
X_preprocessed = preprocessing_pipeline.fit_transform(X)
y =  scaler.fit_transform(y.values.reshape(-1, 1)).ravel()


grid_search_forest.fit(X_preprocessed, y)

print("Best parameters: ", grid_search_forest.best_params_)
print("Best score: ", grid_search_forest.best_score_)


Best parameters:  {'random_forest__max_depth': None, 'random_forest__n_estimators': 300}
Best score:  0.40807083518028264


## Using xgBoost Model:

In [24]:
import xgboost as xgb


model_pipeline_xg = Pipeline([
    ('scaler', StandardScaler()),
    ('xgboost', xgb.XGBRegressor())
])


param_grid_xg = {
    'xgboost__n_estimators': [100, 200, 300], 
    'xgboost__max_depth': [3, 5, 7]  
}


grid_search_xg = GridSearchCV(model_pipeline_xg, param_grid_xg, cv=5)

# Preprocessing step
df = pd.read_csv("/content/drive/MyDrive/train_dataset.csv")

y = df['Total_Poids']
y =  scaler.fit_transform(y.values.reshape(-1, 1)).ravel()

X=df.drop('Total_Poids', axis=1)
X_preprocessed = preprocessing_pipeline.fit_transform(X)


# Model training step
grid_search_xg.fit(X_preprocessed, y)

print("Best parameters: ", grid_search_xg.best_params_)
print("Best score: ", grid_search_xg.best_score_)


Best parameters:  {'xgboost__max_depth': 3, 'xgboost__n_estimators': 100}
Best score:  0.3722104248788488


### Test New Dataset using SVR():

In [25]:
from sklearn.metrics import mean_squared_error
# Read the test dataset
df_test = pd.read_csv('/content/drive/MyDrive/test_dataset.csv')
X_test=df_test.drop('Total_Poids', axis=1)
y_test = df_test['Total_Poids']

y_test = scaler.transform(y_test.values.reshape(-1, 1))
# Preprocessing for test data
X_test_preprocessed = preprocessing_pipeline.fit_transform(X_test)

# Model prediction
y_test_pred = grid_search_svr.predict(X_test_preprocessed)

# Calculate mean squared error (MSE)

mse = mean_squared_error(y_test, y_test_pred)
print("Mean Squared Error (MSE): ", mse)

Mean Squared Error (MSE):  0.6274561805007853
