In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_approximation import RBFSampler
import sys
from IPython.display import display, Markdown
import seaborn as sns
%load_ext autoreload
%autoreload 2


In [2]:
preprocess = Pipeline([
    ('poly',PolynomialFeatures(degree=2)),
    ('min_max', MinMaxScaler())
])

putanje = ['../podaci/df_kanada.csv', '../podaci/df_gradiska_train_inter.csv', '../podaci/df_portugal.csv']
training_columns = ['AvgPixelCount', 'NumberOfDaysFromStart', 'PreviousWeight']

In [3]:
dfs = []
Xs = []
for putanja in putanje:
    dfs.append(pd.read_csv(putanja))
    Xs.append(preprocess.fit_transform(dfs[-1][training_columns]))

In [4]:
df = pd.concat(dfs)

In [38]:
train = df
gradiskatest = pd.read_csv('../podaci/df_gradiska_test_inter.csv')


print(train.head(10))

X_train = train[training_columns]
y_train = train['AverageWeight']
X_test = gradiskatest[training_columns]
y_test = gradiskatest['AverageWeight']


X_train_transformed = preprocess.fit_transform(X_train)
X_test_transformed = preprocess.transform(X_test)

   TurnusId        Date  AvgPixelCount  AvgBboxWidth  AvgBboxHeight  \
0     131.0  2023-06-03     344.461932    184.536740     179.683936   
1     131.0  2023-06-04     463.062580    197.471418     191.473504   
2     131.0  2023-06-05     430.997551    203.855550     196.402763   
3     131.0  2023-06-06     446.578511    211.449437     202.574249   
4     131.0  2023-06-07     477.485082    217.930284     211.826148   
5     131.0  2023-06-08     530.142223    227.983510     219.693399   
6     131.0  2023-06-09     576.723363    235.817352     229.819631   
7     131.0  2023-06-10     595.501761    242.945277     237.033505   
8     131.0  2023-06-11     692.874699    255.601084     251.259510   
9     131.0  2023-06-12     704.145396    261.617489     258.712156   

   NumberOfDaysFromStart  PreviousWeight  AverageWeight  HenhouseId  
0                      2            50.0           70.0         NaN  
1                      3            70.0           70.0         NaN  
2       

In [40]:
models_and_params = [
    (LinearRegression(), {
        'fit_intercept': [True, False]
    }),

   
    (Ridge(), {
        'alpha': [0.1, 1.0, 10.0],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr'],
        'fit_intercept': [True, False]
    }),

   
    (Lasso(), {
        'alpha': [0.1, 1.0, 10.0],
        'max_iter': [1000, 2000, 3000],
        'fit_intercept': [True, False]
    }),

    
    (ElasticNet(), {
        'alpha': [0.1, 1.0, 10.0],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000],
        'fit_intercept': [True, False]
    }),

    
    (DecisionTreeRegressor(), {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10]
    }),

    
    (RandomForestRegressor(), {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10]
    }),

  

   
    (KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }),

        
    (AdaBoostRegressor(), {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.1, 0.05, 0.02],
        'loss': ['linear', 'square', 'exponential']
    }),

    
    (GradientBoostingRegressor(), {
        'learning_rate': [0.1, 0.05, 0.02],
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [3, 5, 10],
        'min_samples_leaf': [2, 5, 10],
    })
]




In [42]:
for model, parameters in models_and_params:
    print(f"\nTrening modela: {model.__class__.__name__}")

   
    grid_search = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=-1, scoring='neg_mean_absolute_error')
    grid_search.fit(X_train_transformed, y_train)

    
    print(f"Najbolji parametri za {model.__class__.__name__}: {grid_search.best_params_}")

   
    y_pred = grid_search.predict(X_test_transformed)

   
    mae = mean_absolute_error(y_test, y_pred)
    print(f"MAE na test skupu: {mae}")
    print(grid_search.best_score_)


Trening modela: LinearRegression
Najbolji parametri za LinearRegression: {'fit_intercept': True}
MAE na test skupu: 23.457954476652485
-28.126825705288734

Trening modela: Ridge
Najbolji parametri za Ridge: {'alpha': 0.1, 'fit_intercept': False, 'solver': 'svd'}
MAE na test skupu: 30.523653250539123
-45.242999539760305

Trening modela: Lasso


  model = cd_fast.enet_coordinate_descent(


Najbolji parametri za Lasso: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 1000}
MAE na test skupu: 27.7818363379842
-37.11567192786636

Trening modela: ElasticNet
Najbolji parametri za ElasticNet: {'alpha': 0.1, 'fit_intercept': False, 'l1_ratio': 0.8, 'max_iter': 1000}
MAE na test skupu: 54.245057315220826
-69.32471740796566

Trening modela: DecisionTreeRegressor


  _data = np.array(data, dtype=dtype, copy=copy,


Najbolji parametri za DecisionTreeRegressor: {'criterion': 'absolute_error', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
MAE na test skupu: 22.81433684539415
-27.816745212495515

Trening modela: RandomForestRegressor
Najbolji parametri za RandomForestRegressor: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
MAE na test skupu: 16.61871761788285
-28.18693737341288

Trening modela: KNeighborsRegressor
Najbolji parametri za KNeighborsRegressor: {'algorithm': 'brute', 'n_neighbors': 5, 'weights': 'distance'}
MAE na test skupu: 23.640738585081028
-35.71885405311509

Trening modela: AdaBoostRegressor
Najbolji parametri za AdaBoostRegressor: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 200}
MAE na test skupu: 40.585476757406404
-45.70495624525854

Trening modela: GradientBoostingRegressor
Najbolji parametri za GradientBoostingRegressor: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 10