In [89]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [90]:
# Tolgo colonna target dal set di train
y_train_no_split = train['SalePrice']  
X_train_no_split = train.drop(columns=['SalePrice','Id'])
X_test = test.drop(columns=['Id'])

In [91]:


# Carico il dataset
train = pd.read_csv('train.csv')

# Calcolo il numero e la percentuale di valori nulli per ogni colonna
null_counts = train.isnull().sum()  # Numero di valori nulli
null_percentage = (train.isnull().mean() * 100).round(2)  # Percentuale di valori nulli

# Creo un DataFrame per visualizzare i risultati
null_data = pd.DataFrame({'Colonna': train.columns, 
                          'Valori Nulli': null_counts, 
                          'Percentuale Nulli (%)': null_percentage})

# Filtra le colonne che hanno valori nulli (solo colonne con almeno un valore nullo)
null_data_filtered = null_data[null_data['Valori Nulli'] > 0]

# Ordina le colonne in base alla percentuale di valori nulli in ordine decrescente
null_data_filtered = null_data_filtered.sort_values(by='Percentuale Nulli (%)', ascending=False)

# Visualizza le colonne con valori nulli
print(null_data_filtered)


                   Colonna  Valori Nulli  Percentuale Nulli (%)
PoolQC              PoolQC          1453                  99.52
MiscFeature    MiscFeature          1406                  96.30
Alley                Alley          1369                  93.77
Fence                Fence          1179                  80.75
MasVnrType      MasVnrType           872                  59.73
FireplaceQu    FireplaceQu           690                  47.26
LotFrontage    LotFrontage           259                  17.74
GarageType      GarageType            81                   5.55
GarageYrBlt    GarageYrBlt            81                   5.55
GarageFinish  GarageFinish            81                   5.55
GarageQual      GarageQual            81                   5.55
GarageCond      GarageCond            81                   5.55
BsmtFinType2  BsmtFinType2            38                   2.60
BsmtExposure  BsmtExposure            38                   2.60
BsmtFinType1  BsmtFinType1            37

In [50]:
train['PoolQC'].unique()
train['MiscFeature'].unique()

array([nan, 'Shed', 'Gar2', 'Othr', 'TenC'], dtype=object)

In [92]:
# Categorizzo in automatico tutto il possibile. Allineo già le colonne con il dataset 
X_train_dummies = pd.get_dummies(X_train_no_split)
X_test_dummies = pd.get_dummies(test)
X_train_dummies, X_test_dummies = X_train_dummies.align(X_test_dummies, join='left', axis=1)
print(X_train_dummies.shape, X_test_dummies.shape)

(1460, 287) (1459, 287)


In [93]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_val, y_train, y_val = train_test_split(X_train_dummies, y_train_no_split, test_size=0.2, random_state=42)

In [94]:
from xgboost import XGBRegressor
model_xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5)
model_xgb.fit(X_train, y_train)

y_pred_val = model_xgb.predict(X_val)

# Valutazione del modello sul validation set (MSE - Mean Squared Error)
mse = mean_squared_error(y_val, y_pred_val)
print(f'Mean Squared Error sul validation set: {mse}')
rmse = mse ** 0.5
print(f'Root Mean Squared Error sul validation set: {rmse}')
avg_price = y_train.mean()
print(f'Prezzo medio delle case nel dataset: {avg_price}')
print(f'Differenza in % {rmse/avg_price}')

Mean Squared Error sul validation set: 601801155.7710724
Root Mean Squared Error sul validation set: 24531.63581522994
Prezzo medio delle case nel dataset: 181441.5419520548
Differenza in % 0.1352040748293041


In [114]:
param_grid = {
    'n_estimators': [ 1000, 1500],
    'learning_rate': [0.01, 0.005],  
    'max_depth': [ 2, 3, 5, 7, 10],  
    'subsample': [ 0.8],  # Percentuale di campioni usati per ciascun albero
    'colsample_bytree': [ 0.8],  # Percentuale di feature usate per ciascun albero
    'gamma': [0],  # Penalizzazione per split, aiuta a ridurre l'overfitting
}
grid_search = GridSearchCV(model_xgb, param_grid, cv=3 , scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

In [115]:
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [116]:
grid_search.best_params_

{'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 1500,
 'subsample': 0.8}

In [117]:
best_model.fit(X_train, y_train)

y_pred_val = best_model.predict(X_val)

# Valutazione del modello sul validation set (MSE - Mean Squared Error)
mse = mean_squared_error(y_val, y_pred_val)
print(f'Mean Squared Error sul validation set: {mse}')
rmse = mse ** 0.5
print(f'Root Mean Squared Error sul validation set: {rmse}')
avg_price = y_train.mean()
print(f'Prezzo medio delle case nel dataset: {avg_price}')
print(f'Differenza in % {rmse/avg_price}')

Mean Squared Error sul validation set: 629526047.7478054
Root Mean Squared Error sul validation set: 25090.35766480433
Prezzo medio delle case nel dataset: 181441.5419520548
Differenza in % 0.13828342393295112


In [118]:
import numpy as np 
from sklearn.metrics import mean_absolute_error
absolute_errors = np.abs(y_val - y_pred_val)

# Calcolo della mediana dell'errore assoluto
median_error = np.median(absolute_errors)
np.mean(absolute_errors)
mae = mean_absolute_error(y_val, y_pred_val)
mae / y_val.mean()

0.0885132853068843

In [119]:
# Root Mean Squared Error in logartimo
from sklearn.metrics import mean_squared_error
import numpy as np


best_model.fit(X_train, y_train)

y_pred_val = best_model.predict(X_val)


y_val_log = np.log(y_val)
y_pred_val_log = np.log(y_pred_val)

rmse_log = mean_squared_error(y_val_log, y_pred_val_log) ** 0.5

# Visualizzazione del risultato
print(f'Root Mean Squared Error (log) sul validation set: {rmse_log}')

Root Mean Squared Error (log) sul validation set: 0.1295785790097223


In [122]:
y_test = model.predict(X_test_dummies)
y_test

array([130667.  , 154914.75, 179249.68, ..., 152734.5 , 115902.75,
       219349.83])

In [123]:
sub = pd.read_csv('sample_submission.csv')
sub['SalePrice'] = y_test
print(sub)
sub.to_csv('sub_da_mandare_3.csv', index = None)

        Id  SalePrice
0     1461  130667.00
1     1462  154914.75
2     1463  179249.68
3     1464  188801.35
4     1465  203991.00
...    ...        ...
1454  2915   89238.00
1455  2916   91307.50
1456  2917  152734.50
1457  2918  115902.75
1458  2919  219349.83

[1459 rows x 2 columns]
