<a href="https://colab.research.google.com/github/kwnstantinosRoumeliwtis/ML_course/blob/main/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Load the household feature table and the matching ground-truth table.
df = pd.read_csv("train_hh_features.csv")
target = pd.read_csv("train_hh_gt.csv")

# Features and labels are paired purely by row position below, so verify that
# both files describe the same rows before splitting them.
if len(df) != len(target):
    raise ValueError(
        f"Feature/target row count mismatch: {len(df)} vs {len(target)}"
    )
# Only checked when both files expose an `hhid` column.
if "hhid" in df.columns and "hhid" in target.columns:
    if not np.array_equal(df["hhid"].to_numpy(), target["hhid"].to_numpy()):
        raise ValueError("Feature and target rows are not in the same hhid order")

X = df.copy()
y = target["cons_ppp17"]

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Load the previously saved pipeline; later cells use its "preprocess" and
# "select" steps without re-fitting them.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code, so
# only load artifacts that come from a trusted source.
# NOTE(review): the pipeline is not fitted in this notebook. If it was fitted on
# rows that now fall in X_test, the test metrics below are optimistic; confirm.
best_pipe = joblib.load("preprocess_pipeline.joblib")

In [None]:
# Show which preprocessed columns are kept by the pipeline's "select" step.
pipeline_steps = best_pipe.named_steps
feature_names = pipeline_steps["preprocess"].get_feature_names_out()
mask = pipeline_steps["select"].get_support()  # keep-mask aligned with feature_names
selected_features = feature_names[mask]
print(selected_features)

['num__hhid' 'num__weight' 'num__strata' 'num__utl_exp_ppp17' 'num__hsize'
 'num__num_children5' 'num__num_children10' 'num__num_children18'
 'num__age' 'num__num_adult_female' 'num__num_adult_male'
 'num__sworkershh' 'num__share_secondary' 'num__sfworkershh'
 'num__region1' 'num__region2' 'num__region3' 'num__region5'
 'num__region6' 'num__region7' 'cat__male' 'cat__owner' 'cat__water'
 'cat__toilet' 'cat__sewer' 'cat__elect' 'cat__water_source'
 'cat__sanitation_source' 'cat__dweltyp' 'cat__employed' 'cat__educ_max'
 'cat__any_nonagric' 'cat__sector1d' 'cat__urban' 'cat__consumed100'
 'cat__consumed300' 'cat__consumed400' 'cat__consumed500'
 'cat__consumed600' 'cat__consumed700' 'cat__consumed800'
 'cat__consumed900' 'cat__consumed1000' 'cat__consumed1100'
 'cat__consumed1200' 'cat__consumed1300' 'cat__consumed1400'
 'cat__consumed1600' 'cat__consumed1700' 'cat__consumed1900'
 'cat__consumed2000' 'cat__consumed2100' 'cat__consumed2200'
 'cat__consumed2300' 'cat__consumed2400' 'cat__c

In [None]:
# Push both splits through the already-fitted pipeline steps: first the
# "preprocess" step, then the "select" step on its output.
preprocess_step = best_pipe.named_steps["preprocess"]
select_step = best_pipe.named_steps["select"]

X_train_proc, X_test_proc = (
    preprocess_step.transform(split) for split in (X_train, X_test)
)
X_train_sel, X_test_sel = (
    select_step.transform(split) for split in (X_train_proc, X_test_proc)
)
print(X_train_sel.shape, X_test_sel.shape)

(83387, 80) (20847, 80)


In [None]:
# Standardise the selected features; the scaler is fitted on the training
# split only and then applied to both splits.
# Note: X_train_sel / X_test_sel are overwritten here, so re-running this cell
# re-fits the scaler on data that has already been scaled.
scaler = StandardScaler().fit(X_train_sel)
X_train_sel = scaler.transform(X_train_sel)
X_test_sel = scaler.transform(X_test_sel)

In [None]:
# Baseline: ordinary least squares fitted on the untransformed target.
model = LinearRegression().fit(X_train_sel, y_train)
y_pred = model.predict(X_test_sel)

In [None]:
# Hold-out metrics for the linear baseline (same units as the target).
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(((y_test - y_pred) ** 2).mean())
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 4.290560968856828
RMSE: 7.108084410272669
R2: 0.4971165887383897


In [None]:
# Linear model trained with stochastic gradient descent.
# SGDRegressor shuffles the training data every epoch; without a fixed
# random_state the fitted coefficients (and the metrics below) change from
# run to run. The seed matches the one used for the train/test split.
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd_model.fit(X_train_sel, y_train)
y_pred_sgd = sgd_model.predict(X_test_sel)

In [None]:
# Hold-out metrics for the SGD-trained linear model.
mae = mean_absolute_error(y_test, y_pred_sgd)
r2 = r2_score(y_test, y_pred_sgd)
rmse = np.sqrt(((y_test - y_pred_sgd) ** 2).mean())
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 4.263301855102727
RMSE: 7.139087415729887
R2: 0.4927202148936243


In [None]:
# Same linear model, but fitted on log1p(target); predictions are mapped back
# to the original scale with expm1 before scoring.
# Note: this rebinds `model`, replacing the raw-target baseline fitted earlier.
y_train_log = np.log1p(y_train)
model = LinearRegression().fit(X_train_sel, y_train_log)
y_pred_log = model.predict(X_test_sel)
y_pred_final = np.expm1(y_pred_log)

In [None]:
# Hold-out metrics for the log-target linear model, computed after the
# predictions were transformed back with expm1.
mae = mean_absolute_error(y_test, y_pred_final)
r2 = r2_score(y_test, y_pred_final)
rmse = np.sqrt(((y_test - y_pred_final) ** 2).mean())
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 3.644527044722255
RMSE: 6.92813432223127
R2: 0.5222565351441358


In [None]:
# The network is trained on log1p(target), so transform both splits.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

# Features become float32 tensors; targets become float32 column vectors.
X_train_t = torch.tensor(X_train_sel, dtype=torch.float32)
X_test_t = torch.tensor(X_test_sel, dtype=torch.float32)
y_train_t = torch.tensor(y_train_log.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_t = torch.tensor(y_test_log.to_numpy(), dtype=torch.float32).unsqueeze(1)

# Mini-batches of 256 rows, reshuffled every epoch.
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=256)

class TabularNet(nn.Module):
    """Fully connected regressor for tabular inputs.

    Three hidden stages (256 -> 128 -> 64 units) followed by a single-output
    linear head. The first two stages use batch normalisation, and every
    stage ends with ReLU and dropout.

    Args:
        input_dim: number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in the same order they are applied, so the
        # Sequential indices (and state_dict keys) stay net.0 ... net.11.
        stage_one = [
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        stage_two = [
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        stage_three = [
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        head = [nn.Linear(64, 1)]
        self.net = nn.Sequential(*stage_one, *stage_two, *stage_three, *head)

    def forward(self, x):
        """Return one prediction per input row, shape (batch, 1)."""
        return self.net(x)

# Seed torch before the model is built so that weight initialisation, dropout
# masks and the DataLoader shuffle order are repeatable across runs.
torch.manual_seed(42)

epochs = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabularNet(input_dim=X_train_t.shape[1]).to(device)
# L1 loss on the log-scale targets, i.e. mean absolute error in log space.
criterion = nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)



# The evaluation tensors never change, so move them to the device (and undo the
# log transform on the targets) once instead of on every epoch.
# Note: the "Val" numbers printed below are computed on the test split.
X_test_device = X_test_t.to(device)
y_test_device = y_test_t.to(device)
real_targets = torch.expm1(y_test_device)

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean; weight it by the batch size so
        # the shorter final batch does not count as much as a full one.
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_loader.dataset)

    # ---- evaluation pass (no gradients, dropout/batch-norm in eval mode) ----
    model.eval()
    with torch.no_grad():
        log_preds = model(X_test_device)
        val_loss_log = criterion(log_preds, y_test_device).item()
        # Back-transform to the original target scale for a readable error.
        real_preds = torch.expm1(log_preds)
        mae_dollars = torch.abs(real_preds - real_targets).mean().item()

    # Report the first epoch and every tenth epoch afterwards.
    if (epoch + 1) % 10 == 0 or epoch == 0:
        print(f" Epoch {epoch+ 1:03d} | Train Log_MAE: {train_loss:.4f} | Val LOg-MAE: {val_loss_log:.4f} | Real Error: ${mae_dollars:.2f}")

 Epoch 001 | Train Log_MAE: 0.4511 | Val LOg-MAE: 0.3509 | Real Error: $4.29
 Epoch 010 | Train Log_MAE: 0.2732 | Val LOg-MAE: 0.2512 | Real Error: $3.33
 Epoch 020 | Train Log_MAE: 0.2555 | Val LOg-MAE: 0.2467 | Real Error: $3.29
 Epoch 030 | Train Log_MAE: 0.2483 | Val LOg-MAE: 0.2486 | Real Error: $3.32
 Epoch 040 | Train Log_MAE: 0.2433 | Val LOg-MAE: 0.2490 | Real Error: $3.31
 Epoch 050 | Train Log_MAE: 0.2406 | Val LOg-MAE: 0.2485 | Real Error: $3.30
 Epoch 060 | Train Log_MAE: 0.2376 | Val LOg-MAE: 0.2455 | Real Error: $3.25
 Epoch 070 | Train Log_MAE: 0.2348 | Val LOg-MAE: 0.2531 | Real Error: $3.40
 Epoch 080 | Train Log_MAE: 0.2336 | Val LOg-MAE: 0.2511 | Real Error: $3.32
 Epoch 090 | Train Log_MAE: 0.2327 | Val LOg-MAE: 0.2515 | Real Error: $3.34
 Epoch 100 | Train Log_MAE: 0.2307 | Val LOg-MAE: 0.2486 | Real Error: $3.29


In [None]:
# Final hold-out evaluation of the trained network on the original target scale.
model.eval()
with torch.no_grad():
    preds_log = model(X_test_t.to(device))
    preds_log = preds_log.cpu().numpy()

# The network outputs an (N, 1) column. Flatten the back-transformed
# predictions so they line up element-wise with y_test, like the prediction
# arrays of the earlier models, and no special-case reshape is needed.
y_pred_final = np.expm1(preds_log).ravel()

mae = mean_absolute_error(y_test, y_pred_final)
rmse = np.sqrt(np.mean((y_test - y_pred_final) ** 2))
r2 = r2_score(y_test, y_pred_final)
print("MAE:" , mae)
print("RMSE:", rmse)
print("R2:", r2)

MAE: 3.2924794161876436
RMSE: 6.233082940591587
R2: 0.6133055237791496
