In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer


In [2]:
raw = pd.read_csv('../data/raw/owid-co2-data.csv')

In [3]:
g7 = ['United Kingdom', 'Canada', 'Japan', 'France', 'Germany', 'Italy', 'United States']
columns = ['year', 'country', 'gas_co2_per_capita', 'coal_co2_per_capita',
           'oil_co2_per_capita', 'temperature_change_from_ghg']

df = raw.dropna()
df = df[df['country'].isin(g7)].filter(columns).reset_index()

In [4]:
rename = {
    'gas_co2_per_capita' : 'gas', 
    'coal_co2_per_capita' : 'coal', 
    'oil_co2_per_capita' : 'oil',
    'temperature_change_from_ghg' : 'temp'
}
df = df.rename(columns=rename)
df.drop(columns=['index'], inplace=True)

# df = df.sort_values(by=['country', 'year'])
# df['gas-lag'] = df.groupby('country')['gas'].shift(1)
# df['coal-lag'] = df.groupby('country')['coal'].shift(1)
# df['oil-lag'] = df.groupby('country')['oil'].shift(1)
# df.dropna(inplace=True)

df.to_csv('../data/processed/temperature.csv', index=False)

In [5]:
train, test = train_test_split(df, test_size=0.2)
X_train = train.drop(columns=['temp'])
X_test = test.drop(columns=['temp'])
y_train = train['temp']
y_test = test['temp']

In [6]:
categorical_features = ["country"]
numerical_features = ["year", "gas", "coal", "oil"]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Create pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Ridge())
])

In [7]:
# Define hyperparameter search space
param_distributions = {
    "regressor__alpha": np.logspace(-3, 3, 100)  # Search alpha between 0.001 and 1000
}

# Perform Randomized Search
random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions, 
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1
)

# Fit the model
model = random_search.fit(X_train, y_train)

# Print best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'regressor__alpha': np.float64(0.005336699231206312)}
Best Score: 0.9912062036257316


In [8]:
with open("model.pkl", "wb") as file:
    pickle.dump(model, file)