In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import lightgbm as lgbß

# --- Configuration ---
PROCESSED_DATA_FILE = 'proxy_training_data.csv'
MODEL_OUTPUT_FILE = 'reference_proxy_model.joblib'
RESULTS_OUTPUT_FILE = 'grid_search_results.csv'
# --- End Configuration ---

# Load the dataset
df = pd.read_csv(PROCESSED_DATA_FILE)

print("Dataset loaded successfully!")
df.head()

Dataset loaded successfully!


Unnamed: 0,text_id,source,candidate,target_score,sts_with_source,len_ratio_chars,len_ratio_words,abs_len_words,flesch_reading_ease,avg_syl_per_word,sentence_count
0,01-a2,Now NASA is working towards logging some of th...,NASA is now trying to find smaller asteroids. ...,0.841,0.7641,0.843823,0.938272,76,72.889068,1.197368,8
1,01-b1,Now NASA is working towards logging some of th...,NASA is now trying to record some of the small...,0.848,0.8374,0.979021,1.049383,85,71.772459,1.235294,5
2,02-a2,"Earthquakes damage all structures, including b...","Earthquakes can break things, like bridges. Lu...",0.7825,0.6584,0.787402,0.880952,37,73.540215,1.216216,3
3,02-b1,"Earthquakes damage all structures, including b...","Earthquakes can damage all buildings, includin...",0.9453,0.8578,0.877953,0.928571,39,54.67,1.410256,3
4,03-a2,"The Hunger Games are an annual event, which th...",The Hunger Games happen every year. The Capito...,0.6548,0.8438,0.853018,0.907895,69,79.274506,1.188406,6


In [5]:
feature_columns = [
    'sts_with_source', 'len_ratio_chars', 'len_ratio_words', 
    'abs_len_words', 'flesch_reading_ease', 'avg_syl_per_word', 
    'sentence_count'
]
target_column = 'target_score'

X = df[feature_columns]
y = df[target_column]

print(f"Using {len(feature_columns)} features for {len(X)} samples.")

Using 7 features for 1601 samples.


In [6]:
# We will define a list of models and their corresponding parameter grids
models_to_search = [
    {
        'name': 'Ridge',
        'estimator': Ridge(),
        'params': {
            'alpha': [0.1, 1.0, 10.0, 100.0]
        }
    },
    {
        'name': 'XGBoost',
        'estimator': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    },
    {
        'name': 'LightGBM',
        'estimator': lgb.LGBMRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'num_leaves': [20, 31]
        }
    }
]

In [7]:
# Define the cross-validation strategy
# We use 5 splits. The data is shuffled.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

all_results = []

print("Starting Grid Search with 5-fold Cross-Validation...")

for model_config in models_to_search:
    print(f"--- Tuning model: {model_config['name']} ---")
    
    # GridSearchCV will test all parameter combinations using cross-validation
    grid_search = GridSearchCV(
        estimator=model_config['estimator'],
        param_grid=model_config['params'],
        cv=cv_strategy,
        scoring='r2',  # We'll rank models by their R-squared score
        n_jobs=-1,     # Use all available CPU cores
        verbose=1
    )
    
    # Fit the grid search on the entire dataset
    # It handles the internal train/validation splits automatically
    grid_search.fit(X, y)
    
    # Store the results
    result = {
        'model_name': model_config['name'],
        'best_score_r2': grid_search.best_score_,
        'best_params': grid_search.best_params_
    }
    all_results.append(result)

print("\n✅ Grid Search complete!")

Starting Grid Search with 5-fold Cross-Validation...
--- Tuning model: Ridge ---
Fitting 5 folds for each of 4 candidates, totalling 20 fits
--- Tuning model: XGBoost ---
Fitting 5 folds for each of 8 candidates, totalling 40 fits
--- Tuning model: LightGBM ---
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.395627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1391
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.485575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1395
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.371649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1391
[LightGBM] [Info] Number of data points in the train set: 1281, number of used features: 7

In [8]:
# Create a DataFrame from the results
results_df = pd.DataFrame(all_results)

# Sort by the best score to find the winner
results_df = results_df.sort_values(by='best_score_r2', ascending=False)

# Save the results to a CSV for your records
results_df.to_csv(RESULTS_OUTPUT_FILE, index=False)

print(f"Grid Search results saved to {RESULTS_OUTPUT_FILE}")
print("\n--- Model Comparison ---")
print(results_df)

Grid Search results saved to grid_search_results.csv

--- Model Comparison ---
  model_name  best_score_r2                                        best_params
1    XGBoost       0.388698  {'learning_rate': 0.05, 'max_depth': 5, 'n_est...
2   LightGBM       0.382549  {'learning_rate': 0.05, 'n_estimators': 100, '...
0      Ridge       0.271309                                     {'alpha': 0.1}


In [9]:
# Get the name and best parameters of the winning model
best_model_config = results_df.iloc[0]
best_model_name = best_model_config['model_name']
best_params = best_model_config['best_params']

print(f"🏆 Winning Model: {best_model_name}")
print(f"Best Hyperparameters: {best_params}")

# Find the original estimator object
final_estimator = None
for model_info in models_to_search:
    if model_info['name'] == best_model_name:
        final_estimator = model_info['estimator']
        break

# Set the best parameters and retrain on ALL data
final_model = final_estimator.set_params(**best_params)
final_model.fit(X, y)

print("\nFinal model has been retrained on the full dataset.")

# Save the final, optimized model
joblib.dump(final_model, MODEL_OUTPUT_FILE)
print(f"💾 Final model saved to: {MODEL_OUTPUT_FILE}")

🏆 Winning Model: XGBoost
Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}

Final model has been retrained on the full dataset.
💾 Final model saved to: reference_proxy_model.joblib
