In [20]:
import pandas as pd
import numpy as np
import re
import ast

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

# LightGBM
!pip install lightgbm
import lightgbm as lgb

# For CatBoost, uncomment:
!pip install catboost
from catboost import CatBoostRegressor


Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/1c/e1/78e635a1e5f0066bd02a1ecfd658ad09fe30d275c65c2d0dd76fe253e648/catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/00/be/d59db2d1d52697c6adc9eacaf50e8965b6345cc143f671e1ed068818d5cf/graphviz-0.20.3-py3-none-any.whl.metadata
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 487.6 kB/s eta 0:03:29
   ---------------------------------------- 0.1/101.7 MB 871.5 kB/s eta 0:01:57
   --------------------------

In [11]:
!pip install xgboost
from xgboost import XGBRegressor



In [13]:
# Adjust these paths for your Kaggle setup:
train_path = "train.csv"
test_path = "test.csv"
sample_sub_path = "sample_submission.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_sub_path)

print("Initial train shape:", train_df.shape)
print("Initial test shape:", test_df.shape)

# A) Drop columns with >80% missing
col_threshold = 0.8
cols_to_drop = []
for col in train_df.columns:
    if train_df[col].isnull().mean() > col_threshold:
        cols_to_drop.append(col)

train_df.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
test_df.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
print("Dropped columns:", cols_to_drop)

# B) Drop rows with <5 non-null in train
row_threshold = 5
before = len(train_df)
train_df.dropna(thresh=row_threshold, axis=0, inplace=True)
after = len(train_df)
print(f"Dropped {before - after} rows with fewer than {row_threshold} non-null columns.")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Initial train shape: (7635, 35)
Initial test shape: (1909, 35)
Dropped columns: ['address', 'languages', 'proficiency_levels']
Dropped 0 rows with fewer than 5 non-null columns.
Train shape: (7635, 32)
Test shape: (1909, 32)


In [14]:
def parse_list(x):
    if pd.isnull(x):
        return []
    try:
        return ast.literal_eval(x)
    except:
        return []

if 'skills' in train_df.columns:
    train_df['skills'] = train_df['skills'].apply(parse_list)
if 'skills' in test_df.columns:
    test_df['skills'] = test_df['skills'].apply(parse_list)

if 'skills_required' in train_df.columns:
    train_df['skills_required'] = train_df['skills_required'].apply(parse_list)
if 'skills_required' in test_df.columns:
    test_df['skills_required'] = test_df['skills_required'].apply(parse_list)


In [15]:
def skill_overlap(row):
    """
    Calculate overlap between candidate's skills and job's required skills.
    """
    candidate_skills = set(row['skills']) if row.get('skills') else set()
    required_skills = set(row['skills_required']) if row.get('skills_required') else set()
    if not candidate_skills or not required_skills:
        return 0.0
    # For example, Jaccard similarity:
    intersection = candidate_skills.intersection(required_skills)
    union = candidate_skills.union(required_skills)
    jaccard = len(intersection) / float(len(union)) if union else 0.0
    return jaccard

if all(col in train_df.columns for col in ['skills','skills_required']):
    train_df['skill_overlap'] = train_df.apply(skill_overlap, axis=1)
    test_df['skill_overlap']  = test_df.apply(skill_overlap, axis=1)
else:
    # If the columns don't exist, create a dummy overlap = 0
    train_df['skill_overlap'] = 0.0
    test_df['skill_overlap'] = 0.0


In [16]:
def extract_numeric_years(text):
    if pd.isnull(text):
        return None
    match = re.search(r'(\d+)', str(text))
    if match:
        return float(match.group(1))
    return None

if 'experiencere_requirement' in train_df.columns:
    train_df['experience_required'] = train_df['experiencere_requirement'].apply(extract_numeric_years)
    test_df['experience_required'] = test_df['experiencere_requirement'].apply(extract_numeric_years)
if 'age_requirement' in train_df.columns:
    train_df['age_required'] = train_df['age_requirement'].apply(extract_numeric_years)
    test_df['age_required'] = test_df['age_requirement'].apply(extract_numeric_years)


In [17]:
text_cols = []
for col in ['career_objective', 'responsibilities', 'responsibilities.1', 'educationaL_requirements']:
    if col in train_df.columns:
        text_cols.append(col)

# Fill missing text fields with empty string
for col in text_cols:
    train_df[col] = train_df[col].fillna("")
    test_df[col] = test_df[col].fillna("")


In [18]:
target_col = 'matched_score'
if target_col not in train_df.columns:
    raise ValueError(f"Target column {target_col} not found in train_df")

# Drop rows with missing target
train_df = train_df.dropna(subset=[target_col])

# Numeric features we want to include:
numeric_features = []
for col in ['experience_required', 'age_required', 'skill_overlap']:
    if col in train_df.columns:
        numeric_features.append(col)

print("Using text columns:", text_cols)
print("Using numeric columns:", numeric_features)


Using text columns: ['career_objective', 'responsibilities', 'responsibilities.1', 'educationaL_requirements']
Using numeric columns: ['experience_required', 'age_required', 'skill_overlap']


In [19]:
X = train_df.copy()
y = train_df[target_col].values
X_test = test_df.copy()

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train set shape:", X_train.shape, "Val set shape:", X_val.shape, "Test set shape:", X_test.shape)


Train set shape: (6108, 35) Val set shape: (1527, 35) Test set shape: (1909, 35)


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

numeric_features = []
for col in ['experience_required','age_required','skill_overlap']:
    if col in train_df.columns:
        numeric_features.append(col)

# Sub-pipeline for numeric
num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Sub-pipelines for text columns
def make_tfidf_pipeline():
    return TfidfVectorizer(
        stop_words='english',
        max_features=1000  # adjust as needed
    )

transformers = []
for txt_col in text_cols:
    transformers.append((f"tfidf_{txt_col}", make_tfidf_pipeline(), txt_col))

# Add numeric pipeline
transformers.append(("numeric", num_pipeline, numeric_features))

preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='drop'
)

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    bagging_fraction=0.8,  # ~subsample
    feature_fraction=0.8,  # ~colsample_bytree
    random_state=42,
    n_jobs=-1
)

cat_model = CatBoostRegressor(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    rsm=0.8,            # ~colsample_bytree
    subsample=0.8,
    random_state=42,
    verbose=0           # no logging
)

# Create 3 pipelines
from sklearn.pipeline import Pipeline

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

pipeline_lgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb_model)
])

pipeline_cat = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', cat_model)
])

# Put them in a dict for easy iteration
models = {
    'XGBoost': pipeline_xgb,
    'LightGBM': pipeline_lgb,
    'CatBoost': pipeline_cat
}

# =========================================================
# 7. TRAIN & VALIDATE EACH MODEL
# =========================================================
results = {}
for name, pipe in models.items():
    print(f"\nTraining {name}...")
    pipe.fit(X_train, y_train)
    
    y_val_pred = pipe.predict(X_val)
    mse_val = mean_squared_error(y_val, y_val_pred)
    rmse_val = np.sqrt(mse_val)
    r2_val = r2_score(y_val, y_val_pred)
    
    results[name] = {
        'MSE': mse_val,
        'RMSE': rmse_val,
        'R2': r2_val
    }
    
    print(f"{name} Validation MSE:  {mse_val:.4f}")
    print(f"{name} Validation RMSE: {rmse_val:.4f}")
    print(f"{name} Validation R^2:  {r2_val:.4f}")

print("\nSummary of Validation Results:")
for name, metrics in results.items():
    print(f"{name}: MSE={metrics['MSE']:.4f}, RMSE={metrics['RMSE']:.4f}, R2={metrics['R2']:.4f}")

# =========================================================
# 8. (Optional) PREDICT ON TEST SET & CREATE SUBMISSIONS
#    We'll produce 3 separate CSV files for comparison.
# =========================================================
for name, pipe in models.items():
    # Predict
    test_preds = pipe.predict(X_test)
    
    # Clip if domain knowledge says matched_score in [0,1]
    test_preds = np.clip(test_preds, 0, 1)
    
    # Save submission
    submission_df = sample_submission.copy()
    submission_df['matched_score'] = test_preds
    out_name = f"submission_{name}.csv"
    submission_df.to_csv(out_name, index=False)
    print(f"{name} submission saved to {out_name}")


Training XGBoost...
XGBoost Validation MSE:  0.0141
XGBoost Validation RMSE: 0.1187
XGBoost Validation R^2:  0.4630

Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5364
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 1303
[LightGBM] [Info] Start training from score 0.658422
LightGBM Validation MSE:  0.0140
LightGBM Validation RMSE: 0.1183
LightGBM Validation R^2:  0.4669

Training CatBoost...
CatBoost Validation MSE:  0.0141
CatBoost Validation RMSE: 0.1186
CatBoost Validation R^2:  0.4647

Summary of Validation Results:
XGBoost: MSE=0.0141, RMSE=0.1187, R2=0.4630
LightGBM: MSE=0.0140, RMSE=0.1183, R2=0.4669
CatBoost: MSE=0.0141, RMSE=0.1186, R2=0.4647
XGBoost submission saved to submission_XGBoost.csv
LightGBM submissi