In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the current working directory ('./'), which is where the os.walk call below starts

import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# =========================================================
import pandas as pd
import numpy as np
import ast
import re
from datetime import datetime
from dateutil import parser
from collections import Counter

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor

from textblob import TextBlob
from fuzzywuzzy import fuzz
from sklearn.preprocessing import RobustScaler


import gc
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
train_path = '/kaggle/input/bitfest-datathon-2025/train.csv'
test_path  = '/kaggle/input/bitfest-datathon-2025/test.csv'
sample_sub_path = '/kaggle/input/bitfest-datathon-2025/sample_submission.csv'

In [None]:
seed=42
def parse_list(x):
    """Parse the string representation of a list into a real Python list.

    Args:
        x: A cell value. Typically a string such as "['a', 'b']", a missing
            value, or an already-parsed list/tuple/set.

    Returns:
        list: The parsed items. An empty list is returned for missing values,
        for text that is not a valid Python literal, and for literals that are
        not list-like (e.g. "5"), so callers can always iterate the result.
    """
    # Already-parsed containers pass straight through. This keeps the function
    # idempotent when a cell is re-run on a frame that was parsed before, and
    # avoids the ambiguous truth value pd.isnull() yields for multi-item lists.
    if isinstance(x, (list, tuple, set)):
        return list(x)
    if pd.isnull(x):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval raises for bad input are swallowed
        return []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    # A scalar or dict literal is not a list of items
    return []

def extract_skill_overlap( row):
    """Return the Jaccard overlap between candidate and required skills.

    Reads `row['skills']` and `row['skills_required']` (each treated as an
    iterable of items; a missing key counts as empty) and returns
    |intersection| / |union| as a float, or 0.0 when either side is empty.
    """
    have = set(row['skills']) if 'skills' in row else set()
    need = set(row['skills_required']) if 'skills_required' in row else set()

    # Nothing to compare when one side has no skills at all
    if len(have) == 0 or len(need) == 0:
        return 0.0

    combined = have | need
    if len(combined) == 0:
        return 0.0
    shared = have & need
    return len(shared) / len(combined)

def extract_total_experience_months( row):
    """Return the whole months between `row['start_dates']` and `row['end_dates']`.

    Both values are passed through `parse_date`. The result is 0 when either
    date cannot be parsed or when the end falls before the start.
    """
    start_raw, end_raw = row['start_dates'], row['end_dates']
    began, finished = parse_date(start_raw), parse_date(end_raw)

    if began is None or finished is None:
        return 0

    # Calendar-month difference; days within the month are ignored
    months = 12 * (finished.year - began.year) + (finished.month - began.month)
    return months if months > 0 else 0

def parse_date(date_str):
    """Parse a date-like value into a datetime.

    Args:
        date_str: Value to parse; it is converted with str() before parsing.

    Returns:
        datetime | None: None for missing values and for text dateutil cannot
        parse. The current time is returned when the text contains "till" or
        "present" (case-insensitive), i.e. an ongoing position.
    """
    if pd.isnull(date_str):
        return None

    text = str(date_str)
    lowered = text.lower()
    if any(marker in lowered for marker in ('till', 'present')):
        return datetime.now()

    try:
        return parser.parse(text)
    except (ValueError, OverflowError, TypeError):
        # dateutil reports unparseable text as ParserError (a ValueError) and
        # out-of-range components as OverflowError. Anything else, including
        # KeyboardInterrupt, is a real problem and is allowed to propagate.
        return None

def extract_passing_year(val):
    """Extract the latest passing year from a (stringified) list of years.

    Args:
        val: A string such as "['2016', '2020']", an already-parsed list or
            tuple, or a missing value.

    Returns:
        int: The largest entry that consists only of decimal digits, or 0 when
        the value is missing, unparseable, not a list, or has no such entry.
    """
    if isinstance(val, (list, tuple)):
        # Already parsed (e.g. the cell was re-run); skip literal_eval
        yrs = val
    else:
        if pd.isnull(val):
            return 0
        try:
            yrs = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return 0

    if not isinstance(yrs, (list, tuple)):
        return 0

    # isdecimal() only accepts characters that int() can convert, so a stray
    # symbol such as a superscript digit skips that entry, not the whole row
    yrs_numeric = [int(str(item)) for item in yrs if str(item).isdecimal()]
    return max(yrs_numeric, default=0)


def extract_education_features(df):
    """Add education-derived features to `df` in place and return it.

    Columns read: 'degree_names', 'educational_results',
    'educationaL_requirements'.
    Columns added:
        education_level: ordinal 0-4 derived from keywords in the degree text
            (4 phd/doctorate, 3 master/mba, 2 bachelor/bsc,
            1 diploma/certificate, 0 otherwise or missing).
        numeric_result: first number found in 'educational_results', else 0.0.
        education_match: fuzz.ratio between the lower-cased degree text and the
            lower-cased requirement text, scaled to 0-1.
    """
    def get_education_level(degree):
        # Cells may be plain text, missing, or a list of degrees
        # (encode_categorical_features replaces 'degree_names' cells with lists,
        # so this also keeps the cell re-runnable after that step).
        if isinstance(degree, (list, tuple, set)):
            text = ' '.join(str(item) for item in degree)
        elif isinstance(degree, str):
            text = degree
        elif pd.isna(degree):
            return 0
        else:
            text = str(degree)

        text = text.lower()
        # Checked from highest to lowest so the top qualification wins
        if 'phd' in text or 'doctorate' in text:
            return 4
        if 'master' in text or 'mba' in text:
            return 3
        if 'bachelor' in text or 'bsc' in text:
            return 2
        if 'diploma' in text or 'certificate' in text:
            return 1
        return 0

    df['education_level'] = df['degree_names'].apply(get_education_level)

    def extract_numeric_result(x):
        # Containers and strings go to the regex; only scalar NaN/None is "missing"
        if not isinstance(x, (str, list, tuple, set)) and pd.isna(x):
            return 0.0
        matches = re.findall(r'\d+\.?\d*', str(x))
        return float(matches[0]) if matches else 0.0

    df['numeric_result'] = df['educational_results'].apply(extract_numeric_result)

    # Education-job requirement match score using fuzzy matching
    df['education_match'] = df.apply(
        lambda x: fuzz.ratio(
            str(x['degree_names']).lower(),
            str(x['educationaL_requirements']).lower()
        ) / 100.0,
        axis=1
    )

    return df

def extract_experience_features(df):
    """Add experience-derived features to `df` in place and return it.

    Columns read: 'experiencere_requirement', plus whatever
    `extract_total_experience_months` reads from each row.
    Columns added:
        required_years: years parsed from the requirement text (0 if none).
        total_experience_months: months of experience per row.
        experience_match: experience divided by the requirement, both in
            months, clipped to [0, 2].
    """
    df['required_years'] = df['experiencere_requirement'].apply(extract_years)

    # Calculate total experience in months
    df['total_experience_months'] = df.apply(
        extract_total_experience_months, axis=1
    )

    # The requirement is in years and the experience in months, so convert the
    # requirement to months before dividing. A zero requirement keeps its
    # previous treatment (denominator of 1) so the ratio stays finite.
    required_months = (df['required_years'] * 12).replace(0, 1)
    df['experience_match'] = (
        df['total_experience_months'] / required_months
    ).clip(0, 2)

    return df

def extract_years( text):
    """Return the first "<number> year" figure found in `text`, or 0.

    The text is lower-cased before matching; the number may be separated from
    the word "year" by whitespace and/or hyphens. Missing values give 0.
    """
    if pd.isna(text):
        return 0

    hit = re.search(r'(\d+)[\s-]*year', text.lower())
    if hit is None:
        return 0
    return int(hit.group(1))

def extract_skill_features( df):
    """Add skill-derived features to `df` in place and return it.

    'skills' and 'skills_required' are parsed with `parse_list`, used to build
    'skill_overlap', 'skill_coverage' and 'skill_importance_score', and then
    dropped from the frame.
    """
    for list_col in ('skills', 'skills_required'):
        df[list_col] = df[list_col].apply(parse_list)

    df['skill_overlap'] = df.apply(extract_skill_overlap, axis=1)

    def coverage(row):
        # Candidate skill count relative to the number of required skills
        n_required = len(row['skills_required'])
        if n_required > 0:
            return len(row['skills']) / max(n_required, 1)
        return 0

    df['skill_coverage'] = df.apply(coverage, axis=1)

    # How often each candidate skill appears across this frame
    skill_freq = Counter()
    for owned in df['skills']:
        skill_freq.update(owned)

    def importance(row):
        # Matched skills weighted by 1 / log2(frequency + 1): rarer skills count more
        if len(row['skills_required']) > 0:
            matched = set(row['skills']).intersection(set(row['skills_required']))
            return sum(1/np.log2(skill_freq[skill] + 1) for skill in matched)
        return 0

    df['skill_importance_score'] = df.apply(importance, axis=1)

    df.drop(['skills', 'skills_required'], axis=1, inplace=True)

    return df

def extract_text_features( df):
    """Add text-derived features to `df` in place and return it.

    Columns added:
        objective_sentiment: TextBlob polarity of str('career_objective').
        objective_length / resp_length: character lengths of 'career_objective'
            and 'responsibilities'.
        resp_match: fuzz.ratio between 'responsibilities' and
            'responsibilities.1', scaled to 0-1.
    """
    objective = df['career_objective']

    def polarity(text):
        return TextBlob(str(text)).sentiment.polarity

    df['objective_sentiment'] = objective.apply(polarity)

    # Character counts of the raw text columns
    df['objective_length'] = objective.str.len()
    df['resp_length'] = df['responsibilities'].str.len()

    def responsibilities_similarity(row):
        score = fuzz.ratio(
            str(row['responsibilities']),
            str(row['responsibilities.1'])
        )
        return score / 100.0

    df['resp_match'] = df.apply(responsibilities_similarity, axis=1)

    return df


def create_interaction_features( df):
    """Add pairwise products of the match scores to `df` in place and return it.

    Requires 'skill_overlap', 'education_match' and 'experience_match'.
    """
    # (new column, left factor, right factor)
    products = (
        ('skill_edu_interaction', 'skill_overlap', 'education_match'),
        ('skill_exp_interaction', 'skill_overlap', 'experience_match'),
        ('edu_exp_interaction', 'education_match', 'experience_match'),
    )
    for new_col, left, right in products:
        df[new_col] = df[left] * df[right]

    return df

# def encode_categorical_features( train_df, test_df, categorical_cols):
#     """Encode categorical features using Label Encoding"""
#     for col in categorical_cols:
#         if col in train_df.columns:
#             le = LabelEncoder()
#             train_df[f'{col}_encoded'] = le.fit_transform(train_df[col].fillna('unknown'))
#             test_df[f'{col}_encoded'] = le.transform(test_df[col].fillna('unknown'))
#     return train_df, test_df

# from sklearn.preprocessing import LabelEncoder

# def encode_categorical_features(train_df, test_df, categorical_cols):
#     """
#     Encode categorical features using Label Encoding.
#     If a category is present in the test set but not in the train set, it will be labeled as -1.
#     """
#     for col in categorical_cols:
#         if col in train_df.columns:
#             le = LabelEncoder()

#             # Combine unique categories from train and test to avoid unseen errors
#             all_categories = pd.concat([train_df[col], test_df[col]], axis=0).fillna('unknown').unique()
#             le.fit(all_categories)

#             # Transform train and test datasets
#             train_df[f'{col}_encoded'] = le.transform(train_df[col].fillna('unknown'))
#             test_df[f'{col}_encoded'] = le.transform(test_df[col].fillna('unknown'))

#     return train_df, test_df

def encode_categorical_features(train_df, test_df, categorical_cols):
    """
    One-hot encode columns whose cells can hold several categories (lists).

    For every column present in both frames, each cell is normalised to a
    non-empty list (text starting with '[' is parsed as a Python literal;
    anything that is not a non-empty list becomes ['unknown']). One 0/1 column
    named '<col>_<category>' is then added to both frames for every category
    seen in either frame. The original column is kept and now holds the lists.

    Args:
        train_df, test_df: Frames to encode; both are modified in place.
        categorical_cols: Column names to encode. Names missing from either
            frame are skipped with a message.

    Returns:
        tuple: (train_df, test_df), the same objects that were passed in.
    """
    def to_category_list(cell):
        # literal_eval only accepts literals, so cell text from the CSV can
        # never execute code (the previous eval() could).
        if isinstance(cell, str) and cell.startswith('['):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return ['unknown']
        if isinstance(cell, list) and len(cell) > 0:
            return cell
        return ['unknown']

    for col in categorical_cols:
        # Check both frames before touching the column: the debug prints below
        # index it, and the encoding needs it on both sides.
        if col not in train_df.columns or col not in test_df.columns:
            print(f"Skipping column {col}: not present in both train_df and test_df")
            continue

        print(f"Raw data sample for column {col} in train_df:")
        print(train_df[col].head(10))
        print(f"Raw data sample for column {col} in test_df:")
        print(test_df[col].head(10))

        train_df[col] = train_df[col].apply(to_category_list)
        test_df[col] = test_df[col].apply(to_category_list)

        # Combine unique categories from both train and test
        all_categories = pd.concat(
            [train_df[col].explode(), test_df[col].explode()], axis=0
        ).fillna('unknown').unique()

        # Create binary columns for each category
        for category in all_categories:
            train_df[f'{col}_{category}'] = train_df[col].apply(lambda x: 1 if category in x else 0)
            test_df[f'{col}_{category}'] = test_df[col].apply(lambda x: 1 if category in x else 0)

    return train_df, test_df





def extract_text_features_tf_idf_svd_scaled(train_df, test_df, text_cols, max_features=500, n_components=50):
    """Replace text columns with scaled TF-IDF + truncated-SVD components.

    For each column in `text_cols`, a TfidfVectorizer and a TruncatedSVD are
    fitted on the training text only and applied to both frames. The components
    are standardised with a scaler fitted on the training components and stored
    as '<col>_svd_scaled_<i>'. The original text columns are then dropped from
    both frames (in place).

    Returns:
        tuple: (train_df, test_df, svd_cols), where svd_cols lists the names of
        all component columns that were created.
    """
    scaler = StandardScaler()  # re-fitted for every text column
    svd_cols = []

    for col in text_cols:
        print(f"Processing TF-IDF for column: {col}")

        train_text = train_df[col].fillna('').astype(str)
        test_text = test_df[col].fillna('').astype(str)

        # Vocabulary and IDF weights are learned from the training text only
        tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=max_features,
            ngram_range=(1, 2)
        )
        tfidf.fit(train_text)
        train_tfidf = tfidf.transform(train_text)
        test_tfidf = tfidf.transform(test_text)

        # Project onto components fitted on the training matrix
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        svd.fit(train_tfidf)
        train_components = svd.transform(train_tfidf)
        test_components = svd.transform(test_tfidf)

        train_scaled = scaler.fit_transform(train_components)
        test_scaled = scaler.transform(test_components)

        # One output column per component
        for i in range(n_components):
            name = f'{col}_svd_scaled_{i}'
            train_df[name] = train_scaled[:, i]
            test_df[name] = test_scaled[:, i]
            svd_cols.append(name)

    # The raw text is no longer needed once it has been encoded
    for frame in (train_df, test_df):
        frame.drop(text_cols, axis=1, inplace=True)
    gc.collect()

    return train_df, test_df, svd_cols






def prepare_features( train_df, test_df,numeric_cols):
    """Fill missing values with 0 in the listed numeric columns.

    Each name in `numeric_cols` is filled in whichever of the two frames
    contains it; names found in neither frame are ignored. Both frames are
    modified in place and returned as (train_df, test_df).
    """
    for col in numeric_cols:
        for frame in (train_df, test_df):
            if col in frame.columns:
                frame[col] = frame[col].fillna(0)

    return train_df, test_df

In [None]:

# def emphasize_numerical_features(train_df, test_df, numerical_cols, svd_cols, weight_factor=2.0):
#     """
#     Emphasize numerical features by scaling them and optionally weighting them more heavily than SVD features.
#     """
#     # Initialize scalers
#     numerical_cols = [col for col in numerical_cols if col in train_df.columns and col in test_df.columns]
#     numerical_scaler = StandardScaler()
#     svd_scaler = StandardScaler()

#     # Scale and emphasize numerical features
#     train_df[numerical_cols] = numerical_scaler.fit_transform(train_df[numerical_cols])
#     test_df[numerical_cols] = numerical_scaler.transform(test_df[numerical_cols])
    
#     train_df[numerical_cols] *= weight_factor
#     test_df[numerical_cols] *= weight_factor

#     # Scale SVD features
#     train_df[svd_cols] = svd_scaler.fit_transform(train_df[svd_cols])
#     test_df[svd_cols] = svd_scaler.transform(test_df[svd_cols])

#     # Combine emphasized numerical and scaled SVD features into separate DataFrames for clarity
#     train_combined = train_df[numerical_cols + svd_cols + ['matched_score']].copy()
#     test_combined = test_df[numerical_cols + svd_cols].copy()

#     return train_combined, test_combined


def emphasize_numerical_features(train_df, test_df, numerical_cols, svd_cols, weight_factor=2.0):
    """
    Standardise numeric and SVD features, multiplying the numeric ones by `weight_factor`.

    The numeric set is the entries of `numerical_cols` found in both frames,
    plus every other int64/float64 column of `train_df` that is not an SVD
    column or 'matched_score' (this picks up the generated one-hot columns).
    Scalers are fitted on `train_df` and applied to `test_df`; both frames are
    modified in place.

    Returns:
        tuple: (train_combined, test_combined), copies holding the numeric and
        SVD columns; train_combined also carries 'matched_score'.
    """
    shared_numeric = [
        name for name in numerical_cols
        if name in train_df.columns and name in test_df.columns
    ]

    # Remaining int64/float64 training columns, i.e. the dynamically generated ones
    already_handled = shared_numeric + svd_cols + ['matched_score']
    extra_numeric = [
        name for name in train_df.columns
        if name not in already_handled and train_df[name].dtype in [np.int64, np.float64]
    ]
    all_numerical_cols = shared_numeric + extra_numeric

    print("Final numerical columns (including one-hot encoded):", all_numerical_cols)

    numerical_scaler = StandardScaler()
    svd_scaler = StandardScaler()

    # Standardise with training statistics, then apply the emphasis weight
    train_df[all_numerical_cols] = numerical_scaler.fit_transform(train_df[all_numerical_cols]) * weight_factor
    test_df[all_numerical_cols] = numerical_scaler.transform(test_df[all_numerical_cols]) * weight_factor

    # SVD components are standardised but not weighted
    train_df[svd_cols] = svd_scaler.fit_transform(train_df[svd_cols])
    test_df[svd_cols] = svd_scaler.transform(test_df[svd_cols])

    model_cols = all_numerical_cols + svd_cols
    train_combined = train_df[model_cols + ['matched_score']].copy()
    test_combined = test_df[model_cols].copy()

    return train_combined, test_combined





In [None]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def cross_validate_model(train_df, features, target, params_hgb, params_gb, test_df, n_folds=5, random_state=42):
    """
    Run shuffled K-Fold CV with a HistGradientBoostingRegressor and a GradientBoostingRegressor.

    In every fold both models are fitted on the training part, scored (MSE) on
    the held-out part, and used to predict `test_df`; test predictions are
    averaged over the folds.

    Args:
    - train_df (pd.DataFrame): Training data containing `features` and `target`.
    - features (list): Feature column names.
    - target (str): Target column name.
    - params_hgb (dict): Keyword arguments for HistGradientBoostingRegressor.
    - params_gb (dict): Keyword arguments for GradientBoostingRegressor.
    - test_df (pd.DataFrame): Test data containing `features`.
    - n_folds (int): Number of folds.
    - random_state (int): Seed for the fold shuffle.

    Returns:
    - final_test_preds (np.array): Mean of the two models' fold-averaged test predictions.
    - avg_mse (float): Mean of the two models' average fold MSEs.
    """
    def fit_and_score(estimator, X_fit, y_fit, X_holdout, y_holdout):
        # Fit, predict the held-out rows, and return (predictions, MSE)
        estimator.fit(X_fit, y_fit)
        holdout_pred = estimator.predict(X_holdout)
        return holdout_pred, mean_squared_error(y_holdout, holdout_pred)

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    n_train_rows, n_test_rows = len(train_df), len(test_df)
    oof_preds_hgb, oof_preds_gb = np.zeros(n_train_rows), np.zeros(n_train_rows)
    test_preds_hgb, test_preds_gb = np.zeros(n_test_rows), np.zeros(n_test_rows)
    fold_mses_hgb, fold_mses_gb = [], []

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f'\nTraining fold {fold + 1} / {n_folds}')

        fit_rows = train_df.iloc[train_idx]
        holdout_rows = train_df.iloc[val_idx]
        X_fit, y_fit = fit_rows[features], fit_rows[target]
        X_holdout, y_holdout = holdout_rows[features], holdout_rows[target]

        # --- HistGradientBoostingRegressor ---
        hgb = HistGradientBoostingRegressor(**params_hgb)
        y_pred_hgb, mse_hgb = fit_and_score(hgb, X_fit, y_fit, X_holdout, y_holdout)
        fold_mses_hgb.append(mse_hgb)
        print(f"Fold {fold + 1} HistGradientBoostingRegressor MSE: {mse_hgb:.6f}")
        oof_preds_hgb[val_idx] = y_pred_hgb
        test_preds_hgb += hgb.predict(test_df[features]) / n_folds

        # --- GradientBoostingRegressor ---
        gb = GradientBoostingRegressor(**params_gb)
        y_pred_gb, mse_gb = fit_and_score(gb, X_fit, y_fit, X_holdout, y_holdout)
        fold_mses_gb.append(mse_gb)
        print(f"Fold {fold + 1} GradientBoostingRegressor MSE: {mse_gb:.6f}")
        oof_preds_gb[val_idx] = y_pred_gb
        test_preds_gb += gb.predict(test_df[features]) / n_folds

    avg_mse_hgb = np.mean(fold_mses_hgb)
    avg_mse_gb = np.mean(fold_mses_gb)
    print(f"\nAverage HistGradientBoostingRegressor MSE: {avg_mse_hgb:.6f}")
    print(f"Average GradientBoostingRegressor MSE: {avg_mse_gb:.6f}")

    # Simple two-model blend
    final_test_preds = (test_preds_hgb + test_preds_gb) / 2

    return final_test_preds, (avg_mse_hgb + avg_mse_gb) / 2


In [None]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_sub_path)

print("Initial train shape:", train_df.shape)
print("Initial test shape:", test_df.shape)


In [None]:
# Columns that are more than 80% missing in train are removed from both frames
col_threshold = 0.8
cols_to_drop = []
for col in train_df.columns:
    if train_df[col].isnull().mean() > col_threshold:
        cols_to_drop.append(col)
for frame in (train_df, test_df):
    frame.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
print("Dropped columns:", cols_to_drop)

# Training rows need at least `row_threshold` non-null values to be kept
row_threshold = 5
before_rows = len(train_df)
train_df.dropna(thresh=row_threshold, axis=0, inplace=True)
after_rows = len(train_df)
print(f"Dropped {before_rows - after_rows} rows with < {row_threshold} non-null columns.")

print("Train shape after drops:", train_df.shape)
print("Test shape after drops:", test_df.shape)

In [None]:
print("\nExtracting features...")
# Each step adds its columns to the frame it receives and returns that frame,
# so train_df and test_df are updated in place by this loop.
feature_steps = (
    extract_education_features,
    extract_experience_features,
    extract_skill_features,
    extract_text_features,
    create_interaction_features,
)
for df in [train_df, test_df]:
    for step in feature_steps:
        df = step(df)


In [None]:
pd.set_option('display.max_rows', None)


In [None]:
#train_df.head()
train_df.info()
#train_df['educational_institution_name']
numerical = train_df.select_dtypes(include=['int64', 'float64','int32', 'float32'])
numerical_cols = numerical.columns.tolist()

In [None]:
train_df['result_types'].head(10)

In [None]:
# Define categorical columns
categorical_cols = [
    'educational_institution_name',
    'major_field_of_studies',
    'degree_names',
    'result_types',
    'professional_company_names',
    'role_positions',
    'certification_providers',
    'extra_curricular_organization_names'
]

# Encode categorical features
train_df, test_df = encode_categorical_features(train_df, test_df, categorical_cols)

In [None]:
train_df.info()

In [None]:

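# Define text columns. NOTE(review): the last entry appears to begin with an invisible U+FEFF (byte-order mark) character, presumably carried over from the CSV header; confirm before retyping that name by hand.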
text_cols = [
    'career_objective',
    'responsibilities',
    'educationaL_requirements',
    'related_skils_in_job',
    'certification_skills',
    'extra_curricular_activity_types',
    '﻿job_position_name'
]

# Encode text features using TF-IDF and SVD
train_df, test_df, svd_cols = extract_text_features_tf_idf_svd_scaled(
    train_df, test_df, text_cols, max_features=500, n_components=70
)





In [None]:
train_df.info()

In [None]:
numerical_cols

In [None]:
# Keep handles to the current frames under a second pair of names.
# NOTE: this binds new names to the same DataFrame objects; no copy is made,
# so later in-place changes to train_df/test_df show through train_st/test_st too.
train_st,test_st=train_df,test_df
train_st.info()

In [None]:
train_st,test_st=train_df,test_df

In [None]:
train_df.info()

In [None]:
train_df,test_df=train_st,test_st
train_df.info()

In [None]:
train_df, test_df = emphasize_numerical_features(train_df, test_df, numerical_cols, svd_cols, weight_factor=20.0)

In [None]:
train_df.info()

In [None]:
numerical_cols

In [None]:
# The target name is set here because this cell runs before the later cells
# that define it; without it a fresh Restart & Run All stops with a NameError.
target_col = 'matched_score'

# The target must never be clipped as if it were a feature
numerical_cols = [col for col in numerical_cols if col != target_col]
outlier_cols=numerical_cols+svd_cols
#outlier_cols

In [None]:

drop_cols = ['matched_score', 'ID']
target_col='matched_score'
feature_cols = [
    c for c in train_df.columns
    if c not in drop_cols
        and (train_df[c].dtype in [np.float64, np.int64, np.float32, np.int32])
]



# Fill missing numeric values
train_df, test_df = prepare_features(train_df, test_df,numerical_cols)
outlier_threshold = 3.0  # Define threshold for identifying outliers (z-score method)
        
def handle_outliers(df, columns, threshold=None):
    """Winsorize numeric columns at mean +/- threshold * std.

    Args:
        df: Frame to clip; modified in place.
        columns: Column names to process. Columns whose dtype is not one of
            float64/int64/float32/int32 are left untouched.
        threshold: Number of standard deviations to keep on each side. When
            None, the notebook-level `outlier_threshold` is used, which is the
            behaviour of the original two-argument call.

    Returns:
        The same `df` object, with the listed numeric columns clipped using
        statistics computed from `df` itself.
    """
    limit = outlier_threshold if threshold is None else threshold
    numeric_dtypes = [np.float64, np.int64, np.float32, np.int32]

    for col in columns:
        if df[col].dtype not in numeric_dtypes:
            continue

        # Compute the statistics once, before the column is overwritten
        center = df[col].mean()
        spread = df[col].std()

        # Winsorize extreme values (cap outliers)
        df[col] = np.clip(df[col], center - limit * spread, center + limit * spread)
    return df


# Apply outlier handling on both train and test sets
train_df = handle_outliers(train_df, outlier_cols)
test_df = handle_outliers(test_df, outlier_cols)

In [None]:
train_df.info()

In [None]:
train_st,test_st=train_df,test_df

In [None]:
train_df,test_df=train_st,test_st
train_df.info()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# def select_important_features(train_df, test_df, target, top_k=20):
#     """
#     Select top `k` features based on importance using a random forest.
#     """
#     # Split into features and target
#     X_train = train_df.drop(columns=[target])
#     y_train = train_df[target]
    
#     # Train a random forest to get feature importance
#     model = RandomForestClassifier(random_state=42)
#     model.fit(X_train, y_train)
    
#     # Get feature importance
#     feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
#     important_features = feature_importance.nlargest(top_k).index
    
#     # Filter top features
#     train_df = train_df[important_features]
#     test_df = test_df[important_features]
    
#     return train_df, test_df


# Fit a random forest on the current feature set to rank the features
model = RandomForestRegressor(random_state=42)
model.fit(train_df[feature_cols], train_df[target_col])

# Importance of each feature, labelled by column name
feature_importances = pd.Series(model.feature_importances_, index=feature_cols)
print("Feature Importances:")
print(feature_importances)

# Keep only the features whose importance exceeds the cut-off
is_important = feature_importances > 0.0002
important_features = list(feature_importances.index[is_important])
print(f"Selected features: {len(important_features)} out of {len(feature_cols)}")

# Narrow both frames to the kept features (train also keeps the target)
train_df = train_df[[*important_features, target_col]]
test_df = test_df[important_features]



In [None]:
print("md")

In [None]:
train_st_imp,test_st_imp=train_df,test_df


In [None]:
train_st_imp.info()

In [None]:
train_df,test_df=train_st_imp,test_st_imp

In [None]:


# Define target and features
target_col = 'matched_score'
if target_col not in train_df.columns:
    raise ValueError(f"{target_col} not found in train_df columns!")

# Drop rows with missing target
train_df = train_df.dropna(subset=[target_col])

In [None]:
X = train_df.copy()
y = train_df[target_col].values
X_test = test_df.copy()

# Define feature columns
drop_cols = ['matched_score', 'ID']
feature_cols = [
    c for c in train_df.columns
    if c not in drop_cols
        and (train_df[c].dtype in [np.float64, np.int64, np.float32, np.int32])
]
print(f"Total features: {len(feature_cols)}")

# Scale features

# Scale features
scaler = RobustScaler()
X[feature_cols] = scaler.fit_transform(X[feature_cols])
X_test[feature_cols] = scaler.transform(X_test[feature_cols])

# scaler = StandardScaler()
# X[feature_cols] = scaler.fit_transform(X[feature_cols])
# X_test[feature_cols] = scaler.transform(X_test[feature_cols])

# Re-assign after scaling
train_df = X
test_df = X_test

In [None]:
train_df.info()

In [None]:
numerical = train_df.select_dtypes(include=['int64', 'float64'])
numerical_cols = numerical.columns
Correlation = numerical.corr()

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping


from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

In [None]:
def create_neural_network(input_dim):
    """
    Build and compile a three-hidden-layer feed-forward regression network.

    Each hidden block is Dense -> BatchNormalization -> LeakyReLU(alpha=0.1)
    -> Dropout, with 256, 128 and 64 units; the output is a single linear unit.

    Args:
    - input_dim (int): Number of input features.

    Returns:
    - model: Keras Sequential model compiled with Adam (learning rate 0.001)
      and MSE as both loss and metric.
    """
    # (units, dropout rate) for each hidden block, in order
    hidden_blocks = ((256, 0.3), (128, 0.2), (64, 0.2))

    model = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_blocks):
        if position == 0:
            # Only the first layer declares the input size
            model.add(Dense(units, input_dim=input_dim))
        else:
            model.add(Dense(units))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dropout(drop_rate))

    # Linear activation for regression
    model.add(Dense(1, activation='linear'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mse']
    )

    return model


In [None]:
def cross_validate_neural_network(train_df, features, target, test_df, n_folds=5, random_state=42):
    """
    Run shuffled K-Fold CV with the network from `create_neural_network`.

    A fresh model is trained in every fold (up to 200 epochs, batch size 32,
    early stopping on validation loss with patience 10 and best weights
    restored), scored on the held-out part, and used to predict `test_df`.

    Args:
    - train_df (pd.DataFrame): Training data containing `features` and `target`.
    - features (list): Feature column names.
    - target (str): Target column name.
    - test_df (pd.DataFrame): Test data containing `features`.
    - n_folds (int): Number of folds.
    - random_state (int): Seed for the fold shuffle.

    Returns:
    - test_preds_nn (np.array): Test predictions averaged over the folds.
    - avg_mse_nn (float): Mean validation MSE over the folds.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    oof_preds_nn = np.zeros(len(train_df))
    test_preds_nn = np.zeros(len(test_df))
    fold_mses_nn = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f'\nTraining fold {fold + 1} / {n_folds}')

        fit_rows = train_df.iloc[train_idx]
        holdout_rows = train_df.iloc[val_idx]
        X_fit, y_fit = fit_rows[features].values, fit_rows[target].values
        X_holdout, y_holdout = holdout_rows[features].values, holdout_rows[target].values

        # New model and callback per fold so no state carries over
        model = create_neural_network(input_dim=len(features))
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        model.fit(
            X_fit, y_fit,
            validation_data=(X_holdout, y_holdout),
            epochs=200,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=1
        )

        # Score the held-out rows
        y_pred_nn = model.predict(X_holdout).flatten()
        mse_nn = mean_squared_error(y_holdout, y_pred_nn)
        fold_mses_nn.append(mse_nn)
        print(f"Fold {fold + 1} Neural Network MSE: {mse_nn:.6f}")
        oof_preds_nn[val_idx] = y_pred_nn

        # Each fold contributes an equal share to the test prediction
        fold_test_pred = model.predict(test_df[features].values).flatten()
        test_preds_nn += fold_test_pred / n_folds

    avg_mse_nn = np.mean(fold_mses_nn)
    print(f"\nAverage Neural Network MSE: {avg_mse_nn:.6f}")

    return test_preds_nn, avg_mse_nn

In [None]:




# Perform Cross-Validation
print("\n========== Starting Cross-Validation with Neural Network ==========")
test_preds_nn, mean_mse_nn = cross_validate_neural_network(
    train_df, 
    feature_cols, 
    target_col, 
    test_df
)

# Clip predictions to the [0, 1] range
test_preds_nn = np.clip(test_preds_nn, 0, 1)
# Prepare submission
print("\nPreparing Submission...")

# Make sure test_df carries an 'ID' column: rename 'id'/'identifier' if present,
# otherwise generate 1..N. Nothing below in this cell reads test_df['ID'];
# the submission takes its IDs from sample_submission.
if 'ID' not in test_df.columns:
    if 'id' in test_df.columns:
        test_df.rename(columns={'id': 'ID'}, inplace=True)
    elif 'identifier' in test_df.columns:
        test_df.rename(columns={'identifier': 'ID'}, inplace=True)
    else:
        print("No 'ID' column found in test_df. Generating a sequential ID column...")
        test_df['ID'] = range(1, len(test_df) + 1)

# Ensure sample_submission has matching 'ID' structure
if 'ID' not in sample_submission.columns:
    raise ValueError("Sample submission file does not contain 'ID' column. Please check the file structure.")

# Predictions are written into a copy of sample_submission by position.
# NOTE(review): this presumes sample_submission rows are in the same order as
# the test rows that produced test_preds_nn — confirm.
output_path='/kaggle/working/submission_final_nn.csv'
submission = sample_submission.copy()
submission['matched_score'] = test_preds_nn
submission.to_csv(output_path, index=False)
print(f"Submission saved: {output_path}")
print(f"Final Average MSE: {mean_mse_nn:.6f}")


In [None]:
Correlation.iloc[0]

In [None]:

params_hgb = {
    'loss': 'squared_error',
    'max_iter': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'random_state': seed,
    'n_iter_no_change': 50,       # Early stopping
    'tol': 1e-4,                   # Tolerance for early stopping
    'validation_fraction': 0.1,    # Fraction of data to use for early stopping
    'verbose': 0                   # Silent
}
params_gb = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'random_state': seed,
    'n_iter_no_change': 50,      # Early stopping
    'tol': 1e-4,                 # Tolerance for early stopping
    'validation_fraction': 0.1,  # Fraction of data to use for early stopping
    'verbose': 0                 # Silent
}

# Perform Cross-Validation
print("\n========== Starting Cross-Validation ==========")
test_preds, mean_mse = cross_validate_model(
    train_df, 
    feature_cols, 
    target_col, 
    params_hgb, 
    params_gb,
    test_df
)

# Clip predictions to the [0, 1] range
test_preds = np.clip(test_preds, 0, 1)
# Prepare submission
print("\nPreparing Submission...")

# Make sure test_df carries an 'ID' column: rename 'id'/'identifier' if present,
# otherwise generate 1..N. Nothing below in this cell reads test_df['ID'];
# the submission takes its IDs from sample_submission.
if 'ID' not in test_df.columns:
    if 'id' in test_df.columns:
        test_df.rename(columns={'id': 'ID'}, inplace=True)
    elif 'identifier' in test_df.columns:
        test_df.rename(columns={'identifier': 'ID'}, inplace=True)
    else:
        print("No 'ID' column found in test_df. Generating a sequential ID column...")
        test_df['ID'] = range(1, len(test_df) + 1)

# Ensure sample_submission has matching 'ID' structure
if 'ID' not in sample_submission.columns:
    raise ValueError("Sample submission file does not contain 'ID' column. Please check the file structure.")

# Predictions are written into a copy of sample_submission by position.
# NOTE(review): this presumes sample_submission rows are in the same order as
# the test rows that produced test_preds — confirm.
output_path='/kaggle/working/submission_final.csv'
submission = sample_submission.copy()
submission['matched_score'] = test_preds
submission.to_csv(output_path, index=False)
print(f"Submission saved: {output_path}")
print(f"Final Average MSE: {mean_mse:.6f}")

In [None]:
output_path='/kaggle/working/submission_final.csv'
submission.to_csv(output_path, index=False)

In [None]:
import csv

# Write the submission and a companion log file with the csv module.
# - `with` guarantees both handles are closed even if a write fails.
# - newline='' is what the csv module expects of files it writes to; without it
#   row terminators can be doubled on platforms that translate '\n'.
with open('submission.csv', 'w', encoding='utf-8', newline='') as file, \
        open('log.csv', 'w', encoding='utf-8', newline='') as log_file:
    # Create CSV writers
    submission_writer = csv.writer(file)
    log_writer = csv.writer(log_file)

    # Write headers
    submission_writer.writerow(['ID', 'matched_score'])
    log_writer.writerow(['ID', 'Agent ID', 'matched_score'])

    # Iterate column-wise instead of DataFrame.iterrows(): iterrows() upcasts every
    # row to a single dtype, which turns an integer ID into a float ("1" -> "1.0").
    ids = submission['ID'].tolist()
    scores = submission['matched_score'].tolist()
    for row_id, score in zip(ids, scores):
        submission_writer.writerow([row_id, score])
        log_writer.writerow([row_id, 'Agent_X', score])

print("Submission and log files written successfully.")


In [None]:
# Draw the `Correlation` matrix as an annotated heatmap on a wide canvas.
heatmap_options = dict(
    annot=True,        # print the value inside every cell
    cmap='coolwarm',
    fmt='.2f',         # two decimals per annotation
    linewidths=0.5,
)

plt.figure(figsize=(20, 7))
heatmap_ax = sns.heatmap(Correlation, **heatmap_options)
heatmap_ax.set_title('Correlation Matrix')
plt.show()


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping


from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

def create_denser_neural_network(input_dim):
    """
    Build and compile a three-hidden-layer feed-forward regressor.

    Every hidden stage is Dense -> BatchNormalization -> LeakyReLU(0.1) -> Dropout,
    with widths 256, 128 and 64; a single linear unit produces the output.

    Args:
    - input_dim (int): Number of input features.

    Returns:
    - model: Keras Sequential model compiled with Adam (lr=0.001) and MSE loss.
    """
    # (units, dropout rate) for each hidden stage; the widest stage drops the most.
    hidden_stages = [(256, 0.3), (128, 0.2), (64, 0.2)]

    model = Sequential()
    for stage_no, (units, drop_rate) in enumerate(hidden_stages):
        if stage_no == 0:
            # Only the first Dense layer is told the input width.
            model.add(Dense(units, input_dim=input_dim))
        else:
            model.add(Dense(units))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dropout(drop_rate))

    # Regression head: one unit, linear activation.
    model.add(Dense(1, activation='linear'))

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
    return model



def cross_validate_neural_network(train_df, features, target, test_df, n_folds=5, random_state=42,
                                  model_builder=None, epochs=100, batch_size=32, patience=10):
    """
    Perform K-Fold Cross-Validation with a Neural Network.

    A fresh model is trained on every fold with early stopping on the fold's
    validation loss; test predictions are averaged over the folds.

    Args:
    - train_df (pd.DataFrame): Training data containing `features` and `target`.
    - features (list): List of feature column names.
    - target (str): Target column name.
    - test_df (pd.DataFrame): Testing data; must contain the `features` columns.
    - n_folds (int): Number of folds for cross-validation.
    - random_state (int): Random state used to shuffle the folds.
    - model_builder (callable or None): Called as `model_builder(input_dim=...)` and
      must return a compiled Keras model. When None, `create_neural_network` is used.
    - epochs (int): Maximum number of training epochs per fold.
    - batch_size (int): Mini-batch size passed to `model.fit`.
    - patience (int): Early-stopping patience (epochs without `val_loss` improvement).

    Returns:
    - test_preds_nn (np.array): Test-set predictions averaged over the folds.
    - avg_mse_nn (float): Mean validation MSE across folds.
    """
    if model_builder is None:
        # NOTE(review): `create_neural_network` is not defined in this cell, which
        # defines `create_denser_neural_network` instead. Confirm which one is
        # intended, or pass the builder explicitly via `model_builder`.
        model_builder = create_neural_network

    # Initialize KFold
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Extract the arrays once; they do not change between folds.
    X_all = train_df[features].values
    y_all = train_df[target].values
    X_test = test_df[features].values

    # Accumulators for averaged test predictions and per-fold metrics
    test_preds_nn = np.zeros(len(test_df))
    fold_mses_nn = []

    # Perform K-Fold Cross-Validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f'\nTraining fold {fold + 1} / {n_folds}')

        # Split data into training and validation folds
        X_train_fold, y_train_fold = X_all[train_idx], y_all[train_idx]
        X_val_fold, y_val_fold = X_all[val_idx], y_all[val_idx]

        # Create a fresh Neural Network model for this fold
        model = model_builder(input_dim=len(features))

        # Early stopping; the best weights seen on val_loss are restored afterwards
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

        # Train the model
        model.fit(
            X_train_fold, y_train_fold,
            validation_data=(X_val_fold, y_val_fold),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping],
            verbose=1
        )

        # Score the fold on its validation split
        y_pred_nn = model.predict(X_val_fold).flatten()
        mse_nn = mean_squared_error(y_val_fold, y_pred_nn)
        fold_mses_nn.append(mse_nn)
        print(f"Fold {fold + 1} Neural Network MSE: {mse_nn:.6f}")

        # Each fold contributes 1/n_folds of the final test prediction
        test_preds_nn += model.predict(X_test).flatten() / n_folds

    # Calculate average MSE
    avg_mse_nn = np.mean(fold_mses_nn)
    print(f"\nAverage Neural Network MSE: {avg_mse_nn:.6f}")

    return test_preds_nn, avg_mse_nn


# Run the neural-network cross-validation on the prepared frames
print("\n========== Starting Cross-Validation with Neural Network ==========")
test_preds_nn, mean_mse_nn = cross_validate_neural_network(
    train_df, feature_cols, target_col, test_df
)

# Keep the predictions inside the [0, 1] range
test_preds_nn = np.clip(test_preds_nn, 0, 1)

# Build and save the submission file
print("\nPreparing Submission...")
if 'ID' not in test_df:
    raise ValueError("Test dataset does not contain 'ID' column. Please adjust the identifier column name.")

# assign() works on a copy, so sample_submission itself is left untouched.
# `output_path` is the destination set by the earlier submission cells.
submission = sample_submission.assign(matched_score=test_preds_nn)
submission.to_csv(output_path, index=False)
print(f"Submission saved: {output_path}")
print(f"Final Average MSE: {mean_mse_nn:.6f}")
