In [None]:
import pandas as pd 


file_name = "ICLR.cc-2023-Conference.csv"
df = pd.read_csv(file_name)
print(df.iloc[0])

In [29]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# 2. Load the CSV Data
# Replace 'reviews.csv' with your actual CSV file path if different
# 3. Extract Numerical Features

# a. Define a helper function to extract numerical prefixes
def extract_numeric_prefix(s):
    """
    Extracts the leading integer from a string.
    If no integer is found, returns NaN.
    """
    match = re.match(r'^(\d+)', str(s).strip())
    return int(match.group(1)) if match else np.nan

# b. List of reviewers
reviewers = ['reviewer1', 'reviewer2', 'reviewer3', 'reviewer4']

# c. Columns to extract numeric prefixes from
prefix_columns = [
    'recommendation',
    'confidence',
    'correctness',
    'technical_novelty_and_significance',
    'empirical_novelty_and_significance'
]

# d. Extract numerical prefixes for each reviewer and specified columns
for reviewer in reviewers:
    for col in prefix_columns:
        original_col = f'{reviewer}_{col}'
        new_col = f'{reviewer}_{col}_num'
        if original_col in df.columns:
            df[new_col] = df[original_col].apply(extract_numeric_prefix)
        else:
            # If the expected column is missing, create it with NaN
            df[new_col] = np.nan

# e. Convert Date Columns to Numerical Features
# General date columns
date_columns = ['cdate', 'tmdate']

# Reviewer-specific date columns
for reviewer in reviewers:
    date_columns.extend([f'{reviewer}_cdate', f'{reviewer}_tmdate'])

for col in date_columns:
    if col in df.columns:
        # Convert to datetime, coercing errors to NaT
        df[col] = pd.to_datetime(df[col], errors='coerce')
        
        # Option 1: Convert to timestamp (seconds since epoch)
        # Handle potential NaT by filling with 0 or another placeholder
        df[f'{col}_timestamp'] = df[col].astype(np.int64) / 1e9
        df[f'{col}_timestamp'] = df[f'{col}_timestamp'].replace([np.inf, -np.inf], np.nan)
        
        # Option 2: Extract date components
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_hour'] = df[col].dt.hour
        df[f'{col}_minute'] = df[col].dt.minute
        df[f'{col}_second'] = df[col].dt.second

# f. Ensure Direct Numerical Columns are in Numeric Format
# For example: 'reviewer1_length', 'reviewer2_length', etc.
for reviewer in reviewers:
    length_col = f'{reviewer}_length'
    if length_col in df.columns:
        df[length_col] = pd.to_numeric(df[length_col], errors='coerce')

# 4. Handle Missing Values

# a. Identify all numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# b. Fill NaNs with the median of each numerical column
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# 5. Compute the Target Variable

# a. Extract numerical recommendation scores
recommendation_cols = [f'{reviewer}_recommendation_num' for reviewer in reviewers]

# b. Ensure recommendation columns exist
recommendation_cols = [col for col in recommendation_cols if col in df.columns]

# c. Compute variance across reviewers for each row
df['recommendation_variance'] = df[recommendation_cols].var(axis=1)

# 6. Prepare Features and Target

# a. Define target
target = 'recommendation_variance'

# b. Define feature columns
# Exclude original string columns and columns related to recommendations
excluded_columns = [
    'title', 'keywords', 'TL;DR', 'abstract', 'decision',
    'recommendation_variance'
]

# Additionally exclude original string-based reviewer columns
string_based_cols = []
for reviewer in reviewers:
    string_based_cols.extend([
        f'{reviewer}_recommendation',
        f'{reviewer}_confidence',
        f'{reviewer}_summary_of_the_paper',
        f'{reviewer}_strength_and_weaknesses',
        f'{reviewer}_clarity_quality_novelty_and_reproducibility',
        f'{reviewer}_summary_of_the_review',
        f'{reviewer}_flag_for_ethics_review',
        f'{reviewer}_cdate',
        f'{reviewer}_tmdate',
        f'{reviewer}_recommendation_num'
    ])
excluded_columns.extend(string_based_cols)

# Remove any excluded columns from numerical_cols
feature_cols = [col for col in numerical_cols if col not in excluded_columns]

feature_cols = []

for col in numerical_cols:
    if col not in excluded_columns:
        feature_cols.append(col)
        

# c. Prepare the feature matrix X and target vector y
X = df[feature_cols]
y = df[target]

# 7. (Optional) Feature Scaling

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Convert back to a DataFrame for easier handling (optional)
X_scaled = pd.DataFrame(X_scaled, columns=feature_cols)

# 8. (Optional) Train-Test Split and Model Training

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("=== Linear Regression Model Evaluation ===")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# 9. Save Preprocessed Data (Optional)

# If you wish to save the preprocessed features and target for future use:
# X.to_csv('features.csv', index=False)
# y.to_csv('target.csv', index=False)

# Alternatively, save the scaled features
# X_scaled.to_csv('features_scaled.csv', index=False)

=== Linear Regression Model Evaluation ===
Mean Squared Error (MSE): 3.0004
Root Mean Squared Error (RMSE): 1.7322
R-squared (R²): 0.0138


['reviewer1_length',
 'reviewer2_length',
 'reviewer3_length',
 'reviewer4_length',
 'reviewer1_confidence_num',
 'reviewer1_correctness_num',
 'reviewer1_technical_num',
 'reviewer1_empirical_num',
 'reviewer2_confidence_num',
 'reviewer2_correctness_num',
 'reviewer2_technical_num',
 'reviewer2_empirical_num',
 'reviewer3_confidence_num',
 'reviewer3_correctness_num',
 'reviewer3_technical_num',
 'reviewer3_empirical_num',
 'reviewer4_confidence_num',
 'reviewer4_correctness_num',
 'reviewer4_technical_num',
 'reviewer4_empirical_num',
 'cdate_timestamp',
 'cdate_year',
 'cdate_month',
 'cdate_day',
 'cdate_hour',
 'cdate_minute',
 'cdate_second',
 'tmdate_timestamp',
 'tmdate_year',
 'tmdate_month',
 'tmdate_day',
 'tmdate_hour',
 'tmdate_minute',
 'tmdate_second',
 'reviewer1_cdate_timestamp',
 'reviewer1_cdate_year',
 'reviewer1_cdate_month',
 'reviewer1_cdate_day',
 'reviewer1_cdate_hour',
 'reviewer1_cdate_minute',
 'reviewer1_cdate_second',
 'reviewer1_tmdate_timestamp',
 'revi