In [None]:
import gender_guesser.detector as gender
import matplotlib.pyplot as mp
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBRegressor

In [None]:


# Load datasets: read both CSV files (train first, then test) and keep the
# test ids aside for the submission file written at the end.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
test_ids = test['id'].copy()

# First-name gender detector used by impute_gender, plus working copies of
# both frames that the following steps modify.
d = gender.Detector()
df, df_test = train.copy(), test.copy()

# Fill in undisclosed genders from first names
def impute_gender(df, detector=None):
    """Replace 'Prefer not to say' in ``Gender`` with a guess from the first name.

    For every row whose ``Gender`` equals 'Prefer not to say', the first
    whitespace-separated token of ``Name`` is capitalized and passed to
    ``detector.get_gender``. A guess that contains 'male' but not 'female'
    (e.g. 'male' or 'mostly_male') becomes 'Male'. Every other outcome —
    a female-like guess, an inconclusive guess, or a missing/blank name —
    becomes 'Female', which is the fallback value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Gender' and 'Name' columns. It is modified in place.
    detector : object, optional
        Anything exposing ``get_gender(first_name) -> str``. Defaults to the
        notebook-level detector ``d``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    if detector is None:
        detector = d

    undisclosed = df['Gender'] == 'Prefer not to say'
    # Only the undisclosed rows are visited; all other rows are left untouched.
    for index, name in df.loc[undisclosed, 'Name'].items():
        # A missing (NaN/None) or blank name has no first token to look up.
        tokens = name.split() if isinstance(name, str) else []
        guess = detector.get_gender(tokens[0].capitalize()) if tokens else 'unknown'
        # 'female' contains the substring 'male', so it must be ruled out explicitly.
        is_male = 'male' in guess and 'female' not in guess
        df.at[index, 'Gender'] = 'Male' if is_male else 'Female'
    return df

# Resolve undisclosed genders in both frames.
df = impute_gender(df)
df_test = impute_gender(df_test)

# Additional Location-based features: mean age and number of (non-null) ages
# per location, computed within each frame separately.
for frame in (df, df_test):
    age_by_location = frame.groupby("Location")["Age"]
    frame["Mean_Age_by_Location"] = age_by_location.transform("mean")
    frame["Count_by_Location"] = age_by_location.transform("count")

# Drop the id, Name and Location columns from the modelling frames.
train = df.drop(columns=['id', 'Name', 'Location'])
test = df_test.drop(columns=['id', 'Name', 'Location'])

# Separate target
y_train = train['Addiction_Level']
X_train = train.drop(columns='Addiction_Level')

# Handle categorical variables: the categorical columns are listed explicitly;
# the numeric ones are the int64/float64 columns that are not in that list.
cat_cols = ['Gender', 'Phone_Usage_Purpose', 'School_Grade']
num_cols = [
    col
    for col in X_train.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]
# Enhanced feature engineering
def create_features(df):
    """Add the engineered feature columns to ``df`` and return it.

    Requires the columns 'Anxiety_Level', 'Depression_Level',
    'Time_on_Social_Media', 'Time_on_Gaming', 'Time_on_Education',
    'Daily_Usage_Hours', 'Sleep_Hours' and 'Screen_Time_Before_Bed'.

    The frame is modified in place; the same object is returned.

    Note that the quantile bins ('Usage_Bin', 'Sleep_Bin') are derived from the
    frame that is passed in, so the same raw value can land in different bins
    in different frames.
    """
    # Core interactions
    df['Mental_Health_Index'] = df['Anxiety_Level'] * 0.6 + df['Depression_Level'] * 0.4
    df['Total_Screen_Time'] = (
        df['Time_on_Social_Media'] + df['Time_on_Gaming'] + df['Time_on_Education']
    )

    # Advanced interactions
    df['Usage_Mental_Health'] = df['Daily_Usage_Hours'] * df['Mental_Health_Index']
    # The +1 / +0.01 offsets keep the denominators away from zero.
    df['Sleep_Quality'] = df['Sleep_Hours'] / (df['Screen_Time_Before_Bed'] + 1)
    df['Productivity_Ratio'] = df['Time_on_Education'] / (df['Total_Screen_Time'] + 0.01)

    # Non-linear transformations
    df['Log_Usage'] = np.log1p(df['Daily_Usage_Hours'])
    df['Sqrt_Anxiety'] = np.sqrt(df['Anxiety_Level'])
    df['Exp_Sleep'] = np.expm1(df['Sleep_Hours'] * 0.1)

    # Behavioral patterns (0/1 flags)
    df['Night_Owl'] = (df['Screen_Time_Before_Bed'] > 2).astype(int)
    df['Social_Dominant'] = (df['Time_on_Social_Media'] > df['Time_on_Gaming']).astype(int)

    # Binning with interactions.
    # duplicates='drop' merges coinciding quantile edges instead of raising
    # ValueError on heavily tied data; with unique edges the result is the
    # usual five bins.
    df['Usage_Bin'] = pd.qcut(df['Daily_Usage_Hours'], q=5, labels=False, duplicates='drop')
    df['Sleep_Bin'] = pd.qcut(df['Sleep_Hours'], q=5, labels=False, duplicates='drop')
    df['Usage_Sleep_Interaction'] = df['Usage_Bin'] * df['Sleep_Bin']

    return df

# Apply feature engineering
X_train = create_features(X_train)
test = create_features(test)
# NOTE(review): create_features derives its quantile bins from the frame it
# receives, so the bins in `test` are based on test-set quantiles.

# Refresh the numeric column list now that the engineered columns exist.
# The list built before feature engineering does not contain them, and the
# ColumnTransformer below only keeps the columns it is explicitly given
# (remainder defaults to 'drop'), so they would never reach the models.
num_cols = [
    col
    for col in X_train.select_dtypes(include='number').columns
    if col not in cat_cols
]

# Ensure test has all columns from train
for col in X_train.columns:
    if col not in test.columns:
        test[col] = 0

# Same column set and order as the training frame.
test = test[X_train.columns]

# Enhanced preprocessing
# Numeric branch: standardize, power-transform, then keep only the features
# whose LightGBM importance reaches 1.25x the mean importance.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('transformer', PowerTransformer()),
    ('selector', SelectFromModel(estimator=LGBMRegressor(), threshold='1.25*mean')),
])

# Categorical branch: dense one-hot encoding; categories not seen during
# fitting are ignored instead of raising.
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its branch. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Optimized base models (CPU-only)
# XGBRegressor is imported by name from xgboost at the top of the notebook;
# there is no `xgb` module alias, so the class is referenced directly.
xgb_model = XGBRegressor(
    n_estimators=2000,
    max_depth=8,
    learning_rate=0.015,
    subsample=0.6,
    colsample_bytree=0.6,
    gamma=0.2,
    reg_alpha=0.2,
    reg_lambda=0.2,
    random_state=42,
    tree_method='hist'  # histogram-based tree construction
)

# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0 —
# confirm whether row subsampling is actually intended here.
lgbm_model = LGBMRegressor(
    n_estimators=2000,
    max_depth=7,
    learning_rate=0.02,
    num_leaves=50,
    subsample=0.6,
    colsample_bytree=0.6,
    reg_alpha=0.2,
    reg_lambda=0.2,
    random_state=42,
    force_row_wise=True  # Added for stability
)

cat_model = CatBoostRegressor(
    iterations=2000,
    depth=6,
    learning_rate=0.025,
    l2_leaf_reg=0.5,
    random_seed=42,
    verbose=0,
    task_type='CPU'  # Explicitly set to CPU
)

# Two-level stacking approach
# Level 1: the three gradient-boosting regressors.
first_level = [
    ('xgb', xgb_model),
    ('lgbm', lgbm_model),
    ('cat', cat_model)
]

# Level 2: candidate meta-learners; one stacked pipeline is built per entry.
meta_models = {
    'hgb': HistGradientBoostingRegressor(
        max_iter=300,
        max_depth=5,
        learning_rate=0.05,
        random_state=42
    ),
    'kernel': KernelRidge(
        alpha=0.5,
        kernel='polynomial',
        degree=3,
        coef0=3
    )
}

# Create stacking models
stack_models = {}
for name, meta_model in meta_models.items():
    stack = Pipeline([
        # Pipeline stores its steps by reference, so every stack gets its own
        # unfitted copy of the preprocessor. Sharing one instance would let
        # fitting one pipeline silently refit the transformer of the other.
        ('preprocessor', clone(preprocessor)),
        ('stack', StackingRegressor(
            estimators=first_level,
            final_estimator=meta_model,
            cv=5,
            n_jobs=-1,
            passthrough=True  # meta-learner also receives the preprocessed features
        ))
    ])
    stack_models[name] = stack

# Hold out 5% of the training rows to score each stacked pipeline.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train, y_train, random_state=50, test_size=0.05
)

# Train and evaluate: fit every stack on the larger split, report its hold-out
# MSE, and collect its test predictions together with an inverse-MSE weight.
predictions, weights = [], []
for name, stack_pipeline in stack_models.items():
    stack_pipeline.fit(X_train_split, y_train_split)
    mse = mean_squared_error(y_val_split, stack_pipeline.predict(X_val_split))
    print(f"{name} MSE: {mse:.6f}")
    predictions.append(stack_pipeline.predict(test))
    weights.append(1 / mse)

# Weighted ensemble: normalize the inverse-MSE weights so they sum to 1, then
# average the per-model test predictions with those weights.
weights = np.array(weights) / sum(weights)
y_pred_test = np.sum([w * p for w, p in zip(weights, predictions)], axis=0)

# Full training with best model.
# Weights are inverse MSEs, so the best model (lowest hold-out MSE) is the one
# with the LARGEST weight; taking the minimum would select the worst model.
# `weights` follows the iteration order of `stack_models`.
weight_by_model = dict(zip(stack_models, weights))
best_model_name = max(weight_by_model, key=weight_by_model.get)
best_model = stack_models[best_model_name]
best_model.fit(X_train, y_train)

# Final prediction blending: 20% best model refit on all rows, 80% weighted ensemble.
final_pred = best_model.predict(test)
y_pred_test = 0.2 * final_pred + 0.8 * y_pred_test

# Post-processing: keep predictions inside the target range seen in training.
y_pred_test = np.clip(y_pred_test, y_train.min(), y_train.max())

# Cross-validation: 5-fold negative MSE of the selected pipeline on the full
# training data; the sign is flipped to report a plain MSE.
cv_scores = cross_val_score(
    best_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
final_mse = -cv_scores.mean()
print(f"\nFinal CV MSE: {final_mse:.6f}")

# Submission: pair the saved test ids with the blended predictions.
submission = pd.DataFrame({'id': test_ids}).assign(Addiction_Level=y_pred_test)
submission.to_csv('submission.csv', index=False)
print("Submission file generated")