# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Load the train/test CSVs from the Google Drive folder used by this notebook.
DATA_DIR = '/content/drive/My Drive/santander-customer-transaction-prediction'

try:
    df1 = pd.read_csv(f'{DATA_DIR}/train.csv')
    df2 = pd.read_csv(f'{DATA_DIR}/test.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Please check the file paths and ensure the files exist in your Google Drive.")
    # Re-raise: every later statement needs df1/df2, so carrying on would only
    # fail further down with a less helpful NameError.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise


# Quick look at both frames. A bare expression such as `df1.head()` is only
# rendered when it is the last statement of a notebook cell; printing makes the
# preview visible regardless of how the file is run.
print(df1.head())
print(df2.head())
print(df1.shape)
print(df2.shape)

from ydata_profiling import ProfileReport

# One HTML profiling report per frame: (data, report title, output file).
for frame, report_title, html_path in (
    (df1, "Pandas Profiling Report", 'output1.html'),
    (df2, "Pandas Profiling Report2", 'output2.html'),
):
    profile = ProfileReport(frame, title=report_title)
    profile.to_file(output_file=html_path)


# Feature columns only: skip the first two columns of df1 and the first column
# of df2 (presumably ID_code/target and ID_code respectively, matching the
# columns dropped before modelling further down).
variable_cols_df1 = df1.columns[2:]
variable_cols_df2 = df2.columns[1:]


# EDA starts here.
# Keep only the variables present in both frames, in df1's column order, so each
# subplot compares the same column from both frames.
variable_cols = [col for col in variable_cols_df1 if col in variable_cols_df2]


# Grid layout: four plots per row, row count rounded up.
n_cols_per_row = 4
n_rows = (len(variable_cols) + n_cols_per_row - 1) // n_cols_per_row

if variable_cols:
    fig, axes = plt.subplots(n_rows, n_cols_per_row, figsize=(20, n_rows * 5))
    fig.suptitle('Distribution Comparison of Variables in df1 and df2', y=1.02)

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    # The grid always has at least len(variable_cols) cells, so zip() pairs
    # every variable with its own axes without any bounds check.
    for ax, col in zip(axes, variable_cols):
        sns.histplot(df1[col], ax=ax, color='skyblue', label='df1', kde=True)
        sns.histplot(df2[col], ax=ax, color='lightcoral', label='df2', kde=True)
        ax.set_title(col)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.legend()

    # Hide the unused cells of the last row. Slicing by the number of variables
    # avoids depending on a loop variable that only exists if the loop ran.
    for unused_ax in axes[len(variable_cols):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() cannot build a grid with zero rows, so skip plotting.
    print("No shared variable columns between df1 and df2; skipping distribution plots.")


# Mean of every feature column (df1 features start at column index 2, df2
# features at column index 1).
df1_means = df1.iloc[:, 2:].mean()
df2_means = df2.iloc[:, 1:].mean()

# Overlay the two densities of per-column means on one axes.
fig, ax = plt.subplots(figsize=(8, 6))
for col_means, frame_label, line_color in (
    (df1_means, 'df1', 'blue'),
    (df2_means, 'df2', 'red'),
):
    sns.kdeplot(col_means, ax=ax, label=frame_label, color=line_color)
ax.set(title='Distribution of column means for df1 and df2', xlabel='Mean values')
ax.legend()
plt.show()


# Mean across the feature columns of each row
# (df1 features start at column index 2, df2 features at column index 1).
df1_row_means = df1.iloc[:, 2:].mean(axis=1)
df2_row_means = df2.iloc[:, 1:].mean(axis=1)

# Overlay both row-mean densities on a single axes.
row_mean_fig, row_mean_ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(df1_row_means, ax=row_mean_ax, label='df1', color='blue')
sns.kdeplot(df2_row_means, ax=row_mean_ax, label='df2', color='red')
row_mean_ax.set_title('Distribution of row means for df1 and df2')
row_mean_ax.set_xlabel('Row mean value')
row_mean_ax.legend()
plt.show()


# Per-column extremes over the feature columns (leading ID columns skipped).
df1_col_min, df1_col_max = df1.iloc[:, 2:].min(), df1.iloc[:, 2:].max()
df2_col_min, df2_col_max = df2.iloc[:, 1:].min(), df2.iloc[:, 1:].max()

# One panel per statistic: (panel title, x-axis label, df1 values, df2 values).
col_extreme_panels = (
    ('Column-wise Min', 'Min value', df1_col_min, df2_col_min),
    ('Column-wise Max', 'Max value', df1_col_max, df2_col_max),
)

col_extreme_fig, col_extreme_axes = plt.subplots(1, 2, figsize=(12, 6))
for panel_ax, (panel_title, x_label, df1_vals, df2_vals) in zip(col_extreme_axes, col_extreme_panels):
    sns.kdeplot(df1_vals, ax=panel_ax, label='df1', color='blue')
    sns.kdeplot(df2_vals, ax=panel_ax, label='df2', color='red')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.legend()

plt.tight_layout()
plt.show()


# Per-row extremes over the feature columns (leading ID columns skipped).
df1_feature_block = df1.iloc[:, 2:]
df2_feature_block = df2.iloc[:, 1:]
df1_row_min = df1_feature_block.min(axis=1)
df2_row_min = df2_feature_block.min(axis=1)
df1_row_max = df1_feature_block.max(axis=1)
df2_row_max = df2_feature_block.max(axis=1)

row_extreme_fig, (row_min_ax, row_max_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: distribution of the smallest value found in each row.
sns.kdeplot(df1_row_min, ax=row_min_ax, label='df1', color='blue')
sns.kdeplot(df2_row_min, ax=row_min_ax, label='df2', color='red')
row_min_ax.set(title='Row-wise Min', xlabel='Min value')
row_min_ax.legend()

# Right panel: distribution of the largest value found in each row.
sns.kdeplot(df1_row_max, ax=row_max_ax, label='df1', color='blue')
sns.kdeplot(df2_row_max, ax=row_max_ax, label='df2', color='red')
row_max_ax.set(title='Row-wise Max', xlabel='Max value')
row_max_ax.legend()

plt.tight_layout()
plt.show()


# Feature-only views of both frames; the row-wise statistics section below
# reads these two names as well.
df1_cols = df1.iloc[:, 2:]
df2_cols = df2.iloc[:, 1:]

# Per-column skewness, standard deviation and kurtosis for each frame.
df1_skew_col, df2_skew_col = df1_cols.skew(), df2_cols.skew()
df1_std_col, df2_std_col = df1_cols.std(), df2_cols.std()
df1_kurt_col, df2_kurt_col = df1_cols.kurtosis(), df2_cols.kurtosis()

# One panel per statistic: (panel title, x-axis label, df1 values, df2 values).
col_shape_panels = (
    ("Column-wise Skewness", "Skewness", df1_skew_col, df2_skew_col),
    ("Column-wise Std Dev", "Std Dev", df1_std_col, df2_std_col),
    ("Column-wise Kurtosis", "Kurtosis", df1_kurt_col, df2_kurt_col),
)

col_shape_fig, col_shape_axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (panel_title, x_label, df1_vals, df2_vals) in zip(col_shape_axes, col_shape_panels):
    sns.kdeplot(df1_vals, ax=panel_ax, label="df1", color="blue")
    sns.kdeplot(df2_vals, ax=panel_ax, label="df2", color="red")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.legend()

plt.tight_layout()
plt.show()


# Per-row skewness, standard deviation and kurtosis across the feature columns
# (df1_cols / df2_cols are the feature-only views defined earlier in the file).
df1_skew_row, df2_skew_row = df1_cols.skew(axis=1), df2_cols.skew(axis=1)
df1_std_row, df2_std_row = df1_cols.std(axis=1), df2_cols.std(axis=1)
df1_kurt_row, df2_kurt_row = df1_cols.kurtosis(axis=1), df2_cols.kurtosis(axis=1)

# One panel per statistic: (panel title, x-axis label, df1 values, df2 values).
row_shape_panels = (
    ("Row-wise Skewness", "Skewness", df1_skew_row, df2_skew_row),
    ("Row-wise Std Dev", "Std Dev", df1_std_row, df2_std_row),
    ("Row-wise Kurtosis", "Kurtosis", df1_kurt_row, df2_kurt_row),
)

row_shape_fig, row_shape_axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (panel_title, x_label, df1_vals, df2_vals) in zip(row_shape_axes, row_shape_panels):
    sns.kdeplot(df1_vals, ax=panel_ax, label="df1", color="blue")
    sns.kdeplot(df2_vals, ax=panel_ax, label="df2", color="red")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.legend()

plt.tight_layout()
plt.show()


# --- Target-wise EDA ---------------------------------------------------------
# The per-target statistics are computed once and shared by every print and
# plot in this section; the row-wise skew/kurtosis passes over the whole frame
# are the slow part, so they should not be repeated.

# --- 1. Select feature columns and split by target ---
features = [col for col in df1.columns if col not in ['ID_code', 'target']]
df0_ = df1.loc[df1['target'] == 0, features]
df1_ = df1.loc[df1['target'] == 1, features]

# Statistic label -> name of the DataFrame method that computes it.
stat_methods = {
    'mean': 'mean',
    'min': 'min',
    'max': 'max',
    'std': 'std',
    'skew': 'skew',
    'kurt': 'kurtosis',
}

# --- 2. Column-wise statistics (one value per feature, per target class) ---
column_stats = {
    stat_name: pd.DataFrame({
        'target=0': getattr(df0_, func)(),
        'target=1': getattr(df1_, func)(),
    })
    for stat_name, func in stat_methods.items()
}

# --- 3. Row-wise statistics (one value per row, per target class) ---
row_stats = {
    stat_name: {
        'target=0': getattr(df0_, func)(axis=1),
        'target=1': getattr(df1_, func)(axis=1),
    }
    for stat_name, func in stat_methods.items()
}


def _plot_target_split(target0_values, target1_values, title, xlabel):
    """Overlay histogram + KDE of one statistic for target=0 (blue) and target=1 (red)."""
    plt.figure(figsize=(8, 5))
    sns.histplot(target0_values, kde=True, color='blue', label='target=0')
    sns.histplot(target1_values, kde=True, color='red', label='target=1')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.legend()
    plt.show()


# Print column means
print("Column means by target:")
print(column_stats['mean'])

# Draw distribution of row min values split by target
_plot_target_split(
    row_stats['min']['target=0'],
    row_stats['min']['target=1'],
    "Distribution of Row Minimum by Target",
    "Row min",
)

# COLUMN-WISE PLOTS wrt target
for stat, per_target in column_stats.items():
    _plot_target_split(
        per_target['target=0'],
        per_target['target=1'],
        f"Column-wise {stat.capitalize()} by Target",
        stat.capitalize(),
    )

# ROW-WISE PLOTS wrt target
for stat, per_target in row_stats.items():
    _plot_target_split(
        per_target['target=0'],
        per_target['target=1'],
        f"Row-wise {stat.capitalize()} by Target",
        stat.capitalize(),
    )

# Pearson correlation of every feature with the target, largest magnitude first.
# corrwith() correlates each feature column against the target directly, rather
# than building the full feature-by-feature correlation matrix only to read a
# single column of it.
corrs = (
    df1.drop(columns=['ID_code', 'target'])
    .corrwith(df1['target'])
    .rename('target')  # keep the Series name the printed output had before
    .sort_values(key=abs, ascending=False)
)
print("Correlation of all columns to target:")
print(corrs)

# Work on copies so the raw frames used by the EDA above stay untouched.
train1 = df1.copy()
test1 = df2.copy()

# New features: row-wise aggregates over the original feature columns.

# Positional selection of the original feature columns: train skips its first
# two columns, test its first one. The upper bound (202) keeps the selection
# away from any columns positioned past index 201.
features_df1 = train1.columns.values[2:202]
features_df2 = test1.columns.values[1:202]

# New column name -> DataFrame method computing that row-wise aggregate.
ROW_AGGREGATES = {
    'sum': 'sum',
    'min': 'min',
    'max': 'max',
    'mean': 'mean',
    'std': 'std',
    'skew': 'skew',
    'kurt': 'kurtosis',
    'med': 'median',
}


def _add_row_aggregates(frame, feature_cols):
    """Append one column per ROW_AGGREGATES entry to *frame*, in place.

    The feature sub-frame is materialised once and reused for every aggregate
    instead of being re-selected (and re-copied) for each new column. Because
    it is a separate selection, the columns added to *frame* along the way
    never feed into later aggregates.
    """
    feature_values = frame[feature_cols]
    for new_col, method in ROW_AGGREGATES.items():
        frame[new_col] = getattr(feature_values, method)(axis=1)


for df, idx in zip([train1, test1], [features_df1, features_df2]):
    _add_row_aggregates(df, idx)


import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# PREPARE DATA
X = train1.drop(columns=['ID_code', 'target'])  # Exclude ID/target
y = train1['target']

# PARAMETER GRID FOR TUNING
# Each list is the set of candidate values sampled for that parameter.
param_grid = {
    'num_leaves': [31, 50, 70, 100],
    'learning_rate': [0.01, 0.03, 0.05],
    'max_depth': [7, 10, 15, 20],
    'min_data_in_leaf': [100, 300, 500, 1000],
    'feature_fraction': [0.8, 0.9, 1.0],
    'bagging_fraction': [0.4, 0.8, 0.9, 1.0],
    'bagging_freq': [1, 2, 5],
    'lambda_l1': [0, 0.1, 0.5, 1.0],
    'lambda_l2': [0, 0.1, 0.5, 1.0],
    'n_estimators': [100, 300, 600],
}

# CV & SEARCH SETUP
# NOTE(review): both the estimator and the search ask for n_jobs=-1; check on the
# target machine whether this nested parallelism oversubscribes the CPU.
clf = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt', random_state=42, n_jobs=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    n_iter=30,  # Increase for more thorough search
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True,
    # Seed the candidate sampling so a rerun evaluates the same 30 settings,
    # in line with the fixed seeds used for the model and the CV splitter.
    random_state=42,
)

# FIT THE SEARCH
search.fit(X, y)

print("Best parameters:", search.best_params_)
print("Best cross-validated ROC-AUC:", search.best_score_)

import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

# Modelling inputs for the Optuna study: every column except the identifier and
# the label is a feature; the label column is the target.
non_feature_cols = ['ID_code', 'target']
X = train1.drop(non_feature_cols, axis=1)
y = train1['target']

def objective(trial):
    """Score one Optuna trial: validation ROC-AUC of a LightGBM classifier.

    Hyperparameters are drawn from fixed candidate lists via
    ``trial.suggest_categorical``. The model is fitted on a stratified 80/20
    split of the module-level ``X``/``y`` (fixed seed, so the split is the same
    for every trial) with early stopping on the validation part.

    Returns the ROC-AUC of the predicted positive-class probabilities on the
    validation part.
    """
    # Candidate values per tunable parameter; suggested in this order.
    search_space = {
        'num_leaves': [31, 50, 70, 100],
        'learning_rate': [0.01, 0.03, 0.05],
        'max_depth': [7, 10, 15, 20],
        'min_data_in_leaf': [100, 300, 500, 1000],
        'feature_fraction': [0.8, 0.9, 1.0],
        'bagging_fraction': [0.4, 0.8, 0.9, 1.0],
        'bagging_freq': [1, 2, 5],
        'lambda_l1': [0, 0.1, 0.5, 1.0],
        'lambda_l2': [0, 0.1, 0.5, 1.0],
        'n_estimators': [100, 300, 600],
    }

    # Settings shared by every trial, followed by this trial's sampled values.
    param = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'random_state': 42,
        'n_jobs': -1,
    }
    for param_name, choices in search_space.items():
        param[param_name] = trial.suggest_categorical(param_name, choices)

    # Stratified hold-out keeps the target distribution in both parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    model = lgb.LGBMClassifier(**param)
    stop_early = lgb.early_stopping(stopping_rounds=20, verbose=False)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[stop_early],
    )

    valid_proba = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, valid_proba)

# Create and run Optuna study to maximize ROC-AUC.
# The sampler is seeded so repeated runs propose the same sequence of trials,
# matching the fixed seeds used for the model and the data split.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

print("Best ROC-AUC:", study.best_value)
print("Best hyperparameters:", study.best_trial.params)

# Final model: fit on the full training set. Every column except the identifier
# and the label is used as a feature.
X1 = train1.drop(columns=['ID_code', 'target'])
y1 = train1['target']

# Hyperparameters for the final fit (fixed values, written out explicitly).
final_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 50,
    'learning_rate': 0.05,
    'max_depth': 20,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
    'random_state': 42,
    'n_jobs': -1,
    'n_estimators': 600,
}

model = lgb.LGBMClassifier(**final_params)
model.fit(X1, y1, eval_metric='auc')

# Select the test features by the training column names so the model sees
# exactly the columns it was fitted on, in the same order. A column missing
# from the test frame raises a KeyError here instead of yielding silently
# misaligned predictions.
X_test = test1[X1.columns]

# Predicted probability of the positive class from the fitted LightGBM model.
y_test_pred_proba = model.predict_proba(X_test)[:, 1]

# Prepare submission DataFrame
submission = pd.DataFrame({
    'ID_code': test1['ID_code'],
    'target': y_test_pred_proba
})

# Save submission
submission.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'")

# Rank the features of the fitted model by importance, highest first.
importances = model.feature_importances_   # default: 'split'
features = X1.columns
importance_df = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)


# Select top N features (change N to 30 or 50 as desired).
# importance_df is already sorted, so the first N rows are the top N; features
# with equal importance keep the order produced by that single sort.
N = 30
top_features = importance_df.head(N)['feature'].tolist()

# Prepare new training data with only top N features
X_top = train1[top_features]
y = train1['target']

# Train a second LightGBM model (model2) on the top-N features only, using the
# same hyperparameter values as the full-feature model.
model2 = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.05,
    max_depth=20,
    min_data_in_leaf=100,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l1=1.0,
    lambda_l2=1.0,
    random_state=42,
    n_jobs=-1,
    n_estimators=600,
)
model2.fit(X_top, y, eval_metric='auc')


# For prediction on test data (test1), use the same top features
X_test_top = test1[top_features]
test_preds = model2.predict_proba(X_test_top)[:, 1]
submission2 = pd.DataFrame({'ID_code': test1['ID_code'], 'target': test_preds})
submission2.to_csv('submission2.csv', index=False)

# Report the file that was actually written (submission2.csv).
print('model2 trained and submission2.csv saved!')
