In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split


## Loading and Storing the Dataset

In [None]:
def load_data():
    """
    Read the competition's train and test CSV files.

    Each frame is returned with its columns rearranged into sorted order.

    Returns:
        tuple: (train dataframe, test dataframe).
    """
    frames = []
    for csv_path in (
        "/kaggle/input/System-Threat-Forecaster/train.csv",
        "/kaggle/input/System-Threat-Forecaster/test.csv",
    ):
        raw = pd.read_csv(csv_path)
        # Sorting the column labels gives both frames a predictable layout.
        frames.append(raw.reindex(sorted(raw.columns), axis=1))

    train_frame, test_frame = frames
    return train_frame, test_frame


## Exploratory Data Analysis (EDA)

In [None]:

def exploratory_data_analysis(df):
    """
    Print a quick exploratory summary of ``df``.

    Reports, in order:
    - columns that contain missing values, with their counts (ascending),
    - the number of columns per dtype,
    - the ``describe()`` statistics,
    - the number of fully duplicated rows.

    Nothing is returned and ``df`` is not modified.
    """
    print("Exploratory Data Analysis (EDA)")

    # Count nulls once and reuse the result for both the filter and the values.
    null_totals = df.isnull().sum()
    null_counts = null_totals[null_totals > 0].sort_values()
    print("Columns with Missing Values:\n", null_counts)

    print("Data Type Distribution:")
    print(df.dtypes.value_counts())

    print("Basic Statistical Summary (Numerical Data):")
    print(df.describe())

    print("Duplicate Rows Count:", df.duplicated().sum())
    print("EDA Completed!")
# Load the raw train/test frames into notebook-level names and print the EDA
# report for the training frame.
df, test_data = load_data()
exploratory_data_analysis(df)   

In [None]:
# Pie chart of the class balance of the 'target' column in the training frame.
plt.figure(figsize=(3, 2))
# Number of rows per distinct target value; the index holds the class labels.
target_dis = df['target'].value_counts()

# Each wedge is labelled with its class and annotated with its percentage share.
plt.pie(target_dis, labels=target_dis.index, autopct='%1.1f%%', startangle=90, 
        wedgeprops={'edgecolor': 'black'})

plt.title("Target Variable Distribution")
plt.show()


## Handling Missing Data

In [None]:
def handle_missing_values(df, test_data):
    """
    Impute missing values in the train and test frames.

    - Numeric columns (``int64`` / ``float64``, excluding ``target``) are
      filled with the column mean.
    - ``object`` columns are filled with the most frequent value.

    Both imputers are fitted on ``df`` only and then applied to ``test_data``,
    so the fill values come exclusively from the training data. The column
    lists are derived from ``df``; ``test_data`` must contain those columns.

    Note: both frames are modified in place and are also returned.

    Returns:
        tuple: (df, test_data) with missing values filled.
    """
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns.drop("target", errors="ignore")
    cat_cols = df.select_dtypes(include=["object"]).columns

    # SimpleImputer rejects an input with zero columns, so each imputer is
    # only used when there is at least one column of its kind.
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="mean")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
        test_data[num_cols] = num_imputer.transform(test_data[num_cols])

    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
        test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])

    print("Missing values handled successfully.")
    return df, test_data


## Data Preprocessing

In [None]:
def extract_time_features(df, test_data):
    """
    Swap the 'DateAS' column for a 'Month' column in both frames.

    'DateAS' is parsed as a datetime, its month number is stored in a new
    'Month' column, and 'DateAS' itself is then removed. Both frames are
    modified in place and returned.
    """
    frames = (df, test_data)

    # Parse the raw values into datetimes.
    for frame in frames:
        frame["DateAS"] = pd.to_datetime(frame["DateAS"])

    # Derive the month number from the parsed dates.
    for frame in frames:
        frame["Month"] = frame["DateAS"].dt.month

    # The source column is no longer needed once the month is extracted.
    for frame in frames:
        frame.drop(columns=["DateAS"], inplace=True)

    print("Time features extracted successfully.")
    return df, test_data

def encode_categorical_features(df, test_data):
    """
    Encode the ``object`` columns of the train and test frames.

    - Columns with at most 20 distinct training values are one-hot encoded
      (first level dropped).
    - Columns with more than 20 distinct training values are replaced by a
      ``<col>freq`` column holding how often each value occurs in ``df``.

    Every encoding is learned from ``df`` and then applied to ``test_data``:
    the one-hot levels are pinned to the training categories, so both frames
    get identical dummy columns and the same dropped baseline level, and the
    test frequency column is looked up in the training counts. A test value
    that never occurs in training gets all-zero dummies / a frequency of 0.

    ``test_data`` must contain every ``object`` column found in ``df``.
    The input frames are not modified.

    Returns:
        tuple: (encoded df, encoded test_data).
    """
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
    # Compute the cardinalities once instead of twice per column.
    cardinality = df[categorical_cols].nunique()
    low_cardinality_features = [col for col in categorical_cols if cardinality[col] <= 20]
    high_cardinality_features = [col for col in categorical_cols if cardinality[col] > 20]

    # One-Hot Encoding for low cardinality.
    # Encoding each frame independently lets get_dummies pick a different
    # set of levels (and a different dropped baseline) for train and test.
    # Converting both to a categorical dtype with the *training* levels first
    # makes get_dummies emit the same columns for both frames.
    if low_cardinality_features:
        df = df.copy()
        test_data = test_data.copy()
        for col in low_cardinality_features:
            categories = df[col].astype("category").cat.categories
            df[col] = pd.Categorical(df[col], categories=categories)
            test_data[col] = pd.Categorical(test_data[col], categories=categories)

    df = pd.get_dummies(df, columns=low_cardinality_features, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=low_cardinality_features, drop_first=True)

    # Drop any remaining train-only columns, but always keep the target.
    test_columns = set(test_data.columns)
    train_only = [col for col in df.columns if col not in test_columns and col != 'target']
    df = df.drop(columns=train_only)

    # Frequency Encoding for high cardinality, using training counts for both
    # frames so the same category maps to the same value everywhere.
    for col in high_cardinality_features:
        freq = df[col].value_counts()
        df[col + 'freq'] = df[col].map(freq)
        # Values without a training count have a training frequency of zero.
        test_data[col + 'freq'] = test_data[col].map(freq).fillna(0)

    # Drop original high-cardinality categorical features
    df = df.drop(columns=high_cardinality_features, errors="ignore")
    test_data = test_data.drop(columns=high_cardinality_features, errors="ignore")
    print("categorical features encoded successfully.")

    return df, test_data

def scale_features(df, test_data):
    """
    Scale the numeric feature columns with a RobustScaler.

    The columns scaled are the ``int64`` / ``int32`` / ``float64`` columns of
    ``df`` other than ``target``. The scaler is fitted on ``df`` only; the
    same fitted transform is then applied to ``test_data`` so that train and
    test values end up on an identical scale.

    Note: both frames are modified in place and are also returned.

    Returns:
        tuple: (df, test_data) with the numeric columns scaled.
    """
    num_cols = df.select_dtypes(include=["int64",'int32', "float64"]).columns.drop("target", errors="ignore")

    # Nothing to scale (and the scaler rejects a zero-column input).
    if len(num_cols) > 0:
        scaler = RobustScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])
        # transform (not fit_transform): re-fitting here would centre and
        # scale the test set with its own statistics instead of the training ones.
        test_data[num_cols] = scaler.transform(test_data[num_cols])

    print("Features scaled successfully.")
    return df, test_data


## Feature Engineering and Extraction

In [None]:

def remove_low_cardinality_features(df, test_df=None):
    """
    Drop the columns of ``df`` that hold exactly one distinct non-null value.

    The same columns are removed from ``test_df`` when it is given; columns
    that do not exist in ``test_df`` are skipped rather than raising.

    Args:
        df: Training dataframe used to decide which columns are constant.
        test_df: Optional test dataframe to keep aligned with ``df``.

    Returns:
        tuple: (df without the constant columns, test_df without them, or
        ``None`` if no test frame was passed).
    """
    # nunique() ignores NaN, so a column with one value plus NaNs also counts as 1.
    cardinality = df.nunique()
    low_cardinality = cardinality[cardinality == 1].index.tolist()

    df = df.drop(columns=low_cardinality)
    if test_df is not None:
        test_df = test_df.drop(columns=low_cardinality, errors="ignore")

    return df, test_df


def remove_high_correlation_features(df, threshold=0.993, target="target"):
    """
    Drop features that are almost perfectly correlated with an earlier feature.

    Columns are visited in their existing order. A column is dropped when its
    absolute correlation with any column already kept exceeds ``threshold``;
    otherwise it is kept. This makes the outcome deterministic, and exact
    duplicates (correlation of 1) are removed as well.

    The ``target`` column is excluded from the comparison, so it is never
    dropped and never causes a feature to be dropped.

    Args:
        df: Dataframe whose columns are all usable by ``DataFrame.corr``.
        threshold: Absolute correlation above which a feature is redundant.
        target: Name of the label column to leave untouched (ignored if absent).

    Returns:
        pd.DataFrame: ``df`` without the redundant feature columns.
    """
    features = df.drop(columns=[target], errors="ignore")
    corr_matrix = features.corr().abs()
    corr_values = corr_matrix.to_numpy()

    kept_positions = []
    features_to_drop = []
    for position, column in enumerate(corr_matrix.columns):
        # Only compare against columns that are being kept; NaN correlations
        # (e.g. constant columns) compare as False and never trigger a drop.
        if kept_positions and (corr_values[kept_positions, position] > threshold).any():
            features_to_drop.append(column)
        else:
            kept_positions.append(position)

    df = df.drop(columns=features_to_drop)
    print("Removed highly correlated pair features.")
    return df

def filter_low_correlation_features(df, target="target", threshold=0.005):
    """
    Keep only the columns whose absolute correlation with ``target`` is at
    least ``threshold``.

    Columns for which no correlation value is produced, or whose correlation
    is NaN, are not retained. The returned frame is a column selection of ``df``.
    """
    print(f"Removed low correlated features.")
    target_corr = df.corr()[target]
    strong_enough = target_corr.abs() >= threshold
    retained_columns = target_corr.index[strong_enough]

    return df[retained_columns]


def clean_column_names(df, test_data):
    """
    Normalise the column labels of both frames.

    Every character that is not an ASCII letter or digit is turned into an
    underscore. The labels are replaced in place on the frames passed in,
    and the same frames are returned.
    """
    disallowed = "[^a-zA-Z0-9]"
    for frame in (df, test_data):
        frame.columns = frame.columns.str.replace(disallowed, "_", regex=True)

    print("Column names cleaned successfully.")
    return df, test_data

def feature_selection_variance(X, threshold=0.0005):
    """
    Remove features whose variance does not exceed ``threshold``.

    Args:
        X: Feature dataframe.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        pd.DataFrame: The surviving columns, carrying the original column
        names and the original row index of ``X``.
    """
    selector = VarianceThreshold(threshold=threshold)
    X_selected = selector.fit_transform(X)
    selected_columns = X.columns[selector.get_support()]

    print("Low-variance features removed successfully.")
    # Re-attach X's index: the selector returns a bare array, and rebuilding
    # the frame without it would reset the rows to 0..n-1 and break label
    # alignment with any Series that still carries the original index.
    return pd.DataFrame(X_selected, columns=selected_columns, index=X.index)


## Splitting Data

In [None]:
def prepare_final_datasets(df, test_data, target="target"):
    """
    Align train and test on their shared features, apply variance-based
    feature selection to the training features, and make a hold-out split.

    Args:
        df: Processed training dataframe, including the ``target`` column.
        test_data: Processed test dataframe.
        target: Name of the label column in ``df``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, test_data, X, y, df)`` where
        - ``X_train, X_test, y_train, y_test`` are an 80/20 split of ``X``/``y``,
        - ``test_data`` is restricted to the features shared with ``df``,
        - ``X`` is the training feature frame after variance selection,
        - ``y`` is the label column,
        - ``df`` is the training frame restricted to shared features + target.
    """
    # Keep only features present in both frames. Walking df's columns (rather
    # than intersecting two sets) keeps the column order stable from run to
    # run, and the target is excluded so it cannot end up in the frame twice.
    test_columns = set(test_data.columns)
    common_features = [col for col in df.columns if col in test_columns and col != target]

    df = df[common_features + [target]]  # Retain target column in train dataset
    test_data = test_data[common_features]  # Keep only aligned features in test dataset

    # Separate features and target variable
    X = df.drop(columns=[target])
    y = df[target]

    X = feature_selection_variance(X)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, test_data ,X,y,df


In [None]:
def preprocess_pipeline(df,test_data):
    """
    Runs the entire preprocessing pipeline in sequence.

    Args:
        df: Raw training dataframe (must contain the 'target' column).
        test_data: Raw test dataframe.

    Returns:
        The tuple produced by ``prepare_final_datasets``:
        (X_train, X_test, y_train, y_test, test_data, X, y, df).
    """
    # Drop constant columns first so later steps do not process them.
    df, test_data = remove_low_cardinality_features(df, test_data)
    # Fill NaNs in numeric and object columns.
    df, test_data = handle_missing_values(df, test_data)
    # Replace 'DateAS' with a 'Month' column.
    df, test_data = extract_time_features(df, test_data)
    # Turn the remaining object columns into one-hot / frequency columns.
    df, test_data = encode_categorical_features(df, test_data)
    df, test_data = scale_features(df, test_data)
    # The two correlation filters only touch the training frame; test_data is
    # brought back in line inside prepare_final_datasets, which keeps only the
    # columns the two frames share.
    df = remove_high_correlation_features(df)
    df = filter_low_correlation_features(df, target="target", threshold=0.005)
    df, test_data = clean_column_names(df, test_data)

    return prepare_final_datasets(df, test_data, target="target")
# Run the preprocessing pipeline.
# NOTE: this rebinds the notebook-level `df` and `test_data` to the processed
# frames, so re-running this cell without re-running the loading cell first
# feeds already-processed data back into the pipeline.
X_train, X_test, y_train, y_test, test_data,X,y,df= preprocess_pipeline(df,test_data)


In [None]:
def feature_selection_lgb(X, y,X_train, y_train):
    """
    Rank features with a LightGBM classifier and keep those it actually used.

    An 80/20 split of ``X``/``y`` is made internally and the classifier is
    fitted on the 80% part; the ``X_train`` / ``y_train`` arguments are
    overwritten by that split and therefore have no effect. The importance
    table (top 20) is printed, the top 30 are plotted, and every feature with
    an importance greater than zero is retained.

    Returns:
        tuple: ``(X_selected, X_train, X_test, y_train, y_test)`` where
        ``X_selected`` holds the retained columns ordered by descending
        importance and the remaining items are a fresh 80/20 split of it.
    """
    # Internal split used only for fitting the ranking model.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    ranker = LGBMClassifier(n_estimators=300, learning_rate=0.08, verbose=-1, random_state=42)
    ranker.fit(X_train, y_train)

    # One row per feature, most important first.
    ranking = pd.DataFrame(
        {'Feature': X.columns, 'Importance': ranker.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    print("Top 20 Most Important Features:")
    print(ranking.head(20))

    # Horizontal bar chart of the 30 strongest features.
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=ranking.head(30), palette='viridis')
    plt.xlabel('Feature Importance Score', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.title('Top 30 Most Important Features in LightGBM', fontsize=14)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()

    # Features the model never split on have an importance of zero.
    retained = ranking.loc[ranking['Importance'] > 0, 'Feature']
    X_selected = X[retained]  # new frame; X itself is left untouched

    # Re-split so the returned train/test parts contain only the retained columns.
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    return X_selected, X_train, X_test, y_train, y_test



In [None]:
# Replace the notebook-level feature frame and its train/test split with the
# versions restricted to the features selected by LightGBM importance.
X, X_train, X_test, y_train, y_test= feature_selection_lgb(X, y ,X_train, y_train)


In [None]:
import math

# One KDE panel per selected feature (columns of X), comparing the feature's
# distribution for target == 0 against target == 1. Values are read from `df`.

# Define number of plots per row
num_cols = 4  
# Enough rows to hold one panel per feature.
num_rows = math.ceil(len(X.columns) / num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows))
axes = axes.flatten()  # Flatten axes for easy indexing

for i, fe in enumerate(X.columns):
    # Both class densities are drawn on the same axes so they can be compared.
    sns.kdeplot(df[df['target'] == 0][fe], label="Class 0", fill=True, color="blue", ax=axes[i])
    sns.kdeplot(df[df['target'] == 1][fe], label="Class 1", fill=True, color="red", ax=axes[i])
    
    axes[i].set_title(f"KDE Plot of {fe}")
    axes[i].set_xlabel(fe)
    axes[i].set_ylabel("Density")
    axes[i].legend()  # Add legend

# Remove empty subplots if the number of features is not a multiple of num_cols
# NOTE(review): `i` is the last index left over from the loop above, so this
# relies on X having at least one column.
for j in range(i + 1, len(axes)):  
    fig.delaxes(axes[j])  

plt.tight_layout()
plt.show()


## Model Evaluation and Comparison

In [None]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(name, y_true, y_pred, y_score=None):
    """
    Print accuracy, precision, recall, F1 and ROC-AUC for a binary classifier.

    Args:
        name: Label printed in the report header.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        y_score: Optional continuous scores for the positive class (e.g. the
            second column of ``predict_proba``). ROC-AUC measures how well the
            scores rank the samples, so it is computed from ``y_score`` when
            given. When omitted, it falls back to the hard labels in
            ``y_pred``, which evaluates only a single operating point.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

    print(f"\n🔹 {name} Performance:")
    print(f"   Accuracy:  {accuracy:.4f}")
    print(f"   Precision: {precision:.4f}")
    print(f"   Recall:    {recall:.4f}")
    print(f"   F1-score:  {f1:.4f}")
    print(f"   ROC-AUC:   {roc_auc:.4f}")
    print("-" * 40)

def compare_models(X_train, y_train, X_test, y_test):
    """
    Train LightGBM, CatBoost and XGBoost classifiers with comparable settings
    on the training split and print each model's metrics on the test split.

    Nothing is returned; the fitted models are discarded after evaluation.
    """
    models = {
        "LightGBM": LGBMClassifier(n_estimators=500, learning_rate=0.09, colsample_bytree=0.7, random_state=42, verbose=-1, n_jobs=-1),
        "CatBoost": CatBoostClassifier(iterations=500, learning_rate=0.09,verbose=0, random_seed=42, thread_count=-1),
        "XGBoost": XGBClassifier(n_estimators=500, learning_rate=0.09,colsample_bytree=0.7, random_state=42, n_jobs=-1)
    }

    for name, model in models.items():
        model.fit(X_train, y_train)  # Train
        y_pred = model.predict(X_test)  # Hard labels for the threshold metrics
        # Positive-class probability, so ROC-AUC is computed from a ranking
        # rather than from already-thresholded labels.
        y_score = model.predict_proba(X_test)[:, 1]
        evaluate_model(name, y_test, y_pred, y_score)  # Evaluate

# Fit the three candidate models on the training split and print their
# hold-out metrics side by side.
compare_models(X_train, y_train, X_test, y_test)


In [None]:

from scipy.stats import randint, uniform


def hyperparameter_tuning(X_train, y_train):
    """
    Run a small randomized search over XGBoost hyperparameters.

    Three parameter settings are sampled and scored by 2-fold cross-validated
    accuracy. The best setting is printed.

    Returns:
        XGBClassifier: A new, *unfitted* classifier configured with the best
        parameters found; the caller is responsible for fitting it.
    """
    param_dist = {
        'n_estimators': randint(500, 1000),  # integers drawn from [500, 1000)
        # scipy's uniform takes (loc, scale): values are drawn from [0.01, 0.04]
        'learning_rate': uniform(0.01, 0.03),
        'colsample_bytree':[0.7]
    }

    xgb = XGBClassifier(random_state=42, verbosity=0, n_jobs=-1)

    # refit=False: only best_params_ is used below, so there is no point in
    # having the search train one more model on the full data just to throw
    # it away.
    random_search = RandomizedSearchCV(
        xgb, param_dist, n_iter=3, cv=2, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42,
        refit=False
    )

    random_search.fit(X_train, y_train)
    print("Best Parameters:", random_search.best_params_)

    return XGBClassifier(**random_search.best_params_, random_state=42, verbosity=0)

def train_and_evaluate(X_train, y_train, X_test, y_test):
    """
    Tune an XGBoost classifier, fit it on the training split, print its
    metrics on the test split, and return the fitted model.
    """
    tuned_model = hyperparameter_tuning(X_train, y_train)
    tuned_model.fit(X_train, y_train)

    # Score the hard-label predictions on the held-out split.
    evaluate_model("XGBoost", y_test, tuned_model.predict(X_test))
    return tuned_model

# Tune, fit and evaluate the final XGBoost model; `model` is used for the
# submission predictions further down.
model = train_and_evaluate(X_train, y_train, X_test, y_test)


In [None]:
# Restrict the test frame to the selected features, in the same column order
# as the training feature frame X.
test_data = test_data[X.columns]

## Model Submission

In [None]:
# Predict on the aligned test frame and write the result into a copy of the
# sample submission file.
test_predictions =model.predict(test_data)  # Generate predictions
sample_submission = pd.read_csv('/kaggle/input/System-Threat-Forecaster/sample_submission.csv')
submission = sample_submission.copy()
# Positional assignment: assumes the sample submission rows are in the same
# order as the rows of the test frame — TODO confirm.
submission['target'] = test_predictions  

# Map every value to a plain 0/1 integer using a 0.5 cut-off.
# NOTE(review): presumably predict() already returns class labels, in which
# case this only normalises the dtype — confirm.
submission['target'] = submission['target'].apply(lambda x: 1 if x > 0.5 else 0)

# Save to CSV
submission.to_csv('submission.csv', index=False)

# Output file ready for submission
print("Submission file created.")
