In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV,learning_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


# Load your data
data = pd.read_csv('cleaned.csv')

# Define features and target
columns_to_exclude = ['indexing_status', 'label', 'url', 'ssl_issuer', 'ssl_subject', 'ssl_not_before', 'ssl_not_after', 'domain']
X = data.drop(columns=columns_to_exclude)
y = data['indexing_status']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns
print(categorical_cols)
print(numerical_cols)


# Encode categorical features using OneHotEncoder
# Define the preprocessor with OneHotEncoder for categorical columns and StandardScaler for numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

# Fit and transform the preprocessor on the feature set X
X_transformed = preprocessor.fit_transform(X)

# Display the transformed data shape
X_transformed.shape


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {'classifier__C': [0.1, 1, 10]}
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'classifier__n_estimators': [50, 100, 200]}
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']}
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 1],
            'classifier__max_depth': [3, 5, 7]
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {'classifier__n_neighbors': [3, 5, 7]}
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(),
        'params': {'classifier__n_estimators': [50, 100, 200]}
    },
    'Extra Trees': {
        'model': ExtraTreesClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 5, 7]
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7]
        }
    }
}




# Function for training and tuning models
def train_and_tune_model(name, model, params):
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())])
    grid_search = GridSearchCV(pipeline, params, cv=5,
    scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_, grid_search.best_params_

# Train and tune models
best_models = {}
for name, model_info in models.items():
    model, params = model_info['model'], model_info['params']
    best_model, best_params = train_and_tune_model(name, model, params)
    best_models[name] = best_model

# Prediction
y_preds = {}
for name, model in best_models.items():
    y_preds[name] = model.predict(X_test)

# Error Plots
plt.figure(figsize=(12, 6))
for name, y_pred in y_preds.items():
    error = np.abs(y_test - y_pred)
    plt.plot(error, label=name)
    plt.title('Error Plot')
    plt.xlabel('Sample Index')
    plt.ylabel('Absolute Error')
    plt.legend()
    plt.grid(True)
    plt.show()


# ROC Curves
plt.figure(figsize=(12, 6))
for name, model in best_models.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.grid(True)
    plt.show()


# Precision-Recall Curves
plt.figure(figsize=(12, 6))
for name, model in best_models.items():
    y_prob = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    plt.plot(recall, precision, label=name)
    plt.title('Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.grid(True)
    plt.show()


# Learning Curves
def plot_learning_curve(model, X, y):
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_mean, 'o-', color='g', label='Crossvalidation score')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')
    plt.xlabel('Training Set Size')
    plt.ylabel('Score')
    plt.title(f'Learning Curve for {model.__class__.__name__}')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()


# Plot learning curves for each model
for name, model in best_models.items():
    plot_learning_curve(model, X_train, y_train)

# Feature Correlation Matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(X.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Feature Correlation Matrix')
    plt.show()

# Calibration Curves
def plot_calibration_curve(model, X, y):
    y_prob = model.predict_proba(X)[:, 1]
    fraction_of_positives, mean_predicted_value = calibration_curve(y, y_prob, n_bins=10, strategy='uniform')

    plt.figure(figsize=(10, 6))
    plt.plot(mean_predicted_value, fraction_of_positives, 's-', label=f'{model.__class__.__name__}')
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean Predicted Value')
    plt.ylabel('Fraction of Positives')
    plt.title(f'Calibration Curve for {model.__class__.__name__}')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Plot calibration curves for each model
for name, model in best_models.items():
    plot_calibration_curve(model, X_test, y_test)

# Feature Importances
def plot_feature_importances(model, X):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
    
    plt.figure(figsize=(12, 8))
    plt.title(f'Feature Importances for {model.__class__.__name__}')
    plt.bar(range(X.shape[1]), importances[indices], align='center')
    plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
    plt.xlim([-1, X.shape[1]])
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.grid(True)
    plt.show()
    
# Plot feature importances for each model
for name, model in best_models.items():
    plot_feature_importances(model, X_train)

# Pairplot
    plt.figure(figsize=(12, 10))
    data_with_target = X.copy()
    data_with_target['target'] = y
    sns.pairplot(data_with_target, hue='target', diag_kind='kde')
    plt.suptitle('Pairplot of Features', y=1.02)
    plt.show()
    
# Confusion Matrix and Classification Reports
    plt.figure(figsize=(12, 12))
    for i, (name, model) in enumerate(best_models.items()):
        y_pred = model.predict(X_test)
        
    plt.subplot(3, 3, i+1)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
    xticklabels=['Class 0', 'Class 1'],
    yticklabels=['Class 0', 'Class 1'])
    plt.title(f'{name} Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

# Print Classification Reports
for name, model in best_models.items():
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, model.predict(X_test)))

2024-09-30 23:37:04,110 - INFO - Training and tuning: Logistic Regression


Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')
Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object')
Fitting 5 folds for each of 5 candidates, totalling 25 fits


2024-09-30 23:37:06,417 - ERROR - Error training Logistic Regression: Invalid parameter 'logisticregression' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', LogisticRegression(class_weight='balanced'))]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:06,418 - INFO - Training and tuning: SVM


Fitting 5 folds for each of 9 candidates, totalling 45 fits


2024-09-30 23:37:08,636 - ERROR - Error training SVM: Invalid parameter 'svc' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', SVC(class_weight='balanced', probability=True))]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:08,641 - INFO - Training and tuning: Random Forest


Fitting 5 folds for each of 3 candidates, totalling 15 fits


2024-09-30 23:37:11,030 - ERROR - Error training Random Forest: Invalid parameter 'randomforestclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', RandomForestClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:11,033 - INFO - Training and tuning: Gradient Boosting


Fitting 5 folds for each of 9 candidates, totalling 45 fits


2024-09-30 23:37:13,433 - ERROR - Error training Gradient Boosting: Invalid parameter 'gradientboostingclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', GradientBoostingClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:13,434 - INFO - Training and tuning: K-Nearest Neighbors


Fitting 5 folds for each of 3 candidates, totalling 15 fits


2024-09-30 23:37:15,762 - ERROR - Error training K-Nearest Neighbors: Invalid parameter 'kneighborsclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', KNeighborsClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:15,763 - INFO - Training and tuning: AdaBoost


Fitting 5 folds for each of 3 candidates, totalling 15 fits


2024-09-30 23:37:18,143 - ERROR - Error training AdaBoost: Invalid parameter 'adaboostclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', AdaBoostClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:18,147 - INFO - Training and tuning: Extra Trees


Fitting 5 folds for each of 3 candidates, totalling 15 fits


2024-09-30 23:37:20,654 - ERROR - Error training Extra Trees: Invalid parameter 'extratreesclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier', ExtraTreesClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'verbose'].
2024-09-30 23:37:20,655 - INFO - Training and tuning: XGBoost


Fitting 5 folds for each of 9 candidates, totalling 45 fits


2024-09-30 23:37:22,963 - ERROR - Error training XGBoost: Invalid parameter 'xgbclassifier' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  Index(['background_color', 'most_used_font_face', 'secure_protocol'], dtype='object')),
                                                 ('num', StandardScaler(),
                                                  Index(['page_load_time', 'image_percentage', 'video_percentage',
       'text_percentage', 'internal_links', 'external_links'],
      dtype='object'))])),
                ('classifier',
                 XGBCla...
                               feature_types=None, gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                           

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>