In [7]:
import warnings

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# HalvingGridSearchCV is experimental: this enabling import has to run before
# the class can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from CustomPreprocessorSpacy import SpacyPreprocessor
# Filter out warnings
# NOTE: 'ignore' with no category or module restriction silences every
# warning issued in this process from this point on, so library deprecation
# or convergence warnings will not be displayed.
warnings.filterwarnings('ignore')


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn.compose import ColumnTransformer
import warnings
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV
import ast

# Filter out warnings
# NOTE: 'ignore' with no category or module restriction silences every
# warning issued in this process from this point on, so library deprecation
# or convergence warnings will not be displayed.
warnings.filterwarnings('ignore')

# Load the precomputed per-message features and the raw spam corpus, then
# assemble a single frame (`df_fe`) holding one column per feature plus the
# cleaned message text and the numeric label.

# Read the CSV files containing features and labels
df_fe = pd.read_csv('spam_bert_tfe.csv')
# Each 'message_features' cell is stored as the text of a Python literal;
# literal_eval parses it back without executing arbitrary code.
df_fe['message_features'] = df_fe['message_features'].apply(ast.literal_eval)
# Expand the parsed values into one column per feature. Building the frame
# from the list of rows avoids .apply(pd.Series), which constructs a Series
# object for every single row; index and integer column labels are the same.
df_fe = pd.DataFrame(df_fe['message_features'].tolist(), index=df_fe.index)

df = pd.read_csv("spam.csv", encoding='latin1')

# The label/message columns are attached to df_fe by index alignment below.
# A row-count mismatch would silently produce NaN labels, so fail early
# (before the comparatively expensive text preprocessing).
if len(df_fe) != len(df):
    raise ValueError(
        f"Feature file has {len(df_fe)} rows but spam.csv has {len(df)} rows; "
        "the two files must describe the same messages in the same order."
    )

# Keep only the necessary columns and rename them.
# rename() returns a new frame, so the column assignments below write to an
# independent object rather than to a slice of the raw data.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Convert labels to numerical values
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Preprocess text data using SpacyPreprocessor
sp = SpacyPreprocessor('en_core_web_sm')
df['message'] = sp.transform(df['message'].values)

# Combine the features and labels
df_fe['label'] = df['label']
df_fe['message'] = df['message']

# Convert column names to string so the integer feature labels and the
# 'label' / 'message' names all share one type.
df_fe.columns = df_fe.columns.astype(str)

# Put every row in a random order; the fixed seed keeps that order repeatable
sampled_df = df_fe.sample(frac=1, random_state=42)

# Target vector (y) is the 'label' column; the feature matrix (X) is
# everything else
y = sampled_df['label']
X = sampled_df.drop(columns='label')

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate estimators and their hyperparameter grids. The two lists are
# positional partners: classifiers[i] is tuned with param_grids[i].

# Initialize classifiers
# The tree-based models draw random numbers while fitting; a fixed seed keeps
# repeated runs comparable.
log_reg_clf = LogisticRegression()
svc_clf = SVC()
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)

# Define parameter grids for grid search
# Keys carry the 'classifier__' prefix because each estimator is tuned as the
# 'classifier' step of a Pipeline.
param_grids = [
    {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},  # C values for Logistic Regression
    {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__gamma': [0.01, 0.1, 1, 10, 100]},  # C and gamma values for SVC
    {'classifier__max_depth': [None, 5, 10, 15, 20, 25, 30]},  # max_depth values for Decision Tree
    # n_estimators and max_features values for Random Forest.
    # 'auto' is intentionally absent: scikit-learn deprecated it in 1.1 and
    # removed it in 1.3, and for classifiers it was only an alias of 'sqrt'.
    {'classifier__n_estimators': [10, 50, 100, 200, 500], 'classifier__max_features': ['sqrt', 'log2']}
]

# List of classifiers
classifiers = [log_reg_clf, svc_clf, dt_clf, rf_clf]

# zip() over these lists would silently drop unmatched entries, so make a
# length mismatch loud.
assert len(classifiers) == len(param_grids), "each classifier needs exactly one parameter grid"

# Running record of the best candidate found so far
best_weighted_f1_score = 0
best_model = best_clf_name = None

# Transformers: TF-IDF for the text column, standard scaling for the rest
tfidf_transformer = TfidfVectorizer()
scaler = StandardScaler()

# Route columns to their transformer: 'message' goes to TF-IDF, every other
# column of X goes to the scaler; anything not listed is passed through as-is.
preprocessor = ColumnTransformer(
    [
        ('tfidf', tfidf_transformer, 'message'),
        ('scaler', scaler, [name for name in X.columns if name != 'message']),
    ],
    remainder='passthrough',
)

# Tune every classifier with a successive-halving grid search, report its
# performance on the held-out test set, and remember the winner.
#
# The winner is chosen by the cross-validated weighted F1 (the metric the
# search itself optimises), NOT by test-set accuracy: picking the model on the
# test set would make the final test report optimistically biased.

# Iterate over classifiers and corresponding parameter grids
for classifier, param_grid in zip(classifiers, param_grids):
    clf_name = classifier.__class__.__name__

    # Define pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # Perform grid search for hyperparameter tuning.
    # random_state pins any random subsampling done by the search so that
    # repeated runs are comparable.
    grid_search = HalvingGridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='f1_weighted',
        random_state=42,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Mean cross-validated weighted F1 of the best parameter setting
    cv_weighted_f1 = grid_search.best_score_

    # Predict labels on test data (reporting only; not used for selection)
    y_pred = grid_search.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Print classifier information and evaluation metrics
    print("Classifier:", clf_name)
    print("Best Parameters:", grid_search.best_params_)
    print("CV weighted F1:", cv_weighted_f1)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    print("-------------------------------------------\n")

    # Keep this model if its cross-validated weighted F1 beats the best so far.
    # A NaN score (every fit failed) compares False and is therefore skipped.
    if cv_weighted_f1 > best_weighted_f1_score:
        best_weighted_f1_score = cv_weighted_f1
        best_model = grid_search.best_estimator_
        best_clf_name = clf_name

# Persist the selected model, reload it from disk, and report its performance
# on the held-out test set.

# Without a selected model there is nothing to save, and the reload below
# would only fail later with a confusing "best_model_None_..." file error.
if best_model is None:
    raise RuntimeError("No model was selected during the search; nothing to save or evaluate.")

# Build the file name once so the save, the message and the reload cannot drift
model_path = f"best_model_{best_clf_name}_big_data_fe_full.joblib"

# Save the best model
joblib.dump(best_model, model_path)
print(f"Best model saved as {model_path}")

# Reload the best model (round-trip check of the file that was just written).
# NOTE: joblib.load unpickles its input - only load files from trusted sources.
best_model = joblib.load(model_path)

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data using the best model:", accuracy)
print(classification_report(y_test, y_pred))

# Plot learning curves (uncomment if needed)
# from plot_learning_curve import plot_learning_curve
# plot_learning_curve(best_model, 'Learning Curves', X_train, y_train, n_jobs=-1)


n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 1485
max_resources_: 4457
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 7
n_resources: 1485
Fitting 5 folds for each of 7 candidates, totalling 35 fits
----------
iter: 1
n_candidates: 3
n_resources: 4455
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Classifier: LogisticRegression
Best Parameters: {'classifier__C': 0.1}
Accuracy: 0.9874439461883409
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       949
           1       0.97      0.95      0.96       166

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

-------------------------------------------

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 165
max_resources_: 4457
aggressive_elimination: False
factor: 3
-------