In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import joblib
import os
import numpy as np

data_path = 'C:\\Users\\Shamailh_M77\\Downloads\\machine final\\project\\spambase.data'
names_path = 'C:\\Users\\Shamailh_M77\\Downloads\\machine final\\project\\spambase.names'
sample_submission_path = 'C:\\Users\\Shamailh_M77\\Downloads\\sample_submission.csv'

with open(names_path, 'r') as file:
    lines = file.readlines()

features = []
for line in lines:
    if not line.startswith('|') and ':' in line:
        features.append(line.split(':')[0].strip())

features.append('is_spam')

df = pd.read_csv(data_path, header=None, names=features)

print(df.head())

train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

train_df.to_csv('spambase_train.csv', index=False)
test_df.to_csv('spambase_test.csv', index=False)

X_train = train_df.drop('is_spam', axis=1)
y_train = train_df['is_spam']
X_test = test_df.drop('is_spam', axis=1)
y_test = test_df['is_spam']

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

best_model = rf_random.best_estimator_

joblib.dump(best_model, 'best_random_forest_model.pkl')

y_test_pred = best_model.predict(X_test)

test_f1 = f1_score(y_test, y_test_pred)
print(f"F1 Score on Test Set: {test_f1}")

sample_submission = pd.read_csv(sample_submission_path)

test_df = test_df.reset_index(drop=True)
test_df.index = sample_submission['ID'] - 1  # Assuming IDs in sample_submission are 1-based

def create_prediction_file(model, test_data, sample_sub, output_file):
    predictions = model.predict(test_data.drop('is_spam', axis=1))
    sample_sub['spam'] = predictions
    sample_sub.to_csv(output_file, index=False)

create_prediction_file(best_model, test_df, sample_submission, 'final_submissionRE.csv')

num_features = len(features) - 1  # Subtract 1 for the target variable 'is_spam'
print("Number of features used:", num_features)


   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  char_freq_;  char_freq_(  \
0             0.00            0.00  ...         0.00        0.000   
1 

126 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "D:\Anaconda\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "D:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidPara

F1 Score on Test Set: 0.9470899470899471
Number of features used: 57
