In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [55]:
# Load the spambase dataset into a DataFrame.
# The CSV must already be uploaded to the Colab runtime under /content.
df = pd.read_csv('/content/spambase.csv')

# Sanity-check the load by previewing five rows; the fixed seed keeps the
# preview identical from run to run.
df.sample(n=5, random_state=44)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
118,0.0,0.0,0.0,0.0,1.28,0.0,2.56,1.28,1.28,1.28,...,0.0,0.0,0.0,0.542,0.0,0.0,102.666,304,308,1
19,0.0,0.63,0.0,0.0,1.59,0.31,0.0,0.0,0.31,0.0,...,0.0,0.275,0.0,0.055,0.496,0.0,3.509,91,186,1
41,0.0,0.0,0.0,0.0,2.94,0.0,0.0,0.0,0.0,0.0,...,0.335,0.335,0.0,0.671,0.0,0.0,4.0,12,28,1
425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.336,0.0,0.0,0.0,0.0,1.909,5,21,0
221,1.03,0.0,0.68,0.0,1.03,0.0,0.68,0.0,0.0,0.68,...,0.0,0.0,0.0,0.366,0.061,0.0,1.895,12,91,1


In [56]:
# Separate the model inputs (X) from the target label (y).
# Only these four columns are used as features.
X = df[[
    'word_freq_free',
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]]

# Target column read from the 'spam' field of the dataframe.
y = df['spam']

In [57]:
# Split the data into training (70%) and test (30%) sets.
# random_state is fixed so the shuffle is the same on every run; without it
# the split (and every metric computed from it) changes each time the
# notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)

In [58]:
# Create and train the Random Forest Classifier model.
# random_state is fixed so the forest's internal randomness is seeded and the
# fitted model is the same every time this cell is re-run on the same
# training data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    random_state=44,
)
rf_model.fit(X_train, y_train)

In [59]:
# Predict a class label for every row of the held-out test features,
# then display the resulting array.
predictions = rf_model.predict(X=X_test)
predictions

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1])

In [60]:
# Show the order of the fitted model's class labels, to help interpret the
# probability predictions
rf_model.classes_

array([0, 1])

In [61]:
# Check the predicted probabilities behind each model prediction on the test set
rf_model.predict_proba(X_test)

array([[0.78      , 0.22      ],
       [0.        , 1.        ],
       [0.9       , 0.1       ],
       [0.69      , 0.31      ],
       [0.03      , 0.97      ],
       [0.75      , 0.25      ],
       [0.75      , 0.25      ],
       [0.86      , 0.14      ],
       [1.        , 0.        ],
       [0.8       , 0.2       ],
       [0.91      , 0.09      ],
       [0.04      , 0.96      ],
       [0.45      , 0.55      ],
       [0.94633333, 0.05366667],
       [1.        , 0.        ],
       [0.61      , 0.39      ],
       [0.        , 1.        ],
       [0.87      , 0.13      ],
       [0.95      , 0.05      ],
       [0.16      , 0.84      ],
       [0.99      , 0.01      ],
       [0.07      , 0.93      ],
       [1.        , 0.        ],
       [0.99      , 0.01      ],
       [0.52      , 0.48      ],
       [0.41      , 0.59      ],
       [0.96      , 0.04      ],
       [0.93      , 0.07      ],
       [0.62      , 0.38      ],
       [0.15      , 0.85      ],
       [0.

In [62]:
# Tabulate how much each input feature contributes to the fitted model,
# listing the most important feature first.
importances = rf_model.feature_importances_
columns = X.columns
importance_df = pd.DataFrame(
    {
        'Feature': columns,
        'Importance': importances,
    }
)
importance_df.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
0,word_freq_free,0.334884
1,capital_run_length_average,0.267504
3,capital_run_length_total,0.209621
2,capital_run_length_longest,0.187991


In [63]:
# Score the model: compare the test-set predictions against the true labels
# and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8533333333333334
