# Loading the Data

In [1]:
import numpy as np

# Seed NumPy's random sources once, up front, so results repeat after a kernel restart.
SEED = 2026

# Module-level helpers such as np.random.shuffle draw from the legacy global
# RandomState; creating a Generator with default_rng() does not seed it.
np.random.seed(SEED)

# Bind the Generator to a name: an unassigned default_rng() result is simply
# discarded, so its seed would influence nothing.
rng = np.random.default_rng(seed=SEED)
rng

Generator(PCG64) at 0x10C7DCE40

In [2]:
import pandas as pd

# header=None is required here: without it, pandas used the first row of the
# file as the column names.
spam_df = pd.read_csv(
    'spambase.data',
    header=None,
)

In [3]:
# Heading for the preview; a plain string literal, since nothing is interpolated.
print("Spam Data Preview: \n")
# Last expression of the cell, so the notebook renders the first five rows.
spam_df.head()

Spam Data Preview: 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


We still need the column names, which we can get from the file spambase.names.

In [4]:
# Read spambase.names into a list holding one string per line of the file
with open('spambase.names', 'r') as file:
    col_names = list(file)

In [5]:
# Keep only the lines from the 34th onward (list position 33), where the column names start
col_names = [line for position, line in enumerate(col_names) if position >= 33]
# Printing each remaining entry is a quick way to confirm the starting line is correct.

We also need to cut off everything after the colon in each line. 

In [6]:
# Keep only the text that precedes the first ':' on every line
col_names = [raw_line.partition(':')[0] for raw_line in col_names]
# Printing each entry afterwards is a quick way to confirm the parsing worked.

In [7]:
# Compare the column count of the data with the number of names parsed so far
print(
    f"Number of columns in dataset: {spam_df.shape[1]}",
    f"Number of column names acquired: {len(col_names)}",
    sep="\n",
)

Number of columns in dataset: 58
Number of column names acquired: 57


We are missing a name for the final column, which holds the label (1 for spam, 0 for not spam).

In [8]:
# Add the name of the label column. The membership check keeps this cell safe to
# re-run: an unconditional append would add "is_spam" again on every execution.
if "is_spam" not in col_names:
    col_names.append("is_spam")

In [9]:
# Label the DataFrame's columns with the collected names
spam_df.columns = pd.Index(col_names)

In [10]:
# Heading for the preview; a plain string literal, since nothing is interpolated.
print("Spam Data Preview: \n")
# Last expression of the cell, so the notebook renders the first five rows.
spam_df.head()

Spam Data Preview: 



Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [20]:
# Report the (rows, columns) shape of the frame
shape_report = f"Dimensions of dataset: {spam_df.shape}"
print(shape_report)

Dimensions of dataset: (4601, 58)


We see here that we indeed have 57 continuous features and 1 binary class label. Now, we split the data into a training set and a test set.

In [28]:
# Row positions 0 .. n-1 of the dataframe. Positions (rather than index labels)
# are collected because the split is applied with positional .iloc indexing.
samples = list(range(len(spam_df)))

# shuffle with a dedicated, explicitly seeded generator so that the same split
# is produced on every run (the module-level np.random.shuffle is only
# repeatable if the global state has been seeded beforehand)
split_rng = np.random.default_rng(seed=2026)
split_rng.shuffle(samples)

# divide this with a 5% / 95% split (train / test)
n_train = round(len(samples) * 0.05)
train_ind = samples[:n_train]
# Take everything after the training slice instead of rounding a second,
# independent size: the two parts are then guaranteed to be disjoint and to
# cover every row, and a size of 0 cannot turn into samples[-0:] (the full list).
test_ind = samples[n_train:]

print(f"Number of training samples: {len(train_ind)}")
print(f"Number of testings samples: {len(test_ind)}")

Number of training samples: 230
Number of testings samples: 4371


In [31]:
# Separate inputs from outputs, then carve out the train and test partitions.

# X: every column except the last one; y: the "is_spam" column
X, y = spam_df.iloc[:, :-1], spam_df["is_spam"]

# Select rows by position using the previously computed index lists
X_train, X_test = X.iloc[train_ind], X.iloc[test_ind]
y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]

# Task 1