In [13]:
import csv
import pandas as pd
import numpy as np
import chardet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

### Load spam.csv into a pandas dataframe and remove empty columns

In [3]:
with open('Data/spam.csv', 'rb') as f:
        raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        detected_encoding = result['encoding']

print(f"Encoding: {detected_encoding}")

Encoding: Windows-1252


In [4]:
data_path = 'Data/spam.csv'
data = pd.read_csv(data_path, encoding='Windows-1252')
print(data[:10])

     v1  ... Unnamed: 4
0   ham  ...        NaN
1   ham  ...        NaN
2  spam  ...        NaN
3   ham  ...        NaN
4   ham  ...        NaN
5  spam  ...        NaN
6   ham  ...        NaN
7   ham  ...        NaN
8  spam  ...        NaN
9  spam  ...        NaN

[10 rows x 5 columns]


In [5]:
print(f'There are {len(data.columns)} columns')
print("Column names:")
for c in data.columns:
    print(f'-{c}')

There are 5 columns
Column names:
-v1
-v2
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [6]:
null_columns = data.isna().any()
null_column_list = null_columns[null_columns].index.tolist()
if(null_column_list):
    print("The following columns have null values:")
    for item in null_column_list:
        print(f'-{item}')
else:
    print("There are no columns or rows with null values")

The following columns have null values:
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [7]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [8]:
data.columns = ['label', 'email']

In [9]:
print(data[:10])

  label                                              email
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...


In [24]:
vectorizor = TfidfVectorizer()
vocab = vectorizor.fit_transform(data["email"].to_list())
encoding = {"ham" : 0, "spam" : 1}
labels = data["label"].map(encoding)
x_train, x_test, y_train, y_test = train_test_split(vocab, labels, random_state=0, test_size=0.2)

In [26]:
rng = np.random.RandomState(1)
clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}" )

Accuracy: 0.9488789237668162
