In [1]:
import csv
import pandas as pd
import numpy as np
import chardet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

### Load `spam.csv` into a pandas DataFrame and remove the empty columns

In [2]:
# Guess the text encoding of the CSV from a sample of its leading bytes.
with open('Data/spam.csv', 'rb') as sample_file:
    raw_data = sample_file.read(10000)  # read only the first 10,000 bytes

# Detection needs only the in-memory sample, so it runs after the file is closed.
result = chardet.detect(raw_data)
detected_encoding = result['encoding']
print(f"Encoding: {detected_encoding}")

Encoding: Windows-1252


In [3]:
# Read the raw CSV into a DataFrame, decoding the bytes as Windows-1252.
data_path = 'Data/spam.csv'
data = pd.read_csv(filepath_or_buffer=data_path, encoding='Windows-1252')

# Preview the first ten rows.
print(data.head(10))

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   
5  spam  FreeMsg Hey there darling it's been 3 week's n...        NaN   
6   ham  Even my brother is not like to speak with me. ...        NaN   
7   ham  As per your request 'Melle Melle (Oru Minnamin...        NaN   
8  spam  WINNER!! As a valued network customer you have...        NaN   
9  spam  Had your mobile 11 months or more? U R entitle...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
5        NaN        NaN  
6        NaN  

In [4]:
# Summarize the frame's columns: a count followed by one name per line.
column_names = data.columns
print(f'There are {len(column_names)} columns')
print("Column names:")
for column_name in column_names:
    print(f'-{column_name}')

There are 5 columns
Column names:
-v1
-v2
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [5]:
# Per-column flag: True when the column contains at least one missing value.
null_columns = data.isna().any()
# Keep just the labels of the flagged columns.
null_column_list = [name for name, has_null in null_columns.items() if has_null]

if not null_column_list:
    print("There are no columns or rows with null values")
else:
    print("The following columns have null values:")
    for column_name in null_column_list:
        print(f'-{column_name}')

The following columns have null values:
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [6]:
# Remove the three "Unnamed" columns, which the null check reports as
# containing missing values. errors='ignore' keeps this cell idempotent:
# re-running it after the columns are already gone is a no-op instead of
# raising KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [7]:
# Give the two remaining columns descriptive names. An explicit old -> new
# mapping (instead of positional assignment to `data.columns`) cannot mislabel
# columns if their order changes, does not mutate the frame in place, and is
# a no-op when the cell is re-run.
data = data.rename(columns={'v1': 'label', 'v2': 'email'})

In [8]:
# Preview the first ten rows of the trimmed, renamed frame.
print(data.head(10))

  label                                              email
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...


In [9]:
# Class balance: number of rows for each value of the label column.
label_counts = data["label"].value_counts()
print(label_counts)

label
ham     4825
spam     747
Name: count, dtype: int64


In [10]:
# Total number of rows in the dataset.
print(data.shape[0])

5572
