In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw dataset from the working directory. The encoding is passed
# explicitly as latin-1 — presumably the file is not valid UTF-8 (TODO confirm).
df = pd.read_csv("spam.csv", encoding="latin-1")

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Inspect column dtypes and non-null counts to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
# Drop the three sparsely populated columns (at most 50 of 5572 rows are
# non-null). Rebinding `df` instead of using `inplace=True`, together with
# `errors="ignore"`, keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed columns.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [6]:
# Re-check the frame's columns and non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Rename columns by name rather than by position, so the cell does not depend
# on column order and re-running it is a no-op.
df = df.rename(columns={"v1": "label", "v2": "email"})

# Map `ham` to 0 and `spam` to 1
label_map = {"ham": 0, "spam": 1}
# Only map while the column still holds the raw strings: `Series.map` turns
# every value missing from the dict into NaN, so a second run over the
# already-encoded 0/1 labels would silently wipe the column.
if not df["label"].isin(list(label_map.values())).all():
    df["label"] = df["label"].map(label_map)

# Fail loudly on any label the mapping did not cover instead of carrying NaN
# into training.
assert df["label"].notna().all(), "unexpected value in label column"

In [8]:
# Preview the frame again to check the new column names and the numeric labels.
df.head()

Unnamed: 0,label,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Count missing values per column. `Series.map` with a dict yields NaN for any
# value that is not a key, so a non-zero count under `label` would mean the
# ham/spam mapping missed a value.
df.isna().sum()

label    0
email    0
dtype: int64

In [10]:
import re
import string

# Now, processing the text data
def clean_text(text: str) -> str:
    """Normalise a raw message string.

    Every character in ``string.punctuation`` is replaced by a space, the
    result is lower-cased, and runs of whitespace are collapsed to a single
    space with leading and trailing whitespace removed.
    """
    # Translation table: each ASCII punctuation mark -> a single space.
    punct_to_space = str.maketrans({mark: " " for mark in string.punctuation})

    spaced = text.translate(punct_to_space)
    tokens = spaced.lower().split()
    return " ".join(tokens)

In [11]:
# Show one message before and after cleaning.
first_email = df.loc[0, "email"]
print(first_email)
print(clean_text(first_email))

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat


In [12]:
# Clean every message; the raw text stays in `email` and the cleaned copy is
# stored alongside it in a new column.
df["clean_email"] = df["email"].map(clean_text)

In [13]:
# Features are the cleaned message strings; targets are the 0/1 labels.
X = df["clean_email"].values
y = df["label"].values
print(X.shape, y.shape)

# Hold out 20% of the messages for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(5572,) (5572,)
(4457,) (1115,) (4457,) (1115,)


In [14]:
# Pass through count vectorizer
# NOTE: this cell overwrites the raw-text arrays `X_train` / `X_test` with
# their vectorised form, so re-run the split cell before re-running this one.
cv = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them in the
# same pass (`fit_transform` is the one-step equivalent of `fit` + `transform`).
X_train = cv.fit_transform(X_train)
# Encode the test messages with the vocabulary learned from the training set.
X_test = cv.transform(X_test)

print(X_train.shape, X_test.shape)

(4457, 7730) (1115, 7730)


In [15]:
# Fix the seed: scikit-learn permutes candidate features at every split, so
# without `random_state` the fitted tree (and the score computed from it) can
# change between runs. 42 matches the seed used for the train/test split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [16]:
# Predicted labels for the held-out messages.
y_pred = clf.predict(X_test)

# Score the classifier on the test split; the bare name on the last line makes
# the notebook display the value.
test_score = clf.score(X_test, y_test)
test_score

0.9766816143497757

Note that to run inference with this decision tree on arbitrary text, the text must first be converted into a feature vector in the same way as the training data.

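Typically, a pipeline is built for this: it cleans the text with `clean_text`, converts it into a feature vector with the fitted `CountVectorizer` (`cv.transform`), and feeds that vector into the trained model to get the prediction.
I have not built one here, but you can attempt it yourself.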