In [2]:
import pandas as pd
import os

def read_files_from_folder(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):  # Check if it's a file
            with open(file_path, 'r', encoding='latin1') as file:
                content = file.read()
                data.append({'text': content, 'label': label})
    return pd.DataFrame(data)

spam_folder = './spam'
ham_folder = './easy_ham'

# Read the files and create DataFrames
spam_df = read_files_from_folder(spam_folder, 'spam')
ham_df = read_files_from_folder(ham_folder, 'ham')

In [3]:
spam_df.head()

Unnamed: 0,text,label
0,From 102192086381143-17090200005-example.com?z...,spam
1,From sh@insiq.us Fri Sep 20 11:41:16 2002\nRe...,spam
2,From OWNER-NOLIST-SGODAILY*JM**NETNOTEINC*-COM...,spam
3,Return-Path: ler@lerami.lerctr.org\nDelivery-D...,spam
4,From viagra_medication1182@martyrs.com.au Tue...,spam


In [4]:
spam_df.info() #consists of two columns, of which column 0 is the text, column 1 is the label. all non-null.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    501 non-null    object
 1   label   501 non-null    object
dtypes: object(2)
memory usage: 8.0+ KB


In [5]:
full_dataset = pd.concat([spam_df, ham_df], ignore_index=True)

In [6]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3002 entries, 0 to 3001
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3002 non-null   object
 1   label   3002 non-null   object
dtypes: object(2)
memory usage: 47.0+ KB


In [7]:
full_dataset['text'] = full_dataset['text'].astype(str)

In [8]:
print(full_dataset.dtypes)

text     object
label    object
dtype: object


In [25]:
import re
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"[-()\"#/@;:<>{}=~|.?,]", " ", text)
    text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
    text = re.sub(r'\W+', ' ', text, flags=re.M)
    text = re.sub(r'https?:.*(?=\s)', 'URL', text)
    return text
#clean text of punctuation, trailing space, make lowercase. we will also then want to stem the words
full_dataset['cleantext'] = full_dataset["text"].apply(clean_text)
full_dataset.head()

Unnamed: 0,text,label,cleantext,stemmedcleaned
0,From 102192086381143-17090200005-example.com?z...,spam,from NUMBER NUMBER example com zzzz bounce til...,from number number exampl com zzzz bounc tilw ...
1,From sh@insiq.us Fri Sep 20 11:41:16 2002\nRe...,spam,from sh insiq us fri sep NUMBER NUMBER NUMBER ...,from sh insiq us fri sep number number number ...
2,From OWNER-NOLIST-SGODAILY*JM**NETNOTEINC*-COM...,spam,from owner nolist sgodaily jm netnoteinc com s...,from owner nolist sgodaili jm netnoteinc com s...
3,Return-Path: ler@lerami.lerctr.org\nDelivery-D...,spam,return path ler lerami lerctr org delivery dat...,return path ler lerami lerctr org deliveri dat...
4,From viagra_medication1182@martyrs.com.au Tue...,spam,from viagra_medicationNUMBER martyrs com au tu...,from viagra_medicationnumb martyr com au tue a...


In [19]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

full_dataset['stemmedcleaned'] = full_dataset["cleantext"].apply(html_to_plain_text)
full_dataset.head

<bound method NDFrame.head of                                                    text label  \
0     From 102192086381143-17090200005-example.com?z...  spam   
1     From sh@insiq.us  Fri Sep 20 11:41:16 2002\nRe...  spam   
2     From OWNER-NOLIST-SGODAILY*JM**NETNOTEINC*-COM...  spam   
3     Return-Path: ler@lerami.lerctr.org\nDelivery-D...  spam   
4     From viagra_medication1182@martyrs.com.au  Tue...  spam   
...                                                 ...   ...   
2997  From rpm-list-admin@freshrpms.net  Thu Aug 29 ...   ham   
2998  From rssfeeds@jmason.org  Mon Sep 30 13:37:50 ...   ham   
2999  From pudge@perl.org  Tue Sep 10 11:23:13 2002\...   ham   
3000  From ilug-admin@linux.ie  Fri Aug 23 11:08:03 ...   ham   
3001  From sentto-2242572-56004-1034050340-zzzz=spam...   ham   

                                              cleantext  \
0     from NUMBER NUMBER example com zzzz bounce til...   
1     from sh insiq us fri sep NUMBER NUMBER NUMBER ...   
2     from o

In [26]:
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stem_text(text):
    porter_stemmer = PorterStemmer()
    text = text.split()
    text = [porter_stemmer.stem(word) for word in text]
    text = " ".join(text)
    return text
full_dataset['stemmedcleaned'] = full_dataset["cleantext"].apply(stem_text)
full_dataset.head()
full_dataset.tail()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kajakubickova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label,cleantext,stemmedcleaned
2997,From rpm-list-admin@freshrpms.net Thu Aug 29 ...,ham,from rpm list admin freshrpms net thu aug NUMB...,from rpm list admin freshrpm net thu aug numbe...
2998,From rssfeeds@jmason.org Mon Sep 30 13:37:50 ...,ham,from rssfeeds jmason org mon sep NUMBER NUMBER...,from rssfeed jmason org mon sep number number ...
2999,From pudge@perl.org Tue Sep 10 11:23:13 2002\...,ham,from pudge perl org tue sep NUMBER NUMBER NUMB...,from pudg perl org tue sep number number numbe...
3000,From ilug-admin@linux.ie Fri Aug 23 11:08:03 ...,ham,from ilug admin linux ie fri aug NUMBER NUMBER...,from ilug admin linux ie fri aug number number...
3001,From sentto-2242572-56004-1034050340-zzzz=spam...,ham,from sentto NUMBER NUMBER NUMBER zzzz spamassa...,from sentto number number number zzzz spamassa...


In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(full_dataset["stemmedcleaned"],full_dataset["label"],test_size=0.2)

In [54]:
#Write a data preparation pipeline to convert each email into a feature vector.
#Your preparation pipeline should transform an email into a (sparse) vector
#indicating the presence or absence of each possible word. For example, if all
#emails only ever contain four words, “Hello,” “how,” “are,” “you,” then the email
#“Hello you Hello Hello you” would be converted into a vector [1, 0, 0, 1]
#(meaning [“Hello” is present, “how” is absent, “are” is absent, “you” is
#present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of
#each word.
#https://towardsdatascience.com/how-to-turn-text-into-features-478b57632e99
#perchance then use bag of words encoding?

#probably use python counter method first to turn a sequence of words into a dict of words and frequencies
#then initialize BoW or IDF class
#fit the class
word2count = {} 
import nltk
nltk.download('punkt')
for data in full_dataset["stemmedcleaned"]: 
    words = nltk.word_tokenize(data) 
    for word in words: 
        if word not in word2count.keys(): 
            word2count[word] = 1
        else: 
            word2count[word] += 1

import heapq 
freq_words = heapq.nlargest(200, word2count, key=word2count.get)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kajakubickova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:

import numpy as np
class ArrayMaker:
    def __init__(self, freq_words):
        self.freq_words = freq_words
    def fit_transform(self, dataset):
        X = []  # Initialize X before the loop
        for data in dataset:
            vector = []
            for word in freq_words: 
                if word in nltk.word_tokenize(data): 
                    vector.append(1) 
                else: 
                    vector.append(0) 
            X.append(vector) 
        X = np.asarray(X)
        return X

array_maker = ArrayMaker(freq_words)

X_transformed = array_maker.fit_transform(x_train)

In [56]:
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_transformed, y_train, cv=3, verbose=3)
score.mean()
#what whattttt 0.995 score babey??
#this has gotta be wrong lol but we roll ig

[CV] END ................................ score: (test=0.994) total time=   0.0s
[CV] END ................................ score: (test=0.995) total time=   0.0s
[CV] END ................................ score: (test=0.999) total time=   0.0s


0.9958359342488556

In [57]:
print("Shape of X_transformed:", X_transformed.shape)
print("Shape of y_train:", y_train.shape)

Shape of X_transformed: (2401, 200)
Shape of y_train: (2401,)


In [58]:
#next time remember to make the preprocessing steps into a pipeline so it's much easier to transform the X_test and i don't have to go through it again lolll
X_test_transformed = array_maker.fit_transform(x_test)

In [63]:
from sklearn.metrics import precision_score, recall_score

log_clf.fit(X_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)
pos_label = 'spam'

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred, pos_label="spam")))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred, pos_label="spam")))

Precision: 100.00%
Recall: 100.00%
