### 1. data preprocessing

In [30]:
import pandas as pd
import numpy as np
import os
import re

emailCsvPath="data/emails.csv"
emailDf=pd.read_csv(emailCsvPath)
emailText=emailDf["Text"]
vocabularySet=set()
for i in range(emailText.size):
	text=str(emailText[i])
	sanitizedText=re.sub("<[^>]*>"," ",text) # replace the html tag enclosement symbol <> with a white space
	words=re.findall(r"\b\w+\b",sanitizedText) # find all the words in the text
	stopWords=set(["and", "the", "in", "to", "for", "on", "with", "that"]) 
	# stopWords is some really common words that got no affection to determination of spam email
	words=[word for word in words if word not in stopWords]
	vocabularySet.update(set(words))

# save the vocabulary to a csv file
vocabularyCsvPath="data/vocabulary.csv"
vocabularyDf=pd.DataFrame(vocabularySet,columns=["Word"])
vocabularyDf.to_csv(vocabularyCsvPath,index=False)

### 2. calc `P(y=1),P(y=0),P(X|y=1),P(X|y=0)`

In [49]:
emailTypeFreqCsvPath="data/emailTypeFreq.csv"
vocabFreqCsvPath="data/vocabFreq.csv"
emailDf=pd.read_csv(emailCsvPath)
vocabDf=pd.read_csv(vocabularyCsvPath)
vocabCount=vocabDf.size

# group the emailDf by label in order to count the spam and healthy email
emailGroupedByLabel=emailDf.groupby("Label")["Label"].count()
hamCount=emailGroupedByLabel.iat[0]
spamCount=emailGroupedByLabel.iat[1]
emailCount=hamCount+spamCount
# save the email type frequency: used to calc the P(y=1) and P(y=0)
emailTypeFreq=pd.DataFrame({"hamFreq":[hamCount/emailCount],"spamFreq":[spamCount/emailCount],"hamCount":[hamCount],"spamCount":[spamCount]})
emailTypeFreq.to_csv(emailTypeFreqCsvPath,index=False)

# initialize a vocabulary frequency data frame
vocabFreqDf=vocabDf
vocabFreqDf[["freqInHam","freqInSpam"]]=np.zeros((vocabCount,2))
for i in range(vocabCount):
	word=str(vocabFreqDf.iloc[i,0])
	wordCountInHam=0
	wordCountInSpam=0
	for j in range(emailCount):
		emailString=str(emailDf.iloc[j,1])
		if bool(re.search(pattern=rf"\b{re.escape(word)}\b",string=emailString)):
			if emailDf.iloc[j,0]==0:
				wordCountInHam+=1
			elif emailDf.iloc[j,0]==1:
				wordCountInSpam+=1
	freqInHam=(1+wordCountInHam)/(2+hamCount)
	freqInSpam=(1+wordCountInSpam)/(2+spamCount)
	vocabFreqDf.iloc[i,1:3]=[freqInHam,freqInSpam]
	print(f"Calculate the freq of word {word}: freqInHam-{freqInHam};freqInSpam-{freqInSpam}")
# save the vocabFreqDf to vocabFreqCsvPath
vocabFreqDf.to_csv(vocabFreqCsvPath,index=False)

Calculate the freq of word wthout: freqInHam-0.0004141644232760406;freqInSpam-0.0013351134846461949
Calculate the freq of word stereophonics: freqInHam-0.0002070822116380203;freqInSpam-0.0026702269692923898
Calculate the freq of word readers: freqInHam-0.0002070822116380203;freqInSpam-0.004005340453938585
Calculate the freq of word wishin: freqInHam-0.0006212466349140609;freqInSpam-0.0013351134846461949
Calculate the freq of word itxt: freqInHam-0.0004141644232760406;freqInSpam-0.0013351134846461949
Calculate the freq of word 4th: freqInHam-0.001449575481466142;freqInSpam-0.0013351134846461949
Calculate the freq of word drinkin: freqInHam-0.0006212466349140609;freqInSpam-0.0013351134846461949
Calculate the freq of word 250k: freqInHam-0.0002070822116380203;freqInSpam-0.004005340453938585
Calculate the freq of word pierre: freqInHam-0.0004141644232760406;freqInSpam-0.0013351134846461949
Calculate the freq of word heavy: freqInHam-0.0016566576931041624;freqInSpam-0.0013351134846461949
Ca

### 3. classify an input email(calc `P(y=1|X)`)

In [51]:
emailTypeFreqDf=pd.read_csv(emailTypeFreqCsvPath)
vocabFreqDf=pd.read_csv(vocabFreqCsvPath)

hamFreq=emailTypeFreqDf.iloc[0,0]
spamFreq=emailTypeFreqDf.iloc[0,1]
hamCount=emailTypeFreqDf.iloc[0,2]
spamCount=emailTypeFreqDf.iloc[0,3]

# type email to be classified here
emailToBeClassified=input("email to be classified").strip()
# extract all words out of email to be classified
sanitizedEmail=re.sub(r"<[^>]*>"," ",emailToBeClassified)
wordsOfEmail=re.findall(r"\b\w+\b",sanitizedEmail)
stopWords=set(["and", "the", "in", "to", "for", "on", "with", "that"])
wordsOfEmail=[word for word in wordsOfEmail if word not in stopWords ]

freqInHamOfWords=[]
freqInSpamOfWords=[]
for word in wordsOfEmail:
	rowInVocabOfWord=vocabFreqDf[vocabFreqDf.iloc[:,0]==word]
	if rowInVocabOfWord.size>0:
		freqInHamOfWords.append(rowInVocabOfWord.iloc[0,1])
		freqInSpamOfWords.append(rowInVocabOfWord.iloc[0,2])
	else:
		freqInHamOfWords.append(1/(2+hamCount))
		freqInSpamOfWords.append(1/(2+spamCount))

productForFreqInHamOfWords=np.prod(freqInHamOfWords)
productForFreqInSpamOfWords=np.prod(freqInSpamOfWords)
probToSpam=productForFreqInSpamOfWords*spamFreq/(productForFreqInHamOfWords*hamFreq+productForFreqInSpamOfWords*spamFreq)
probToHam=productForFreqInHamOfWords*hamFreq/(productForFreqInHamOfWords*hamFreq+productForFreqInSpamOfWords*spamFreq)
print(f"probability of email to be spam: {probToSpam}\n probability of email to be ham: {probToHam}")

probability of email to be spam: 1.1923565200635445e-07
 probability of email to be ham: 0.999999880764348
