In [6]:
import os
import glob
import numpy as np
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups


# Shared text-cleaning resources used by clean_text() further down.
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

# Parallel lists: emails[i] is the raw text of a message, labels[i] its class
# (1 = spam, 0 = ham).
emails, labels = [], []

spam_file_path = 'datasets/enron1/spam'
ham_file_path = 'datasets/enron1/ham'

# Both folders are read with the same '*.txt' pattern so spam and ham are
# selected consistently, and the matches are sorted because glob's ordering
# depends on the file system -- sorting keeps the corpus order reproducible.
for email_dir, email_label in ((spam_file_path, 1), (ham_file_path, 0)):
    for filename in sorted(glob.glob(os.path.join(email_dir, '*.txt'))):
        # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
        with open(filename, 'r', encoding='ISO-8859-1') as infile:
            emails.append(infile.read())
        labels.append(email_label)
        
def letters_only(astr):
    """Return True when ``astr`` is non-empty and made up solely of letters.

    This is a thin wrapper around ``isalpha()``: digits, punctuation,
    whitespace and the empty string all yield False.
    """
    is_alphabetic = astr.isalpha()
    return is_alphabetic

def clean_text(docs):
    """Turn raw documents into strings of lowercase, lemmatized word tokens.

    Every document is split on whitespace. A token survives only if
    ``letters_only`` accepts it (which drops numbers and tokens carrying
    punctuation) and it is not listed in ``all_names``. The name lookup is
    done on the token as written, before lowercasing. Surviving tokens are
    lowercased, lemmatized with the module-level ``lemmatizer`` and joined
    back together with single spaces.

    Args:
        docs: iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    def _clean_one(doc):
        kept_tokens = []
        for token in doc.split():
            if not letters_only(token):
                continue
            if token in all_names:
                continue
            kept_tokens.append(lemmatizer.lemmatize(token.lower()))
        return ' '.join(kept_tokens)

    return [_clean_one(doc) for doc in docs]

# Run the loaded email texts through the cleaning pipeline.
cleaned_emails = clean_text(emails)

# Restrict 20 Newsgroups to these two categories and fetch the predefined
# train and test subsets with identical settings.
categories = ["comp.graphics", "sci.space"]
data_train, data_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories, random_state=42)
    for subset_name in ("train", "test")
)

In [7]:
# Inspect the fetched test split; a bare expression makes the notebook
# display the whole object, so the output below is very long.
data_test

{'data': ['From: teezee@netcom.com (TAMOOR A. ZAIDI)\nSubject: Hall Generators from USSR\nKeywords: hall generators,thrusters,USSR,JPL\nOrganization: NETCOM On-line Communication Services (408 241-9760 guest)\nLines: 21\n\nHi Folks,\n\n              Last year America bought two  "Hall Generators" which are\nused as thrusters for space vehicles from former USSR,if I could recall\ncorrectly these devices were sent to JPL,Pasadena labs for testing and\nevaluation.\n     \n              I am just curious to know  how these devices work and what\nwhat principle is involved .what became of them.There was also some\ncontroversy that the Russian actually cheated,sold inferior devices and\nnot the one they use in there space vehicles.\n\nAny info will be appreciated...\n  ok   {                         Thank{ in advance...\nTamoor A Zaidi\nLockheed Commercial Aircraft Center\nNorton AFB,San Bernardino\n\nteezee@netcom.com\nde244@cleveland.freenet.edu\n\n',
  'From: henry@zoo.toronto.edu (Henry 

In [8]:
# Clean the raw training documents with the same pipeline used for the emails.
cleaned_train = clean_text(data_train.data)

In [9]:
# Target arrays for the two splits.
label_train, label_test = data_train.target, data_test.target

# The test documents go through the same cleaning as the training documents.
cleaned_test = clean_text(data_test.data)

# Number of labels in (train, test).
(len(label_train), len(label_test))

(1177, 783)

In [11]:
# Count how many training documents carry each label to check class balance.
from collections import Counter
Counter(label_train)

Counter({0: 584, 1: 593})

In [12]:
# Same per-label count for the test split.
Counter(label_test)

Counter({1: 394, 0: 389})

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# sublinear_tf replaces tf with 1 + log(tf); max_df=0.5 drops terms that occur
# in more than half of the documents; stop_words removes common English words;
# max_features caps the vocabulary at the 8000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=8000)

# Call the methods on the configured instance, not on the TfidfVectorizer
# class: the class-level call has no vectorizer to bind to and raises TypeError.
# The vocabulary and IDF weights are learned from the training documents only
# and then reused unchanged for the test documents.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [16]:
# Learn the TF-IDF vocabulary and weights from the training documents, then
# apply that fitted vectorizer to the test documents without refitting.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [19]:
# Support vector classifier with a linear kernel; C=1.0 is the regularization
# parameter and random_state is pinned to 42.
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=42)

In [20]:
# Train the classifier on the TF-IDF training matrix. fit() returns the fitted
# estimator, so the notebook displays its parameters as this cell's output.
svm.fit(term_docs_train, label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [21]:
# Score the fitted classifier on the test features and labels; for a
# scikit-learn classifier, score() is the mean accuracy.
accuracy = svm.score(term_docs_test, label_test)

In [22]:
# Display the raw score as a fraction.
accuracy

0.9642401021711366

In [34]:
# Accuracy expressed as a percentage string with two decimal places.
format(accuracy * 100, ".2f")

'96.42'