In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# TF-IDF features: sublinear_tf=True applies log scaling to term frequency,
# max_df=0.5 ignores terms found in more than half of the documents,
# English stop words are removed, and the vocabulary is capped at 8000 terms.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=8000)
# Linear-kernel support vector classifier with regularization parameter C=1.0.
svm = SVC(kernel='linear', C=1.0, random_state=42)
# Set of words from the NLTK names corpus; clean_text drops tokens found in it.
all_names = set(names.words())
# NOTE(review): emails/labels are not referenced by any cell shown here —
# possibly leftovers; confirm before removing.
emails, labels = [], []
# Lemmatizer used by clean_text to normalize each kept token.
lemmatizer = WordNetLemmatizer()

def letters_only(astr):
    """Tell whether ``astr`` consists solely of alphabetic characters.

    Delegates to ``astr.isalpha()``, so an empty string yields ``False`` and
    any digit, punctuation or whitespace character makes the result ``False``.
    """
    is_alphabetic = astr.isalpha()
    return is_alphabetic

def clean_text(docs):
    """Normalize raw documents into space-joined, lemmatized tokens.

    Each document is split on whitespace. A token is kept only if it is purely
    alphabetic (this discards numbers and any token carrying punctuation) and
    is not present in ``all_names`` (checked against the token's original
    casing, i.e. before lowercasing). Kept tokens are lowercased and passed
    through ``lemmatizer.lemmatize``.

    Returns a list with one cleaned string per input document, in input order.
    """
    def _normalize(doc):
        kept_tokens = []
        for token in doc.split():
            # Drop anything that is not purely alphabetic.
            if not letters_only(token):
                continue
            # Drop tokens listed in the names set.
            if token in all_names:
                continue
            kept_tokens.append(lemmatizer.lemmatize(token.lower()))
        return ' '.join(kept_tokens)

    return [_normalize(doc) for doc in docs]

# Newsgroup names passed to fetch_20newsgroups to select which groups are loaded.
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
    "rec.sport.hockey"
]

# Load the train and test subsets, restricted to the selected categories.
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)
# Clean the raw documents and take the target labels for each subset.
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target
# Fit the vectorizer on the training text only; the test text is transformed
# with the already-fitted vectorizer.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [4]:
# Train the classifier on the training TF-IDF matrix and its labels.
svm.fit(term_docs_train, label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

In [5]:
# Score the fitted classifier on the test set (for classifiers, score() is mean accuracy).
accuracy = svm.score(term_docs_test, label_test)

In [6]:
# Display the test-set score stored in `accuracy`.
accuracy

0.8864155251141552

In [7]:
from sklearn.metrics import classification_report
# Predict test labels and build a per-class precision/recall/F1 text report.
prediction = svm.predict(term_docs_test)
# NOTE(review): classes appear as integer indices in the report; passing
# target_names (e.g. data_test.target_names) would label rows by newsgroup —
# confirm whether that is wanted.
report = classification_report(label_test, prediction)
# A bare expression shows the string's repr (with literal \n); print() renders it as a table.
report

'              precision    recall  f1-score   support\n\n           0       0.81      0.77      0.79       319\n           1       0.91      0.94      0.93       389\n           2       0.98      0.96      0.97       399\n           3       0.93      0.93      0.93       394\n           4       0.73      0.76      0.74       251\n\n    accuracy                           0.89      1752\n   macro avg       0.87      0.87      0.87      1752\nweighted avg       0.89      0.89      0.89      1752\n'

In [8]:
# Print the report so its newlines render as a formatted table.
print(report)

              precision    recall  f1-score   support

           0       0.81      0.77      0.79       319
           1       0.91      0.94      0.93       389
           2       0.98      0.96      0.97       399
           3       0.93      0.93      0.93       394
           4       0.73      0.76      0.74       251

    accuracy                           0.89      1752
   macro avg       0.87      0.87      0.87      1752
weighted avg       0.89      0.89      0.89      1752

