In [11]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF feature extractor shared by the train and test splits.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,      # use 1 + log(tf) instead of raw term counts
    max_df=0.5,             # drop terms appearing in more than half of the documents
    stop_words='english',   # built-in English stop-word list
    max_features=8000,      # cap the vocabulary size
)

# Forwarded to fetch_20newsgroups below; per the scikit-learn docs, None loads every category.
categories = None

# Name corpus used by clean_text to filter out person names (matched case-sensitively).
all_names = set(names.words())

# NOTE(review): not referenced in the visible cells — possibly leftovers; confirm before removing.
emails = []
labels = []

lemmatizer = WordNetLemmatizer()

def letters_only(astr):
    """Tell whether ``astr`` consists solely of alphabetic characters.

    Delegates to ``astr.isalpha()``, so an empty string yields ``False`` and
    any digit, punctuation mark or whitespace character yields ``False``.
    """
    is_alphabetic = astr.isalpha()
    return is_alphabetic

def clean_text(docs):
    """Normalise each document in ``docs`` into a space-joined string of lemmas.

    For every whitespace-separated token of a document:
      * tokens that fail ``letters_only`` (i.e. contain digits, punctuation or
        any other non-alphabetic character) are dropped;
      * tokens found in ``all_names`` are dropped (the lookup uses the token's
        original casing, before lower-casing);
      * surviving tokens are lower-cased and lemmatized.

    Returns a list with one cleaned string per input document, in input order.
    """
    def _normalise(doc):
        kept = []
        for token in doc.split():
            if not letters_only(token) or token in all_names:
                continue
            kept.append(lemmatizer.lemmatize(token.lower()))
        return ' '.join(kept)

    return [_normalise(doc) for doc in docs]

# Load the train and test subsets of 20 Newsgroups with a fixed random_state.
data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    random_state=42,
)
data_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    random_state=42,
)

# Targets for each split.
label_train = data_train.target
label_test = data_test.target

# Text normalisation (alphabetic tokens only, names removed, lower-cased, lemmatized).
cleaned_train = clean_text(data_train.data)
cleaned_test = clean_text(data_test.data)

# Fit the TF-IDF vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

# Linear-kernel SVM and the candidate values of C to search over.
svc_libsvm = SVC(kernel="linear")
parameters: dict = {"C": (0.1, 1, 10, 100)}


In [12]:
from sklearn.model_selection import GridSearchCV

# Search over the candidate C values with 3-fold cross-validation,
# using all available cores (n_jobs=-1).
grid_Search = GridSearchCV(
    estimator=svc_libsvm,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

In [13]:
import timeit

# `cleaned_train` and `term_docs_train` are already produced by the
# preprocessing cell above; they are deliberately not recomputed here, since
# re-cleaning the corpus and re-fitting the shared TF-IDF vectorizer would
# only repeat identical work.

# Time the grid search: 4 candidate C values, each evaluated with 3-fold CV.
start_time = timeit.default_timer()
grid_Search.fit(term_docs_train, label_train)
print(timeit.default_timer() - start_time)

272.458041343


In [15]:
# Show the parameter setting (value of C) selected by the fitted grid search.
grid_Search.best_params_

{'C': 10}