**Import the data**

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load both predefined splits with the same shuffle seed so the document
# order is reproducible from run to run.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=subset_name, shuffle=True, random_state=42)
    for subset_name in ('train', 'test')
)

In [2]:
# twenty_train and twenty_test are dictionary-like objects; list the keys
# (fields) that are available on them.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Show the names of the newsgroup categories.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Report the number of documents and categories in each split
# (training split first, then test split).
for _bunch in (twenty_train, twenty_test):
    print(len(_bunch.filenames), "documents")
    print(len(_bunch.target_names), "categories")

11314 documents
20 categories
7532 documents
20 categories


In [5]:
# Separate each split into model inputs (the `data` field) and labels
# (the `target` field).
X_train, y_train = twenty_train.data, twenty_train.target
X_test, y_test = twenty_test.data, twenty_test.target

**Define the train function for our model**

In [6]:
import time
from sklearn.metrics import accuracy_score

def train(classifier,X_train,y_train,X_test,y_test):
    """Fit ``classifier`` on the training data and report its test accuracy.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The model (or pipeline) to train. It is fitted in place.
    X_train, y_train
        Training inputs and their labels, passed to ``classifier.fit``.
    X_test, y_test
        Evaluation inputs and their true labels.

    Returns
    -------
    The same ``classifier`` object, after fitting.

    Notes
    -----
    Prints the accuracy on ``X_test`` and the elapsed time of the ``fit``
    call only; prediction time is not included in the reported duration.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # difference of time.time() values can.
    start = time.perf_counter()
    classifier.fit(X_train, y_train)
    elapsed = time.perf_counter() - start

    predicted = classifier.predict(X_test)

    print("Accuracy: ", accuracy_score(y_test,predicted))
    print("Time duration: " + str(elapsed))
    return classifier

**Linear SVC without tuning**

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import string
# Baseline model: TF-IDF features with English stop words removed, fed to a
# LinearSVC that uses its default settings.
model_1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', LinearSVC()),
])
train(model_1, X_train, y_train, X_test, y_test)

Accuracy:  0.851035581518853
Time duration: 9.913275957107544


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('classifier', LinearSVC())])

**Grid Search on Linear SVC**

In [8]:
from sklearn.model_selection import GridSearchCV
# Candidate settings: unigrams vs. unigrams+bigrams for the vectorizer, and
# three values of the SVM regularisation parameter C. Keys follow the
# pipeline's "<step name>__<parameter>" convention.
parameters = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__C': (1.5, 1.75, 2),
}
# n_jobs=-1 lets the search use all available CPU cores.
gs_clf_svm = GridSearchCV(estimator=model_1, param_grid=parameters, n_jobs=-1)
gs_clf_svm.fit(X_train, y_train)
gs_clf_svm.best_params_

{'classifier__C': 2, 'vectorizer__ngram_range': (1, 2)}

In [9]:
# Evaluate the fitted grid search on the test split.
predicted = gs_clf_svm.predict(X_test)

print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=predicted))

Accuracy:  0.8588688263409453


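**By trial and error, the highest accuracy we have found so far is obtained with C = 1.55, which is marginally higher than the accuracy of the model selected by GridSearchCV (0.8590 vs. 0.8589). Note that this value of C was picked by comparing accuracy on the test set, so the result is likely to be slightly optimistic.**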

In [10]:
# Hand-tuned model: unigram+bigram TF-IDF features (English stop words
# removed) with a LinearSVC using C=1.55.
model_2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('classifier', LinearSVC(C=1.55)),
])
train(model_2, X_train, y_train, X_test, y_test)

Accuracy:  0.8590015932023367
Time duration: 38.39503049850464


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('classifier', LinearSVC(C=1.55))])