**Import the data**

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load both predefined splits. Shuffling with a fixed random_state keeps the
# document order identical from one run to the next.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    random_state=42,
)

In [2]:
# twenty_train and twenty_test are dictionary-like objects; list the keys
# available on the training split.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Names of the newsgroup categories in the training split.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Report the size of each split: first the training set, then the test set.
print(f"{len(twenty_train.filenames)} documents")
print(f"{len(twenty_train.target_names)} categories")
print(f"{len(twenty_test.filenames)} documents")
print(f"{len(twenty_test.target_names)} categories")

11314 documents
20 categories
7532 documents
20 categories


In [5]:
# Short aliases for the documents (X) and their labels (y) of each split.
X_train, y_train = twenty_train.data, twenty_train.target
X_test, y_test = twenty_test.data, twenty_test.target

**Define the train function for our model**

In [6]:
import time
from sklearn.metrics import accuracy_score

def train(classifier,X_train,y_train,X_test,y_test):
    """Fit ``classifier`` on the training data and report its test accuracy.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training samples and their labels, passed straight to ``fit``.
    X_test, y_test
        Held-out samples and their labels, used to compute the accuracy.

    Returns
    -------
    The same ``classifier`` object, after fitting.

    Notes
    -----
    Prints the accuracy on the test data and the elapsed fitting time in
    seconds. The timer is stopped before ``predict`` runs, so prediction
    time is not included in the reported duration.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or turn negative) if the system wall clock is adjusted mid-run.
    start = time.perf_counter()
    classifier.fit(X_train, y_train)
    end = time.perf_counter()

    predicted = classifier.predict(X_test)

    print("Accuracy: ", accuracy_score(y_test,predicted))
    print("Time duration: " + str(end - start))
    return classifier

**Model 1: k-NN without tuning (n_neighbors=5)**

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
# Baseline: TF-IDF features (English stop words removed) feeding a k-NN
# classifier constructed with no arguments, i.e. its default settings.
model_1 = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('clf_1', KNeighborsClassifier()),
    ]
)
# train() returns the fitted pipeline, which becomes the cell's displayed value.
train(model_1, X_train, y_train, X_test, y_test)

Accuracy:  0.6757833244822092
Time duration: 5.5967793464660645


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf_1', KNeighborsClassifier())])

**Model 2: number of nearest neighbors (k=1)**

In [8]:
# Same TF-IDF front end as the baseline, with k-NN restricted to a single
# nearest neighbour.
model_2 = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('clf_2', KNeighborsClassifier(n_neighbors=1)),
    ]
)
# train() returns the fitted pipeline, which becomes the cell's displayed value.
train(model_2, X_train, y_train, X_test, y_test)

Accuracy:  0.6805629314922995
Time duration: 6.258819818496704


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf_2', KNeighborsClassifier(n_neighbors=1))])

**Model 3: number of nearest neighbors (k=2)**

In [9]:
# Same TF-IDF front end, with k-NN using the two nearest neighbours.
model_3 = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('clf_3', KNeighborsClassifier(n_neighbors=2)),
    ]
)
# train() returns the fitted pipeline, which becomes the cell's displayed value.
train(model_3, X_train, y_train, X_test, y_test)

Accuracy:  0.6464418481147106
Time duration: 5.910400390625


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf_3', KNeighborsClassifier(n_neighbors=2))])

**Model 4: number of nearest neighbors (k=3)**

In [10]:
# Same TF-IDF front end, with k-NN using the three nearest neighbours.
model_4 = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('clf_4', KNeighborsClassifier(n_neighbors=3)),
    ]
)
# train() returns the fitted pipeline, which becomes the cell's displayed value.
train(model_4, X_train, y_train, X_test, y_test)

Accuracy:  0.6666224110462029
Time duration: 7.210191011428833


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf_4', KNeighborsClassifier(n_neighbors=3))])

**Model 5: number of nearest neighbors (k=4)**

In [16]:
# TF-IDF features (English stop words removed) feeding a k-NN classifier that
# uses the four nearest neighbours.
# The classifier step is named 'clf_5' so that it follows the
# clf_<model number> convention of the other pipelines; it was previously
# 'clf_4', a leftover from copying the Model 4 cell.
model_5 = Pipeline([('vectorizer', TfidfVectorizer(stop_words ='english')),
                          ('clf_5', KNeighborsClassifier(n_neighbors=4)),
                        ])
# train() returns the fitted pipeline, which becomes the cell's displayed value.
train(model_5, X_train,y_train,X_test,y_test)

Accuracy:  0.6716675517790759
Time duration: 5.888701677322388


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf_4', KNeighborsClassifier(n_neighbors=4))])