**Import the data**

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the 'train' split first, then the 'test' split, both shuffled with the same fixed seed.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True, random_state=42)
    for split_name in ('train', 'test')
)

In [2]:
# twenty_train (like twenty_test) is a dictionary-like object; list the keys it exposes.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Show the category names attached to the training split.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Report the number of documents and categories, first for the train split, then for the test split.
for split in (twenty_train, twenty_test):
    print(len(split.filenames), "documents")
    print(len(split.target_names), "categories")

11314 documents
20 categories
7532 documents
20 categories


In [5]:
# Short aliases for each split: `data` is used as the model input, `target` as the labels.
X_train, y_train = twenty_train.data, twenty_train.target
X_test, y_test = twenty_test.data, twenty_test.target

**Define the train function for our model**

In [6]:
import time
from sklearn.metrics import accuracy_score

def train(classifier,X_train,y_train,X_test,y_test):
    """Fit ``classifier``, print its test accuracy and fit time, and return it.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)`` methods
        The estimator (or pipeline) to train. It is fitted in place.
    X_train, y_train
        Training inputs and labels, passed unchanged to ``classifier.fit``.
    X_test, y_test
        Held-out inputs passed to ``classifier.predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    The same ``classifier`` object that was passed in, after fitting.

    Notes
    -----
    The printed "Time duration" covers only the ``fit`` call; prediction and
    scoring happen after the clock is stopped.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    classifier.fit(X_train, y_train)
    end = time.perf_counter()

    predicted = classifier.predict(X_test)

    print("Accuracy: ", accuracy_score(y_test,predicted))
    print("Time duration: " + str(end - start))
    return classifier


**Model 1: n_estimators = 100**

In [7]:
# Baseline: TF-IDF features (English stop words removed) feeding a random forest.
# n_estimators is written out as 100, the value named in this section's title,
# rather than left to the library default, which has differed between
# scikit-learn releases.
model_1 = Pipeline([('vectorizer',TfidfVectorizer(stop_words='english')),
                     ('clf', RandomForestClassifier(n_estimators = 100,random_state=42,n_jobs=-1)),
                     ])
train(model_1,X_train,y_train,X_test,y_test)

Accuracy:  0.7833244822092406
Time duration: 37.7240035533905


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf', RandomForestClassifier(n_jobs=-1, random_state=42))])

**Model 2: n_estimators = 200**

In [8]:
# TF-IDF features (English stop words removed) feeding a 200-tree random forest.
vectorizer_2 = TfidfVectorizer(stop_words='english')
forest_2 = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model_2 = Pipeline([('vectorizer', vectorizer_2), ('clf', forest_2)])
# train() returns the fitted pipeline; as the last expression it is displayed below the cell.
train(model_2, X_train, y_train, X_test, y_test)

Accuracy:  0.7959373340414233
Time duration: 71.93949270248413


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf',
                 RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                        random_state=42))])

**Model 3: n_estimators = 300**

In [9]:
# TF-IDF features (English stop words removed) feeding a 300-tree random forest.
vectorizer_3 = TfidfVectorizer(stop_words='english')
forest_3 = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
model_3 = Pipeline([('vectorizer', vectorizer_3), ('clf', forest_3)])
# train() returns the fitted pipeline; as the last expression it is displayed below the cell.
train(model_3, X_train, y_train, X_test, y_test)

Accuracy:  0.8019118428040362
Time duration: 102.43981742858887


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf',
                 RandomForestClassifier(n_estimators=300, n_jobs=-1,
                                        random_state=42))])

**Model 4: n_estimators = 400**

In [10]:
# TF-IDF features (English stop words removed) feeding a 400-tree random forest.
vectorizer_4 = TfidfVectorizer(stop_words='english')
forest_4 = RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1)
model_4 = Pipeline([('vectorizer', vectorizer_4), ('clf', forest_4)])
# train() returns the fitted pipeline; as the last expression it is displayed below the cell.
train(model_4, X_train, y_train, X_test, y_test)

Accuracy:  0.8015135422198619
Time duration: 137.673015832901


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf',
                 RandomForestClassifier(n_estimators=400, n_jobs=-1,
                                        random_state=42))])

**Model 5: n_estimators = 1000**

In [11]:
# TF-IDF features (English stop words removed) feeding a 1000-tree random forest.
vectorizer_5 = TfidfVectorizer(stop_words='english')
forest_5 = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
model_5 = Pipeline([('vectorizer', vectorizer_5), ('clf', forest_5)])
# train() returns the fitted pipeline; as the last expression it is displayed below the cell.
train(model_5, X_train, y_train, X_test, y_test)

Accuracy:  0.8021773765268189
Time duration: 352.22062945365906


Pipeline(steps=[('vectorizer', TfidfVectorizer(stop_words='english')),
                ('clf',
                 RandomForestClassifier(n_estimators=1000, n_jobs=-1,
                                        random_state=42))])