# Using the Pipeline Class from scikit-learn

In [59]:
from sklearn.datasets import fetch_20newsgroups  # helper that downloads the 20 Newsgroups dataset and loads it into Python
from sklearn.pipeline import Pipeline # The class itself
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer # a numerical statistic that reflects word importance
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd

## Exploratory Data Analysis

In [25]:
# Fetch both splits of the 20 Newsgroups corpus (downloading them when they are
# not available locally) and keep them together in train-then-test order.
train, test = (
    fetch_20newsgroups(subset=split_name, shuffle=True, download_if_missing=True)
    for split_name in ('train', 'test')
)

datasets = [train, test]

In [60]:
# Count Vectorizer that Lemmatizes words too
class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer lemmatizes every token it yields.

    The tokens produced by the parent class's analyzer are each passed through
    ``WordNetLemmatizer.lemmatize`` (called with its default arguments) before
    they are counted.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to an iterable of lemmatized tokens."""
        base_analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        lemmatize = WordNetLemmatizer().lemmatize

        def analyze(doc):
            # Lemmatization runs on whatever the parent analyzer emits for `doc`.
            return (lemmatize(token) for token in base_analyzer(doc))

        return analyze

In [45]:
# handy function to convert the datasets to pandas dataframes
def convert_to_pd(dataset, name):
    """Build a two-column DataFrame from a dataset object.

    Parameters
    ----------
    dataset : object
        Any object exposing ``data`` and ``target`` attributes that
        ``pd.Series`` can consume.
    name : str
        Column label used for the values taken from ``dataset.data``.

    Returns
    -------
    pandas.DataFrame
        A frame with a ``name`` column holding ``dataset.data`` and a
        ``'target'`` column holding ``dataset.target``.
    """
    features = pd.Series(dataset.data)
    labels = pd.Series(dataset.target)
    # Start from the data column, then attach the labels aligned on its index.
    return features.to_frame(name).assign(target=labels)

In [46]:
# One DataFrame per split, in the same order as `datasets`; the text column is
# labelled "message".
pd_datasets = [
    convert_to_pd(split, "message")
    for split in datasets
]

In [47]:
# Preview the first rows of the training DataFrame (index 0 is the train split).
pd_datasets[0].head()

Unnamed: 0,message,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [48]:
# Preview the first rows of the test DataFrame (index 1 is the test split).
pd_datasets[1].head()

Unnamed: 0,message,target
0,From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. ...,7
1,From: Rick Miller <rick@ee.uwm.edu>\nSubject: ...,5
2,From: mathew <mathew@mantis.co.uk>\nSubject: R...,0
3,From: bakken@cs.arizona.edu (Dave Bakken)\nSub...,17
4,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,19


## Defining the Model

In [50]:
# Load NLTK's English stop-word list. On a fresh environment the corpus is not
# installed yet and the lookup raises LookupError, so fetch it once and retry;
# this keeps the notebook runnable from a clean kernel/machine.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopwords = nltk.corpus.stopwords.words('english')

In [61]:
# Three-stage text classifier: lemmatizing bag-of-words counts -> TF-IDF
# weighting -> multinomial Naive Bayes.
nb_pipeline = Pipeline(steps=[
    ('vect', LemmaCountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

## Training the Model

In [62]:
# Fit the pipeline on the training messages and their target labels.
model = nb_pipeline.fit(
    X=pd_datasets[0]["message"],
    y=pd_datasets[0]["target"],
)

## Getting a Prediction

In [63]:
# Predict a target label for every message in the test split.
prediction = model.predict(
    X=pd_datasets[1]["message"],
)

In [64]:
# Share of test messages whose predicted label equals the true label,
# expressed as a percentage.
accuracy = 100 * np.mean(pd_datasets[1]["target"] == prediction)

In [65]:
# Display the test-set accuracy (already scaled to a percentage).
accuracy

81.2931492299522