# Text Classification Using the 20 Newsgroups Dataset

In [1]:
#Imports
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

### Loading the Training and Test Datasets

In [3]:
# Fetch the 'train' and 'test' subsets of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# The documents (.data) are the model inputs; the class labels (.target) are what we predict.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

### Text Cleaning and Pre-processing

In [19]:
# Tokenization: split a stream of text into words, phrases, symbols, or other tokens.
from nltk.tokenize import word_tokenize

text = "The United States of America (USA) or America, is a federal republic composed of 50 states"

# Punctuation such as "(" , ")" and "," is returned as separate tokens.
tokens = word_tokenize(text)
print(tokens)

['The', 'United', 'States', 'of', 'America', '(', 'USA', ')', 'or', 'America', ',', 'is', 'a', 'federal', 'republic', 'composed', 'of', '50', 'states']


In [20]:
# Stop words: remove very common words (e.g. "of", "or", "is", "a") that add little meaning.

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "The United States of America (USA) or America, is a federal republic composed of 50 states"

# A set gives fast membership checks while filtering.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

# Keep only the tokens that are not stop words.
# The comparison is case-sensitive, so the capitalised "The" is not removed.
# (Previously this list was built twice: once here and again with an equivalent loop.)
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)

['The', 'United', 'States', 'of', 'America', '(', 'USA', ')', 'or', 'America', ',', 'is', 'a', 'federal', 'republic', 'composed', 'of', '50', 'states']
['The', 'United', 'States', 'America', '(', 'USA', ')', 'America', ',', 'federal', 'republic', 'composed', '50', 'states']


In [21]:
# Case handling: show the sentence as written, fully lower-cased, and fully upper-cased.
text = "The United States of America (USA) or America, is a federal republic composed of 50 states"

# One print call; sep="\n" puts each variant on its own line.
print(text, text.lower(), text.upper(), sep="\n")

The United States of America (USA) or America, is a federal republic composed of 50 states
the united states of america (usa) or america, is a federal republic composed of 50 states
THE UNITED STATES OF AMERICA (USA) OR AMERICA, IS A FEDERAL REPUBLIC COMPOSED OF 50 STATES


In [28]:
# Stemming: reduce a word to its stem.
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["running","ran","run","runner"]

# Print one stem per line, in input order. With this stemmer "running" becomes "run",
# while the irregular form "ran" and the noun "runner" come back unchanged.
print("\n".join(map(ps.stem, example_words)))

run
ran
run
runner


In [32]:
# Lemmatization: map a word to its dictionary base form (lemma).

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
example_words = ["running","ran","runs","runners"]

# No part-of-speech tag is passed to lemmatize(), and only the plural forms are
# reduced here ("runs" -> "run", "runners" -> "runner"); "running" and "ran"
# come back unchanged.
print("\n".join(lemmatizer.lemmatize(w) for w in example_words))

running
ran
run
runner


In [35]:
# Removing special characters: keep only purely alphabetic tokens, lower-cased.
import nltk

s = "I can't do this now, because I'm so tired.  Please give me some time. @ sd  4 232"

# str.isalpha() rejects punctuation, "@", numbers, and any token containing an
# apostrophe, which is why only "ca" survives from "can't" and "i" from "I'm".
words = [token.lower() for token in nltk.word_tokenize(s) if token.isalpha()]

print(words)

['i', 'ca', 'do', 'this', 'now', 'because', 'i', 'so', 'tired', 'please', 'give', 'me', 'some', 'time', 'sd']


### K-Nearest Neighbors Classifier

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Token counts -> TF-IDF weighting -> k-nearest-neighbours classifier (default settings).
# Pipeline.fit returns the fitted pipeline itself, so construction and training are chained.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]).fit(X_train, y_train)

# Predict on the test documents and print per-class precision / recall / F1.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.43      0.76      0.55       319
           1       0.50      0.61      0.55       389
           2       0.56      0.57      0.57       394
           3       0.53      0.58      0.56       392
           4       0.59      0.56      0.57       385
           5       0.69      0.60      0.64       395
           6       0.58      0.45      0.51       390
           7       0.75      0.69      0.72       396
           8       0.84      0.81      0.82       398
           9       0.77      0.72      0.74       397
          10       0.85      0.84      0.84       399
          11       0.76      0.84      0.80       396
          12       0.70      0.50      0.58       393
          13       0.82      0.49      0.62       396
          14       0.79      0.76      0.78       394
          15       0.75      0.76      0.76       398
          16       0.70      0.73      0.72       364
          17       0.62    

### Decision Tree

In [6]:
from sklearn import tree

# Token counts -> TF-IDF weighting -> decision tree classifier.
# random_state is fixed so that re-running the notebook yields the same tree and the
# same metrics; without a seed the fitted tree can vary between runs.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', tree.DecisionTreeClassifier(random_state=42)),
                     ])

text_clf.fit(X_train, y_train)

# Predict on the test documents and print per-class precision / recall / F1.
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.48      0.48      0.48       319
           1       0.43      0.43      0.43       389
           2       0.52      0.58      0.55       394
           3       0.49      0.43      0.46       392
           4       0.56      0.57      0.57       385
           5       0.48      0.47      0.48       395
           6       0.66      0.71      0.68       390
           7       0.60      0.60      0.60       396
           8       0.73      0.74      0.73       398
           9       0.51      0.54      0.53       397
          10       0.65      0.65      0.65       399
          11       0.75      0.70      0.72       396
          12       0.34      0.33      0.34       393
          13       0.53      0.45      0.49       396
          14       0.63      0.64      0.64       394
          15       0.72      0.70      0.71       398
          16       0.50      0.61      0.55       364
          17       0.74    

### Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Token counts -> TF-IDF weighting -> random forest of 100 trees.
# random_state is fixed so that the forest's random sampling is repeatable and
# re-running the notebook reproduces the same metrics.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
                     ])

text_clf.fit(X_train, y_train)

# Predict on the test documents and print per-class precision / recall / F1.
predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.72      0.65      0.68       319
           1       0.56      0.68      0.61       389
           2       0.66      0.77      0.71       394
           3       0.61      0.62      0.62       392
           4       0.74      0.75      0.74       385
           5       0.76      0.70      0.73       395
           6       0.73      0.91      0.81       390
           7       0.82      0.77      0.79       396
           8       0.88      0.90      0.89       398
           9       0.82      0.88      0.85       397
          10       0.90      0.93      0.91       399
          11       0.88      0.91      0.89       396
          12       0.66      0.49      0.56       393
          13       0.84      0.67      0.74       396
          14       0.82      0.89      0.85       394
          15       0.70      0.92      0.79       398
          16       0.67      0.87      0.75       364
          17       0.95    

### SVM

In [13]:
from sklearn.svm import LinearSVC

# Token counts -> TF-IDF weighting -> linear support vector classifier (default settings).
# Pipeline.fit returns the fitted pipeline itself, so construction and training are chained.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
]).fit(X_train, y_train)

# Predict on the test documents and print per-class precision / recall / F1.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       319
           1       0.76      0.80      0.78       389
           2       0.77      0.73      0.75       394
           3       0.71      0.76      0.74       392
           4       0.84      0.86      0.85       385
           5       0.87      0.76      0.81       395
           6       0.83      0.91      0.87       390
           7       0.92      0.91      0.91       396
           8       0.95      0.95      0.95       398
           9       0.92      0.95      0.93       397
          10       0.96      0.98      0.97       399
          11       0.93      0.94      0.93       396
          12       0.81      0.79      0.80       393
          13       0.90      0.87      0.88       396
          14       0.90      0.93      0.92       394
          15       0.84      0.93      0.88       398
          16       0.75      0.92      0.82       364
          17       0.97    

The linear SVM gives the most accurate results of the four classifiers compared above.