In [1]:
import pandas as pd
from textblob import TextBlob
import nltk

In [2]:
# Sample sentence for the POS-tagging / chunking demo; note the three
# consecutive determiners ("a an a") ahead of the first noun.
myString = "John found a an a new coach and a new bed in his new apartment"

In [3]:
# Wrap the sentence in a TextBlob and display its (word, POS-tag) pairs.
Output = TextBlob(myString)
Output.tags

[('John', 'NNP'),
 ('found', 'VBD'),
 ('a', 'DT'),
 ('an', 'DT'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('coach', 'NN'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('bed', 'NN'),
 ('in', 'IN'),
 ('his', 'PRP$'),
 ('new', 'JJ'),
 ('apartment', 'NN')]

In [4]:
# Noun-phrase chunk grammars written as NLTK tag patterns.
#   reg_exp : at most one determiner, any number of adjectives, then a noun.
#   reg_exp1: any number of determiners, at most one adjective, then a noun.
# Only reg_exp is handed to the parser below; reg_exp1 is defined but unused here.
reg_exp, reg_exp1 = "NP: {<DT>?<JJ>*<NN>}", "NP: {<DT>*<JJ>?<NN>}"
rp = nltk.chunk.RegexpParser(grammar=reg_exp)

In [5]:
# Chunk the tagged sentence with the NP grammar and print the resulting tree.
output = rp.parse(Output.tags)
print(output)

(S
  John/NNP
  found/VBD
  a/DT
  an/DT
  (NP a/DT new/JJ coach/NN)
  and/CC
  (NP a/DT new/JJ bed/NN)
  in/IN
  his/PRP$
  (NP new/JJ apartment/NN))


In [6]:
#output.draw()

In [26]:
# Second example sentence: tag it and display the (word, POS-tag) pairs.
myString = "The little yellow dog barked at the cat."
output = TextBlob(myString)
output.tags

[('The', 'DT'),
 ('little', 'JJ'),
 ('yellow', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('cat', 'NN')]

In [27]:
# Chinking grammar: first chunk every token into a single NP, then carve the
# runs of VBD / IN tags back out of it.
reg_exp = r"""NP:{<.*>+}    # chunk everything
        }<VBD|IN>+{         # chink sequences of VBD and IN
            """
rp = nltk.chunk.RegexpParser(reg_exp)
# Tag straight from myString instead of reading `output.tags`: this cell
# rebinds `output` to the parse tree, so reading `output.tags` here would raise
# AttributeError the second time the cell is run.
output = rp.parse(TextBlob(myString).tags)
print(output)

(S
  (NP The/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [16]:
#output.draw()

***Compare Text Classification using scikit-learn***

In [28]:
from sklearn.datasets import fetch_20newsgroups

In [29]:
# Fetch the 20 newsgroups corpus; subset='all' requests the train and test
# portions together (the data is re-split later with train_test_split).
news = fetch_20newsgroups(subset='all')

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

Grid search and randomized search hyperparameter tuning

In [87]:
from sklearn.metrics import roc_auc_score, accuracy_score
def train_test(classifier, X, y, test_size=0.25, random_state=48, report_roc=False):
    """Fit ``classifier`` on a train split of (X, y) and print its accuracy.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit``, ``predict`` and ``score`` (for example a
        scikit-learn ``Pipeline``). It is fitted in place.
    X, y : sequences of equal length
        Samples and their labels; handed to ``train_test_split`` unchanged.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 48
        Seed for the split, so repeated calls with the defaults evaluate on
        the same partition.
    report_roc : bool, default False
        When True, additionally print the one-vs-one ROC AUC computed from
        ``predict_proba``. A classifier without ``predict_proba`` gets a
        one-line notice instead of an AttributeError.

    Returns
    -------
    The same ``classifier`` object, fitted on the training split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    classifier.fit(X_train, y_train)

    # Accuracy as reported by the estimator's own score() method.
    print('Classifier accuracy for train is {}'.format(classifier.score(X_train, y_train)))
    print('Classifier accuracy for test is {}'.format(classifier.score(X_test, y_test)))

    # Accuracy recomputed from explicit predictions.
    y_pred_test = classifier.predict(X_test)
    y_pred_train = classifier.predict(X_train)
    print('Accuracy Score on train is {}'.format(accuracy_score(y_train, y_pred_train)))
    print('Accuracy Score on test is {}'.format(accuracy_score(y_test, y_pred_test)))

    if report_roc:
        # Multiclass ROC AUC needs per-class probabilities, which not every
        # estimator provides, hence the capability check.
        if hasattr(classifier, 'predict_proba'):
            print('ROC score on train is {}'.format(
                roc_auc_score(y_train, classifier.predict_proba(X_train), multi_class='ovo')))
            print('ROC score on test is {}'.format(
                roc_auc_score(y_test, classifier.predict_proba(X_test), multi_class='ovo')))
        else:
            print('ROC score skipped: classifier has no predict_proba')

    return classifier

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Baseline trial: default TF-IDF features fed into multinomial naive Bayes.
trial1 = Pipeline(
    steps=[
        ("Vectorizer", TfidfVectorizer()),
        ("classifier", MultinomialNB()),
    ]
)

In [71]:
# Fit and score the baseline pipeline on the newsgroup texts and their targets.
train_test(trial1, news.data, news.target)

Classifier accuracy for train is 0.9254987972265459
Classifier accuracy for test is 0.8535653650254669
Accuracy Score on train is 0.9254987972265459
Accuracy Score on test is 0.8535653650254669
ROC score on train is 0.9970487444908809
ROC score on test is 0.9914242313650873


In [38]:
from nltk.corpus import stopwords

In [72]:
# Trial 2: TF-IDF with NLTK's English stop-word list, multinomial naive Bayes
# with its default smoothing.
trial2 = Pipeline(
    steps=[
        ("Vectorizer", TfidfVectorizer(stop_words=stopwords.words("english"))),
        ("classifier", MultinomialNB()),
    ]
)
train_test(trial2, news.data, news.target)

Classifier accuracy for train is 0.9472194707796802
Classifier accuracy for test is 0.8828522920203735
Accuracy Score on train is 0.9472194707796802
Accuracy Score on test is 0.8828522920203735
ROC score on train is 0.9982557303115578
ROC score on test is 0.993779872933781


In [73]:
# Trial 3: TF-IDF with NLTK's English stop-word list, naive Bayes smoothing
# lowered to alpha=0.05.
trial3 = Pipeline(
    steps=[
        ("Vectorizer", TfidfVectorizer(stop_words=stopwords.words("english"))),
        ("classifier", MultinomialNB(alpha=0.05)),
    ]
)
train_test(trial3, news.data, news.target)

Classifier accuracy for train is 0.9898118013301259
Classifier accuracy for test is 0.91553480475382
Accuracy Score on train is 0.9898118013301259
Accuracy Score on test is 0.91553480475382
ROC score on train is 0.99989670940084
ROC score on test is 0.9965502572388948


In [74]:
import string
# Trial 4: stop-word list extended with the single punctuation characters, and
# min_df=5 passed to the vectorizer; naive Bayes with alpha=0.05.
trial4 = Pipeline(
    steps=[
        (
            "Vectorizer",
            TfidfVectorizer(
                stop_words=stopwords.words("english") + list(string.punctuation),
                min_df=5,
            ),
        ),
        ("classifier", MultinomialNB(alpha=0.05)),
    ]
)
train_test(trial4, news.data, news.target)

Classifier accuracy for train is 0.9821706523277204
Classifier accuracy for test is 0.9100169779286927
Accuracy Score on train is 0.9821706523277204
Accuracy Score on test is 0.9100169779286927
ROC score on train is 0.9996705432933529
ROC score on test is 0.9963209926197707


In [88]:
from sklearn import svm
# Trial 5: stop words plus punctuation and min_df=5 in the vectorizer, with a
# linear support vector classifier at its default settings.
trial5 = Pipeline(
    steps=[
        (
            "Vectorizer",
            TfidfVectorizer(
                stop_words=stopwords.words("english") + list(string.punctuation),
                min_df=5,
            ),
        ),
        ("classifier", svm.LinearSVC()),
    ]
)
train_test(trial5, news.data, news.target)

Classifier accuracy for train is 0.998584972406962
Classifier accuracy for test is 0.9276315789473685
Accuracy Score on train is 0.998584972406962
Accuracy Score on test is 0.9276315789473685


In [89]:
from sklearn.ensemble import GradientBoostingClassifier
# Trial 6: stop words plus punctuation and min_df=5 in the vectorizer, with
# gradient boosting limited to 2 estimators.
trial6 = Pipeline(
    steps=[
        (
            "Vectorizer",
            TfidfVectorizer(
                stop_words=stopwords.words("english") + list(string.punctuation),
                min_df=5,
            ),
        ),
        ("classifier", GradientBoostingClassifier(n_estimators=2)),
    ]
)
train_test(trial6, news.data, news.target)

Classifier accuracy for train is 0.6240979199094382
Classifier accuracy for test is 0.6005942275042445
Accuracy Score on train is 0.6240979199094382
Accuracy Score on test is 0.6005942275042445


In [54]:
# Trial 7: stop words plus punctuation and min_df=5 in the vectorizer, with
# gradient boosting using 10 estimators.
trial7 = Pipeline(
    steps=[
        (
            "Vectorizer",
            TfidfVectorizer(
                stop_words=stopwords.words("english") + list(string.punctuation),
                min_df=5,
            ),
        ),
        ("classifier", GradientBoostingClassifier(n_estimators=10)),
    ]
)
train_test(trial7, news.data, news.target)

Classifier accuracy for train is 0.7537851988113768
Classifier accuracy for test is 0.7200764006791172


In [55]:
from sklearn.ensemble import RandomForestClassifier
# Trial 8: stop words plus punctuation and min_df=5 in the vectorizer, with a
# 2-tree random forest.
# The forest is seeded so the printed accuracies can be reproduced on re-run;
# 48 matches the seed train_test uses for the train/test split.
trial8 = Pipeline([
    ("Vectorizer", TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation), min_df=5)),
    ('classifier', RandomForestClassifier(n_estimators=2, random_state=48)),
])
train_test(trial8, news.data, news.target)

Classifier accuracy for train is 0.8049384462997028
Classifier accuracy for test is 0.48896434634974534


In [56]:
# Same vectorizer settings with a 10-tree random forest.
# NOTE: this rebinds `trial8`, replacing the 2-tree pipeline built in the
# previous cell; the name is kept so later references still resolve.
# The forest is seeded so the printed accuracies can be reproduced on re-run;
# 48 matches the seed train_test uses for the train/test split.
trial8 = Pipeline([
    ("Vectorizer", TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation), min_df=5)),
    ('classifier', RandomForestClassifier(n_estimators=10, random_state=48)),
])
train_test(trial8, news.data, news.target)

Classifier accuracy for train is 0.9969576906749682
Classifier accuracy for test is 0.7468166383701188
