18 th July - NLP

In [8]:
from textblob import TextBlob

In [9]:
myString = "John found a new coach and a new bed in his new apartment."

In [10]:
opt = TextBlob(myString)


In [11]:
opt.tags

MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.


In [5]:
import nltk

In [6]:
# reg_exp = "NP: {<DT>?<JJ>*<NN>}"
reg_exp = "NP:{<DT>{1,3}<JJ>{1,3}<NN>{1,3}}"
rp = nltk.chunk.RegexpParser(reg_exp)

In [7]:
output = rp.parse(opt.tags)
print(output)

(S
  John/NNP
  found/VBD
  (NP a/DT new/JJ coach/NN)
  and/CC
  (NP a/DT new/JJ bed/NN)
  in/IN
  his/PRP$
  new/JJ
  apartment/NN)


In [8]:
#output.draw()

In [9]:
myString = "The little yellow dog barked at the cat."

In [10]:
reg_exp = r""" NP:
    {<.*>+}     # Chunk everything
    }<VBD|IN>+{ # Chink sequences of VBD nad IN
"""

In [11]:
rp = nltk.chunk.RegexpParser(reg_exp)
output = rp.parse(opt.tags)
print(output)

(S
  (NP John/NNP)
  found/VBD
  (NP a/DT new/JJ coach/NN and/CC a/DT new/JJ bed/NN)
  in/IN
  (NP his/PRP$ new/JJ apartment/NN))


In [12]:
# output.draw()

In [13]:
myString = "The little yellow dog barked at the cat and the cat escaped to the house."

In [14]:
reg_exp = r""" NP:
    {<.*>+}     # Chunk everything
    }<VBD|IN>+{ # Chink sequences of VBD nad IN
"""

In [15]:
rp = nltk.chunk.RegexpParser(reg_exp)
output = rp.parse(opt.tags)
print(output)

(S
  (NP John/NNP)
  found/VBD
  (NP a/DT new/JJ coach/NN and/CC a/DT new/JJ bed/NN)
  in/IN
  (NP his/PRP$ new/JJ apartment/NN))


In [16]:
reg_exp = r""" NP: 
{(<JJ>{1,2}<NN>{1,2})|(<DT>{1,2}<NN>{1,2})} # chunking 
}<VBD|IN>+{ # chinking
"""

In [17]:
rp = nltk.chunk.RegexpParser(reg_exp)
output = rp.parse(opt.tags)
print(output)


(S
  John/NNP
  found/VBD
  a/DT
  (NP new/JJ coach/NN)
  and/CC
  a/DT
  (NP new/JJ bed/NN)
  in/IN
  his/PRP$
  (NP new/JJ apartment/NN))


In [18]:
my_string = "John found a new coach and a new bed in his new apartment and the cat escaped to the house"
opt = TextBlob(my_string)
reg_exp = r""" NP: 
{(<JJ>{1,2}<NN>{1,2})|(<DT>{1,2}<NN>{1,2})} # chunking 
}<VBD|IN>+{ # chinking
"""
rp = nltk.chunk.RegexpParser(reg_exp)
output = rp.parse(opt.tags)
print(output)

(S
  John/NNP
  found/VBD
  a/DT
  (NP new/JJ coach/NN)
  and/CC
  a/DT
  (NP new/JJ bed/NN)
  in/IN
  his/PRP$
  (NP new/JJ apartment/NN)
  and/CC
  (NP the/DT cat/NN)
  escaped/VBD
  to/TO
  (NP the/DT house/NN))


# Module 8

In [16]:
from sklearn.datasets import fetch_20newsgroups

In [17]:
news = fetch_20newsgroups(subset="all")

In [21]:
print(len(news.data))

18846


In [22]:
import pandas as pd
data = {
    "news" : list(news.data),
    "target" : list(news.target)
}
df = pd.DataFrame(data)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [23]:
df.head()

Unnamed: 0,news,target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4


In [24]:
count_df = df.target.value_counts().reset_index()

Extract features from text files

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [13]:
def train_test(classifier, X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25,random_state=48)
    classifier.fit(X_train,y_train)
    print("Train accuracy: {}".format(classifier.score(X_train, y_train)))
    print("Test accuracy: {}".format(classifier.score(X_test, y_test)))
    return classifier

In [27]:
from sklearn.naive_bayes import MultinomialNB

## Trail 1 - NB, Tfidf

In [28]:
trail1 = Pipeline([("vectorizer", TfidfVectorizer()),
                   ("classifier",MultinomialNB())])

In [29]:
train_test(trail1, news.data, news.target)

Train accuracy: 0.9254987972265459
Test accuracy: 0.8535653650254669


## Trail 2 - NB, Tfidf + stopwords

In [7]:
from nltk.corpus import stopwords

In [31]:
trail2 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english"))),
                   ("classifier",MultinomialNB())])

In [32]:
train_test(trail2, news.data, news.target)

Train accuracy: 0.9472194707796802
Test accuracy: 0.8828522920203735


## Trail 3 - NB + alpha, Tfidf + stopwords

In [33]:
trail3 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english"))),
                   ("classifier",MultinomialNB(alpha=0.05))])

In [34]:
train_test(trail3, news.data, news.target)

Train accuracy: 0.9898118013301259
Test accuracy: 0.91553480475382


## Trail 4 - SVM, Tfidf + stopwords + punctuation

In [8]:
import string
from sklearn import svm

In [36]:
trail4 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english")+list(string.punctuation),min_df=5)),
                   ("classifier", svm.LinearSVC())])

In [37]:
train_test(trail4, news.data, news.target)



Train accuracy: 0.998584972406962
Test accuracy: 0.9276315789473685


## Trail 5 - XGBoost, trail 4 all - Tfidf

In [None]:
!pip install xgboost

In [41]:
from xgboost import XGBClassifier
import xgboost as xgb

In [44]:
trail5 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english")+list(string.punctuation),min_df=5)),
                   ("classifier", xgb.XGBClassifier())])

In [45]:
train_test(trail5, news.data, news.target)

Train accuracy: 0.999929248620348
Test accuracy: 0.8520797962648556


## Trail 6 - Random Forest, trail 4 all - Tfidf

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf = RandomForestClassifier()

In [21]:
trail6 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words("english")+list(string.punctuation),min_df=5)),
                   ("classifier", rf)])

In [22]:
train_test(trail6, news.data, news.target)

Train accuracy: 0.999929248620348
Test accuracy: 0.8514431239388794


| Trial Number | Trial Details                             | Train Accuracy       | Test Accuracy        | Notes       |
|--------------|-------------------------------------------|----------------------|----------------------|-------------|
| 1            | MultinomialNB, vectorizer TFidf           | 0.9254987972265459   | 0.8535653650254669   | Normal      |
| 2            | NB, tfidf + stopwords                     | 0.9472194707796802   | 0.8828522920203735   | Normal      |
| 3            | NB + alpha, tfidf + stopwords             | 0.9898118013301259   | 0.91553480475382     | Normal      |
| 4            | SVM(classifier), tfidf + stopwords + punctuation | 0.998584972406962 | 0.9276315789473685   | Overfitted  |
| 5            | XGBoost, tfidf - trail 4 all              | 0.999929248620348    | 0.8520797962648556   | Overfitted  |
| 6            | Random forest, tfidf - trail 4 all        | 0.999929248620348    | 0.8514431239388794   | Overfitted  |
