In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Machine Learning: Text Classification Assignment

In [2]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from bs4 import BeautifulSoup

import requests

<IPython.core.display.Javascript object>

### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [3]:
# regex syntax
PATH = "./AP_News/AP_News"

DOC_PATTERN = r".*\.txt"
CAT_PATTERN = r"([\w_\w]+)/.*"

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern=CAT_PATTERN)

<IPython.core.display.Javascript object>

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [4]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

<IPython.core.display.Javascript object>

In [5]:
categories

['health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'health',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 'politics',
 

<IPython.core.display.Javascript object>

### Preprocess the corpus, ensuring to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [6]:
def preprocess(docs):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []
    for doc in docs:
        tokenized = word_tokenize(doc)
        cleaned = [
            stemmer.stem(lemmatizer.lemmatize(token.lower()))
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]
        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)
    return preprocessed

<IPython.core.display.Javascript object>

In [7]:
preprocessed = preprocess(docs)

<IPython.core.display.Javascript object>

### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [8]:
X_train, X_test, y_train, y_test = tts(docs, categories, test_size=0.3)

<IPython.core.display.Javascript object>

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [9]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rf", RandomForestClassifier())])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

<IPython.core.display.Javascript object>

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [10]:
pred = pipe.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

      health       0.63      0.80      0.71        15
    politics       0.87      0.72      0.79        18
      sports       0.74      0.93      0.82        15
        tech       0.85      0.61      0.71        18

    accuracy                           0.76        66
   macro avg       0.77      0.77      0.76        66
weighted avg       0.78      0.76      0.76        66



<IPython.core.display.Javascript object>

### Perform 10-fold cross validation and obtain the averge F1 score across all the folds.

In [11]:
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring="f1_macro")

NameError: name 'preprocessed' is not defined

<IPython.core.display.Javascript object>

In [None]:
scores.mean()

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [None]:
def get_url_text(url):
    response = requests.get(url)
    content = response.text

    TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "li"]
    soup = BeautifulSoup(content, "lxml")
    text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
    text = " ".join(text_list)
    return text

In [None]:
url = "https://www.nytimes.com/2019/11/25/business/uber-london.html"

In [None]:
text = get_url_text(url)
cleaned = preprocess([text])
pipe.predict(cleaned)[0]