In [1]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
import collections
import re
import nltk

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Conv1D, Conv2D, MaxPool2D, SimpleRNN

# NLP Demos

## Recap: text preprocessing

In [49]:
# Download the plain-text file from Project Gutenberg.
response = requests.get(
    'https://www.gutenberg.org/cache/epub/67098/pg67098.txt',
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Fail loudly on an HTTP error; otherwise an error page would be processed as
# if it were the book and only fail later with a confusing ValueError.
response.raise_for_status()
pooh_raw = response.text

# Keep the text from this sentence onward; everything before it is discarded.
# str.index raises ValueError if the sentence is not present in the download.
starting_index = pooh_raw.index('Here is Edward Bear, coming downstairs now')
pooh_text = pooh_raw[starting_index:]

In [50]:
# Tally how often each individual character occurs in the text.
counter_characters = collections.Counter()
counter_characters.update(pooh_text)

In [51]:
# Show the ten most frequent characters together with their counts.
counter_characters.most_common(10)

[(' ', 24539),
 ('e', 11107),
 ('t', 9216),
 ('o', 8936),
 ('a', 7831),
 ('i', 6912),
 ('n', 6684),
 ('h', 6483),
 ('s', 5649),
 ('r', 4853)]

In [52]:
# Split the text on every single non-word character.
# The pattern is a raw string: '\W' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# NOTE: two adjacent separators (e.g. ", " or a blank line) produce an empty
# string between them, which is why '' appears as a token in the counts below.
pooh_words = re.split(pattern=r'\W', string=pooh_text)

In [53]:
# Count how many times each token produced by the split occurs.
counter_words = collections.Counter()
counter_words.update(pooh_words)

In [59]:
# Show the ten most frequent tokens together with their counts.
counter_words.most_common(n=10)

[('', 14793),
 ('and', 882),
 ('the', 853),
 ('to', 613),
 ('he', 572),
 ('a', 568),
 ('said', 553),
 ('it', 505),
 ('of', 487),
 ('I', 468)]

In [66]:
# English stop-word list from NLTK's "stopwords" corpus (a plain Python list).
# The corpus has to be available locally, e.g. via nltk.download('stopwords').
stopwords = nltk.corpus.stopwords.words('english')

In [93]:
# Set membership is O(1); `stopwords` itself is a list, which would be scanned
# from the start for every distinct word.
stopwords_lookup = set(stopwords)

# Word counts with stop words removed. The count yielded by .items() is reused
# directly instead of being looked up in `counter_words` a second time.
# NOTE: the comparison is case-sensitive, so capitalised forms such as 'I' or
# 'And' are kept even though their lower-case forms are stop words.
counter_words_nostop = collections.Counter({
    word: count
    for word, count in counter_words.items()
    if word not in stopwords_lookup
})

In [94]:
# Number of distinct tokens before and after removing stop words.
len(counter_words), len(counter_words_nostop)

(2905, 2765)

In [95]:
# Show the ten most frequent tokens that remain after stop-word removal.
counter_words_nostop.most_common(10)

[('', 14793),
 ('said', 553),
 ('I', 468),
 ('Pooh', 409),
 ('Piglet', 211),
 ('Robin', 156),
 ('Christopher', 154),
 ('And', 139),
 ('Eeyore', 117),
 ('Rabbit', 112)]

## Recap: TF-IDF

In [104]:
# Fetch the 20 newsgroups dataset (scikit-learn downloads it on first use and
# caches it locally afterwards).
news_raw = fetch_20newsgroups()

# Read the fields from `news_raw`, the object fetched above, so that this cell
# works on a fresh kernel.
news_data = news_raw.data                  # raw text of each post
news_target = news_raw.target              # integer class label of each post
news_target_names = news_raw.target_names  # label index -> newsgroup name

In [105]:
# Show the newsgroup (class) names.
news_target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [106]:
# Show the integer class labels, one per document.
news_target

array([7, 4, 4, ..., 3, 1, 8])

In [109]:
# Number of documents that were loaded.
len(news_data)

11314

In [110]:
# TF-IDF vectoriser over word n-grams of length 1 to 4.
tfidf = TfidfVectorizer(
    input="content",        # fit/transform receive the documents themselves
    analyzer="word",
    ngram_range=(1, 4),
    # Keep every term that occurs in at least one document. An integer
    # min_df must be >= 1 in current scikit-learn (0 fails parameter
    # validation); 1 keeps exactly the same vocabulary as 0 did.
    min_df=1,
    stop_words=stopwords,   # the NLTK English stop-word list defined earlier
    sublinear_tf=True,      # use 1 + log(tf) in place of the raw term frequency
)

In [111]:
# Learn the vocabulary and IDF weights from the corpus and build the
# document-term matrix in one pass (one row per document, one column per term).
tfidf_matrix = tfidf.fit_transform(news_data)

In [113]:
# Term (n-gram) for each column of the matrix; it is indexed below with
# column indices taken from `tfidf_matrix`.
feature_names = tfidf.get_feature_names_out()

In [114]:
doc = 0  # Change the index to view another document

# Column indices of the terms that have a non-zero weight in the chosen row.
feature_index = tfidf_matrix[doc, :].nonzero()[1]

# Pair each column index with the tf-idf weight stored at (doc, column).
tfidf_scores = zip(feature_index, (tfidf_matrix[doc, col] for col in feature_index))

# Print every term of the document next to its weight, one per line.
for i, s in tfidf_scores:
    w = feature_names[i]
    print(w, s)

il brought neighborhood lerxst 0.07219586632049384
thanks il brought neighborhood 0.07219586632049384
mail thanks il brought 0.07219586632049384
please mail thanks il 0.07219586632049384
car please mail thanks 0.07219586632049384
looking car please mail 0.06915949020149523
funky looking car please 0.06915949020149523
info funky looking car 0.06915949020149523
whatever info funky looking 0.06915949020149523
history whatever info funky 0.06915949020149523
made history whatever info 0.06915949020149523
car made history whatever 0.06915949020149523
production car made history 0.06915949020149523
years production car made 0.06915949020149523
specs years production car 0.06915949020149523
engine specs years production 0.06915949020149523
name engine specs years 0.06915949020149523
model name engine specs 0.06915949020149523
tellme model name engine 0.06915949020149523
anyone tellme model name 0.06915949020149523
know anyone tellme model 0.06915949020149523
body know anyone tellme 0.069159490