In [1]:
!pip install pandas numpy matplotlib requests bs4 nltk



In [2]:
from sklearn import __version__ as skv
from nltk import __version__ as nltkv
import json
import string
import time
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  


stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/john/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/john/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/john/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
print("Numpy Version:", np.__version__)
print("pandas Version:", pd.__version__)
print("sklearn Version:", skv)
print("NLTK Version:", nltkv)

Numpy Version: 1.19.4
pandas Version: 1.1.4
sklearn Version: 0.24.1
NLTK Version: 3.5


In [4]:
url = "https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Spider-Man"
content = json.loads(requests.get(url).text)
text = content["parse"]["text"]["*"]
soup = BeautifulSoup(text, 'html.parser')
texts = soup.findAll(text=True)
corpus = " ".join(t.strip() for t in texts)

In [5]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
words = tokenizer.tokenize(corpus)
words_use = [word for word in words if word not in stop_words]

In [6]:
tokens = nltk.pos_tag(words_use)

In [7]:
listnames = []
for group in tokens:
    if group[1] == "NNP":
        listnames.append(group[0])
Counter(listnames).most_common(10)
# doesnt show much

[('Spider', 832),
 ('Man', 831),
 ('Amazing', 166),
 ('Marvel', 161),
 ('Peter', 124),
 ('Parker', 122),
 ('Lee', 111),
 ('Retrieved', 96),
 ('Archived', 92),
 ('Stan', 83)]

In [8]:
# trying with bigram
bigrams = list(ngrams(tokens, 2))
bigramFreq = Counter(bigrams)

In [9]:
bigram_nnp = []
for pair in bigrams:
    if pair[0][1] == "NNP" and pair[1][1] == "NNP":
        bigram_nnp.append(pair)

In [10]:
bigram_nnp_freq = Counter(bigram_nnp)

In [11]:
bigram_nnp_freq.most_common(20)  # select over 20

[((('Spider', 'NNP'), ('Man', 'NNP')), 696),
 ((('Amazing', 'NNP'), ('Spider', 'NNP')), 137),
 ((('Peter', 'NNP'), ('Parker', 'NNP')), 61),
 ((('Stan', 'NNP'), ('Lee', 'NNP')), 55),
 ((('Iron', 'NNP'), ('Man', 'NNP')), 50),
 ((('Captain', 'NNP'), ('America', 'NNP')), 49),
 ((('New', 'NNP'), ('York', 'NNP')), 38),
 ((('Steve', 'NNP'), ('Ditko', 'NNP')), 31),
 ((('Green', 'NNP'), ('Goblin', 'NNP')), 31),
 ((('X', 'NNP'), ('Men', 'NNP')), 31),
 ((('Fantastic', 'NNP'), ('Four', 'NNP')), 27),
 ((('Norman', 'NNP'), ('Osborn', 'NNP')), 27),
 ((('Retrieved', 'NNP'), ('April', 'NNP')), 24),
 ((('Marvel', 'NNP'), ('Comics', 'NNP')), 23),
 ((('Gwen', 'NNP'), ('Stacy', 'NNP')), 23),
 ((('Doctor', 'NNP'), ('Octopus', 'NNP')), 20),
 ((('Dorling', 'NNP'), ('Kindersley', 'NNP')), 20),
 ((('John', 'NNP'), ('Romita', 'NNP')), 19),
 ((('Lee', 'NNP'), ('Stan', 'NNP')), 19),
 ((('Gilbert', 'NNP'), ('Laura', 'NNP')), 19)]

In [12]:
selected = []
for k, v in bigram_nnp_freq.most_common(20)[2:]:  # first two are spider
    if v >= 20:
        selected.append(k[0][0]+" "+k[1][0])
selected

['Peter Parker',
 'Stan Lee',
 'Iron Man',
 'Captain America',
 'New York',
 'Steve Ditko',
 'Green Goblin',
 'X Men',
 'Fantastic Four',
 'Norman Osborn',
 'Retrieved April',
 'Marvel Comics',
 'Gwen Stacy',
 'Doctor Octopus',
 'Dorling Kindersley']

In [13]:
def get_selected(name):
    url = "https://en.wikipedia.org/w/api.php?action=parse&format=json&page=" + \
        name.replace(" ", "_")+"&redirects"
    content = json.loads(requests.get(url).text)
    try:
        text = content["parse"]["text"]["*"]
        soup = BeautifulSoup(text, 'html.parser')
        texts = soup.findAll(text=True)
        corpus = " ".join(t.strip() for t in texts)
    except KeyError:
        corpus = ""
    return corpus

In [14]:
corpus_list = {}
for name in selected:
    corpus_list[name] = get_selected(name)
    time.sleep(1)

In [15]:
corpus_list

 'Stan Lee': 'American comic book writer, editor, publisher, and producer  This article is about the comics creator. For other uses, see Stan Lee (disambiguation) .    Stan Lee Lee during the Phoenix Comicon 2014 Born Stanley Martin Lieber ( 1922-12-28 ) December 28, 1922 New York City , U.S. Died November 12, 2018 (2018-11-12) (aged\xa095) Los Angeles, California , U.S. Area(s) Comic book writer editor publisher producer Collaborators Jack Kirby Steve Ditko John\xa0Romita\xa0Sr. Don\xa0Heck Bill\xa0Everett Joe\xa0Maneely Dick\xa0Ayers Awards  The Will Eisner Award Hall of Fame  Jack Kirby Hall of Fame  National Medal of Arts  Disney Legends  Spouse(s) Joan Boocock  \u200b  \u200b ( m. 1947; died\xa02017) \u200b Children Joan Celia Lee Jan Lee Signature therealstanlee .com   Military career Allegiance  USA Service/ branch  United States Army Years\xa0of service 1942-1945 Rank Sergeant Unit Signal Corps Battles/wars World War II  Stan Lee [1] (born Stanley Martin Lieber  / ˈ l iː b ər /

In [16]:
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


'''remove punctuation, lowercase, stem'''


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]

[nltk_data] Downloading package punkt to /home/john/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
all_cosine = {}
for k, v in corpus_list.items():
    all_cosine[k] = cosine_sim(v, corpus)



In [18]:
values_ordered = {k: round(v, 2) for k, v in sorted(
    all_cosine.items(), key=lambda item: item[1], reverse=True)}
values_ordered

{'Peter Parker': 1.0,
 'Norman Osborn': 0.78,
 'Doctor Octopus': 0.76,
 'Gwen Stacy': 0.7,
 'Green Goblin': 0.63,
 'Stan Lee': 0.58,
 'Steve Ditko': 0.57,
 'Fantastic Four': 0.54,
 'Captain America': 0.45,
 'Iron Man': 0.45,
 'Marvel Comics': 0.42,
 'X Men': 0.41,
 'Dorling Kindersley': 0.09,
 'New York': 0.09,
 'Retrieved April': 0.0}