In [19]:
from bs4 import BeautifulSoup
import certifi
import urllib3
import requests
import nltk

In [3]:
httppage = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
response = httppage.request('GET', 'https://en.wikipedia.org/wiki/Data_science')
# response = requests.post('https://en.wikipedia.org/wiki/Software_engineer')
soup = BeautifulSoup(response.data, 'html.parser')

In [4]:
text = ' '.join(map(lambda x: x.text, soup.find_all(name = 'p'))) #only look for 'p' tag

In [5]:
text

'\n Data science is a multi-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.[1][2] Data science is the same concept as data mining and big data: "use the most powerful hardware, the most powerful programming systems, and the most efficient algorithms to solve problems".[3]\n Data science is a "concept to unify statistics, data analysis, machine learning and their related methods" in order to "understand and analyze actual phenomena" with data.[4] It employs techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, and information science. Turing award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge.[5][6] In 2015, the Amer

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize
sents = sent_tokenize(text)
print(sents)

['\n Data science is a multi-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.', '[1][2] Data science is the same concept as data mining and big data: "use the most powerful hardware, the most powerful programming systems, and the most efficient algorithms to solve problems".', '[3]\n Data science is a "concept to unify statistics, data analysis, machine learning and their related methods" in order to "understand and analyze actual phenomena" with data.', '[4] It employs techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, and information science.', 'Turing award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge.', '[5][

# Stopwords

In [7]:
from string import punctuation
from nltk.corpus import stopwords
stopword_ = stopwords.words('english') + list(punctuation) + ["'s"]
print(stopword_)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Part of Speech

In [54]:
from nltk.corpus import wordnet
def get_part_of_speech(word):
    pos = nltk.pos_tag([word])
    if pos[0][1].startswith('J'):
        return wordnet.ADJ
    elif pos[0][1].startswith('R'):
        return wordnet.ADV
    elif pos[0][1].startswith('V'):
        return wordnet.VERB
    else:
        return wordnet.NOUN

# Lemmatization

In [55]:
from nltk.stem import WordNetLemmatizer
words_clean = [word.lower() for word in word_tokenize(text) if word.lower() not in stopword_]
# print(words_clean)
lemmatizer = WordNetLemmatizer()
lemmatizerwords = [lemmatizer.lemmatize(word, get_part_of_speech(word)) for word in words_clean]
print(lemmatizerwords)

['data', 'science', 'multi-disciplinary', 'field', 'us', 'scientific', 'method', 'process', 'algorithm', 'system', 'extract', 'knowledge', 'insight', 'structure', 'unstructured', 'data', '1', '2', 'data', 'science', 'concept', 'data', 'mining', 'big', 'data', '``', 'use', 'powerful', 'hardware', 'powerful', 'program', 'system', 'efficient', 'algorithm', 'solve', 'problem', "''", '3', 'data', 'science', '``', 'concept', 'unify', 'statistic', 'data', 'analysis', 'machine', 'learn', 'related', 'method', "''", 'order', '``', 'understand', 'analyze', 'actual', 'phenomenon', "''", 'data', '4', 'employ', 'technique', 'theory', 'drawn', 'many', 'field', 'within', 'context', 'mathematics', 'statistic', 'computer', 'science', 'information', 'science', 'turing', 'award', 'winner', 'jim', 'gray', 'imagine', 'data', 'science', '``', 'fourth', 'paradigm', "''", 'science', 'empirical', 'theoretical', 'computational', 'data-driven', 'assert', '``', 'everything', 'science', 'change', 'impact', 'informa

In [56]:
from nltk.probability import FreqDist
freqdist = FreqDist(lemmatizerwords)
freqdist

FreqDist({'data': 106, 'science': 73, '``': 45, "''": 31, 'statistic': 17, 'term': 15, 'statistical': 12, 'scientist': 12, 'field': 10, 'big': 9, ...})

# Bigram - try to look for connection between two words e.g. Data Science

In [57]:
from nltk.collocations import BigramCollocationFinder
from collections import defaultdict
ranking = defaultdict(int)
finder = BigramCollocationFinder.from_words(lemmatizerwords) #frequency distribution
for i, sent in enumerate(sents):
    lemma = [lemmatizer.lemmatize(word, get_part_of_speech(word)) for word in word_tokenize(sent.lower())]
    bigrams = BigramCollocationFinder.from_words(lemma)
    for bigram in bigrams.ngram_fd:
#         print(bigram)
        if bigram in finder.ngram_fd:
            ranking[i] += finder.ngram_fd[bigram]
        

# Top 5 sentences

In [58]:
from heapq import nlargest
result = nlargest(5, ranking,key=ranking.get)
result

[6, 26, 49, 45, 13]

# Summary

In [59]:
import re
summary = '  '
for i in sorted(result):
    new_sent = re.sub(r"\[.+\]", '', sents[i])
    summary += new_sent.replace('\n', ' ').strip(' ')
print('\t\t\t\t\t[Summary about Data Science on Wikipedia]\n')
print(summary)

					[Summary about Data Science on Wikipedia]

  In 2012, when Harvard Business Review called it "The Sexiest Job of the 21st Century", the term "data science" became a buzzword.The term "data science" has appeared in various contexts over the past thirty years but did not become an established term until recently.In his conclusion, he initiated the modern, non-computer science, usage of the term "data science" and advocated that statistics be renamed data science and statisticians data scientists.In 2014, the American Statistical Association section on Statistical Learning and Data Mining renamed its journal to "Statistical Analysis and Data Mining: The ASA Data Science Journal" and in 2016 changed its section name to "Statistical Learning and Data Science".However, many critical academics and journalists see no distinction between data science and statistics, whereas others consider it largely a popular term for "data mining" and "big data".
