In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np


from nltk.cluster.util import cosine_distance
import networkx as nx

In [2]:
df = pd.read_csv('../data/articles.csv')
df.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


### Turn claps from str into int

In [3]:
def clean_up_claps(num):
    if num[-1] == 'K':
        num = num[:-1]
        num += '00'
        num = num.replace('.', '') 
    return num
df.claps = df.claps.apply(clean_up_claps)
df.claps = df.claps.astype(int)

In [4]:
#Get rid of link column
df = df.drop(labels = ['link', 'author'], axis = 1)
df.head()

Unnamed: 0,claps,reading_time,title,text
0,8300,11,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,1400,7,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,2800,11,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,1300,7,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,935,11,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


# Clean up text

In [5]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re

punct = list(string.punctuation)
sw = stopwords.words('english')

def pos_replace(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def bare_text(text):
    text = text.replace('\n','')
    text = text.lower()
    #Adds spaces where they are missing after punctuation
    text = re.sub(r'(?<=[.,\?!])(?=[^\s])', r' ', text)
    #Tokenize text
    text_token = word_tokenize(text)
    #Get rid of stopwords
    text_token = [w for w in text_token if w.lower() not in sw]
    #Lemmatize text
    text_token = pos_tag(text_token)
    text_token = [(w[0], pos_replace(w[1])) for w in text_token]
    lemmatizer = WordNetLemmatizer() 
    text_token = [lemmatizer.lemmatize(word[0], word[1]) for word in text_token]
    #Get rid of punctuation
    text_token = [w for w in text_token if w not in punct]
    #Special punctuation marks not included in original list
    text_token = [w for w in text_token if w not in ["’", "-", "‘"]]
    text = TreebankWordDetokenizer().detokenize(text_token)
    return text

def word_tokens(text):
    text = text.replace('\n','')
    text = text.lower()
    #Adds spaces where they are missing after punctuation
    text = re.sub(r'(?<=[.,\?!])(?=[^\s])', r' ', text)
    #Tokenize text
    text_token = word_tokenize(text)
    #Get rid of stopwords
    text_token = [w for w in text_token if w.lower() not in sw]
    #Lemmatize text
    text_token = pos_tag(text_token)
    text_token = [(w[0], pos_replace(w[1])) for w in text_token]
    lemmatizer = WordNetLemmatizer() 
    text_token = [lemmatizer.lemmatize(word[0], word[1]) for word in text_token]
    #Get rid of punctuation
    text_token = [w for w in text_token if w not in punct]
    #Special punctuation marks not included in original list
    text_token = [w for w in text_token if w not in ["’", "-", "‘"]]
    return text_token

def clean_sentences(text):
    text = text.replace('\n','')
    #Get rid of links
    text = re.sub(r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)', '', text)
    #Add space after punctuation if its not there
    text = re.sub(r'(?<=[.,\?!:])(?=[^\s])', r' ', text)
    text = text.lower()
    #Get rid of punctuation
    text.replace("[^a-zA-Z]", " ").split(" ")
    sent = sent_tokenize(text)
    return sent

# Similarity Matrix

In [6]:
def sent_sim(sent1, sent2, stopwords = None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [16]:
from math import*
 
def sent_sim_jaccard(sent1, sent2, stopwords = None):
    if stopwords is None:
        stopwords = []
        
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
    intersection = len(set.intersection(*[set(vector1), set(vector2)]))
    union = len(set.union(*[set(vector1), set(vector2)]))
    return intersection/float(union)

In [7]:
def sim_matrix(sent, stop_words = sw):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sent), len(sent)))
 
    for ind1 in range(len(sent)):
        for ind2 in range(len(sent)):
            if ind1 == ind2:
                continue 
            similarity_matrix[ind1][ind2] = sent_sim(sent[ind1], sent[ind2], stop_words)
    return similarity_matrix

In [11]:
def sim_matrix_j(sent, stop_words = sw):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sent), len(sent)))
 
    for ind1 in range(len(sent)):
        for ind2 in range(len(sent)):
            if ind1 == ind2:
                continue 
            similarity_matrix[ind1][ind2] = sent_sim_jaccard(sent[ind1], sent[ind2], stop_words)
    return similarity_matrix

# Summarize Text By Hand - Extractive Approach

In [12]:
def generate_summary(article, top_n = 3):
    summarize_text = []
    sentences =  clean_sentences(article)
    #Find similar sentences
    sentence_sim_martix = sim_matrix(sentences)
    sentence_sim_graph = nx.from_numpy_array(sentence_sim_martix)
    scores = nx.pagerank(sentence_sim_graph)
    #Rank similarity and find summary sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)    
    for i in range(top_n):
        summarize_text.append(ranked_sentence[i][1])
    summary = " ".join(summarize_text)
    return summary

In [13]:
test = df.text[1]
generate_summary(test)

'check out the simple example below and the upcoming video to get a better feel for the power of lambda functions: once you have a grasp on lambda functions, learning to pair them with the map and filter functions can be a powerful tool. the filter function takes in a list and a rule, much like map, however it returns a subset of the original list by comparing each element against the boolean filtering rule. personally, i find myself pulling code from similar discussion threads several times, rather than taking the time to learn and solidify the concept so that i can reproduce the code myself the next time.'

In [17]:
def generate_summary_jaccard(article, top_n = 3):
    summarize_text = []
    sentences =  clean_sentences(article)
    #Find similar sentences
    sentence_sim_martix = sim_matrix_j(sentences)
    sentence_sim_graph = nx.from_numpy_array(sentence_sim_martix)
    scores = nx.pagerank(sentence_sim_graph)
    #Rank similarity and find summary sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)    
    for i in range(top_n):
        summarize_text.append(ranked_sentence[i][1])
    summary = " ".join(summarize_text)
    return summary

In [18]:
generate_summary_jaccard(test)

'basically, they let you create a function, without creating a function. note that the stopping point is a ‘cut-off’ value, so it will not be included in the array output. think of apply as a map function, but made for pandas dataframes or more specifically, for series.'

# Bert

In [9]:
# from summarizer import Summarizer,TransformerSummarizer
# def generate_summary(article, top_n = 3):
#     bert_model = Summarizer()
#     bert_summary = ''.join(bert_model(article, min_length=60))

# Spacy Summarizer

In [24]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

def generate_summary_spacy(article, n_sent = 3):
    nlp = spacy.load('en_core_web_sm')
    text = nlp(article)
    tokens = [token.text for token in text]
    word_frequencies = {}
    for word in text:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
                    
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / max_frequency
        
    sentence_tokens= [sent for sent in text.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]
                    
    summary = nlargest(n_sent, sentence_scores, key = sentence_scores.get)
    final_summary = [word.text for word in summary]
    summary = ''.join(final_summary)
    summary = summary.replace('\n','')
    summary = re.sub(r'(?<=[.,\?!:])(?=[^\s])', r' ', summary)
    return summary

In [7]:
test = df.text[1]
generate_summary_spacy(test, 3)

NameError: name 'generate_summary_spacy' is not defined

# Sumy - LSA

In [21]:
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

def generate_summary_lsa(article):
    parser = PlaintextParser.from_string(article, Tokenizer('english'))
    lsa = LsaSummarizer()
    lsa_summary = lsa(parser.document, 3)
    summary = ''
    for s in lsa_summary: 
        if summary == '':
            summary = str(s)
        else:
            summary = summary + ' ' + str(s)
    return summary

In [22]:
generate_summary_lsa(test)

'Each one has their specific purpose, but the appeal here (instead of using range), is that they output NumPy arrays, which are typically easier to work with for data science. You might imagine how useful this can be, especially for formatting and manipulating values across a whole DataFrame column, without having to loop at all. I hope a couple of these overviews have effectively jogged your memory regarding important yet somewhat tricky methods, functions, and concepts you frequently encounter when using Python for data science.'

# Sumy - LexRank

In [23]:
from sumy.summarizers.lex_rank import LexRankSummarizer

def generate_summary_lexrank(article):
    parser = PlaintextParser.from_string(article, Tokenizer('english'))
    lex_rank = LexRankSummarizer() 
    lex_summary = lex_rank(parser.document, 3)
    summary = ''
    for s in lex_summary: 
        if summary == '':
            summary = str(s)
        else:
            summary = summary + ' ' + str(s)
    return summary

In [24]:
generate_summary_lexrank(test)

'So given a starting and stopping point, as well as a number of values, linspace will evenly space them out for you in a NumPy array. Let’s use the example of dropping a column for now: I don’t know how many times I wrote this line of code before I actually knew why I was declaring axis what I was. If you think about how this is indexed in Python, rows are at 0 and columns are at 1, much like how we declare our axis value.'

# Adds summary column to df

In [26]:
summaries = []
for article in df.text:
    summ = generate_summary_spacy(article)
    summaries.append(summ)

In [29]:
df['summary'] = summaries

In [33]:
df.to_csv('../data/with_summary', index = False)