In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np


from nltk.cluster.util import cosine_distance
import networkx as nx

In [2]:
# Read the articles dataset from the data directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/articles.csv')
df.head(n=5)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


### Turn claps from str into int

In [3]:
def clean_up_claps(num):
    """Normalize a clap count such as ``'8.3K'`` or ``'935'`` to a digit string.

    A trailing ``K`` (either case) means "thousands": the numeric part is
    multiplied by 1000. The value is scaled numerically rather than by string
    surgery, so counts without a decimal part (``'8K'`` -> ``'8000'``) or with
    more than one decimal digit (``'1.25K'`` -> ``'1250'``) are also correct.

    Parameters
    ----------
    num : str or number
        Raw clap count. Non-string values are converted with ``str`` first,
        so already-converted integer counts pass through unchanged.

    Returns
    -------
    str
        The count as a string, ready for ``astype(int)``. Values without a
        ``K`` suffix are returned as-is (after stripping whitespace).

    Raises
    ------
    ValueError
        If the text before the ``K`` suffix is not a valid number.
    """
    num = str(num).strip()
    if num[-1:].upper() == 'K':
        # round() absorbs binary floating point noise (e.g. 8.3 * 1000).
        return str(int(round(float(num[:-1]) * 1000)))
    return num
# Normalize every clap value, then store the column with an integer dtype.
df['claps'] = df['claps'].apply(clean_up_claps).astype(int)

In [4]:
# Drop the link and author columns, then preview the remaining ones.
df = df.drop(columns=['link', 'author'])
df.head(n=5)

Unnamed: 0,claps,reading_time,title,text
0,8300,11,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,1400,7,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,2800,11,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,1300,7,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,935,11,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


# Clean up text

In [15]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re

# Every ASCII punctuation character as its own list element.
punct = [char for char in string.punctuation]
# English stopword list from the NLTK stopwords corpus.
sw = stopwords.words('english')

def pos_replace(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

def bare_text(text):
    """Reduce a text to its lowercase, lemmatized content words.

    Steps: replace line breaks with spaces, lowercase, insert a missing space
    after ``. , ? !``, tokenize, drop tokens found in ``sw``, lemmatize each
    token using the WordNet POS derived from its tag, drop punctuation
    tokens, and detokenize back into one string.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text as a single string.
    """
    # Replace line breaks with a space rather than nothing; otherwise the last
    # word of one line is fused with the first word of the next line.
    text = text.replace('\n', ' ')
    text = text.lower()
    #Adds spaces where they are missing after punctuation
    text = re.sub(r'(?<=[.,\?!])(?=[^\s])', r' ', text)
    #Tokenize text and get rid of stopwords (set lookup instead of list scan)
    stop_set = set(sw)
    tokens = [w for w in word_tokenize(text) if w not in stop_set]
    #Lemmatize text, using the WordNet POS that corresponds to each tag
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos_replace(tag))
              for word, tag in pos_tag(tokens)]
    #Get rid of punctuation, including special marks not in string.punctuation
    unwanted = set(punct) | {"’", "-", "‘"}
    lemmas = [w for w in lemmas if w not in unwanted]
    return TreebankWordDetokenizer().detokenize(lemmas)

def clean_sentences(text):
    """Split a text into a list of lowercase sentences.

    Steps: replace line breaks with spaces, remove ``.com`` links, insert a
    missing space after ``. , ? ! :``, lowercase, and split into sentences
    with ``sent_tokenize``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        The sentences, lowercased, punctuation kept.
    """
    # Replace line breaks with a space rather than nothing; otherwise the last
    # word of one line is fused with the first word of the next line.
    text = text.replace('\n', ' ')
    #Get rid of links: optional scheme, any number of dotted labels, then .com.
    #Case-insensitive because lowercasing only happens further down; the
    #trailing \b keeps "sentence.Computers" from being cut at ".Com".
    text = re.sub(r'(?:https?://)?\b(?:[a-z0-9-]+\.)+com\b', '', text,
                  flags=re.IGNORECASE)
    #Add space after punctuation if its not there
    text = re.sub(r'(?<=[.,\?!:])(?=[^\s])', r' ', text)
    text = text.lower()
    # The former "strip punctuation" statement was removed: its result was
    # discarded, so it never changed the text, and sentence splitting below
    # relies on the punctuation being present.
    return sent_tokenize(text)

# Similarity Matrix

In [6]:
def sent_sim(sent1, sent2, stopwords = None):
    """Cosine similarity between the token-count vectors of two sentences.

    Parameters
    ----------
    sent1, sent2 : iterable of str
        The tokens of each sentence. Each element is lowercased and counted,
        so a raw string is compared character by character; tokenize first
        to compare words.
    stopwords : collection of str, optional
        Tokens to leave out of the count vectors. Defaults to none.

    Returns
    -------
    float
        The cosine of the angle between the two count vectors. ``0.0`` is
        returned when either vector is all zeros (empty sentence or only
        stopwords), where the cosine is undefined and would otherwise be
        ``nan``.
    """
    if stopwords is None:
        stopwords = []
    ignored = set(stopwords)

    tokens1 = [w.lower() for w in sent1]
    tokens2 = [w.lower() for w in sent2]

    # One vector slot per distinct token; a dict lookup replaces the linear
    # list.index() scan that was done for every token.
    position = {w: i for i, w in enumerate(set(tokens1 + tokens2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    # build the vector for the first sentence
    for w in tokens1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in tokens2:
        if w not in ignored:
            vector2[position[w]] += 1

    # The cosine is computed directly so the zero-norm case can be guarded.
    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))
    if norm_product == 0:
        return 0.0
    return float(np.dot(vector1, vector2) / norm_product)

In [7]:
def sim_matrix(sent, stop_words = sw):
    """Build the pairwise similarity matrix for a list of sentences.

    Entry ``[i][j]`` holds ``sent_sim(sent[i], sent[j], stop_words)`` for
    every ``i != j``; the diagonal stays at zero. ``stop_words`` defaults to
    the module-level ``sw`` list as it was when this function was defined.

    Returns a square numpy array with one row and column per sentence.
    """
    count = len(sent)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sent):
        for col, right in enumerate(sent):
            # A sentence is not scored against itself.
            if row != col:
                matrix[row, col] = sent_sim(left, right, stop_words)
    return matrix

# Summarize Text By Hand - Extractive Approach

In [8]:
def generate_summary(article, top_n = 3):
    """Extractive summary: the ``top_n`` highest-ranked sentences of a text.

    The article is split into sentences with ``clean_sentences``, a pairwise
    similarity matrix is built from the word tokens of each sentence, the
    matrix is turned into a graph, and the sentences are ranked by their
    PageRank score in that graph.

    Parameters
    ----------
    article : str
        Raw article text.
    top_n : int, optional
        Maximum number of sentences in the summary. If the article has fewer
        sentences, all of them are used.

    Returns
    -------
    str
        The selected sentences, highest score first, joined by spaces.
        An empty string if the article yields no sentences.
    """
    sentences = clean_sentences(article)
    if not sentences:
        return ""
    # sent_sim iterates over its arguments element by element, so each
    # sentence must be a list of word tokens; passing the raw strings would
    # compare individual characters instead of words.
    tokenized_sentences = [word_tokenize(s) for s in sentences]
    #Find similar sentences
    sentence_sim_matrix = sim_matrix(tokenized_sentences)
    sentence_sim_graph = nx.from_numpy_array(sentence_sim_matrix)
    scores = nx.pagerank(sentence_sim_graph)
    #Rank similarity and find summary sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)
    # Slicing (instead of indexing 0..top_n-1) tolerates articles with fewer
    # than top_n sentences; max() keeps a negative top_n selecting nothing.
    summarize_text = [s for _, s in ranked_sentence[:max(top_n, 0)]]
    return " ".join(summarize_text)

# Built-in Gensim Summarizer

In [20]:
from gensim.summarization.summarizer import summarize

def generate_summary_genism(article, word_count = 250):
    """Summarize an article with gensim's ``summarize``.

    The article is first split by ``clean_sentences``; the sentences are
    re-joined with single spaces and handed to ``summarize`` with the given
    ``word_count``. Line breaks in the result are replaced by spaces.

    NOTE(review): ``gensim.summarization`` appears to have been removed in
    gensim 4.0 - confirm the pinned gensim version.
    """
    cleaned_text = " ".join(clean_sentences(article))
    return summarize(cleaned_text, word_count = word_count).replace('\n',' ')

In [21]:
# Run the gensim-based summarizer on the text of the row labelled 1.
test = df.loc[1, 'text']
generate_summary_genism(test)

'in the interest of solidifying my understanding of these concepts once and for all and saving you guys a couple of stackoverflow searches, here’s the stuff that i’m always forgetting when working with python, numpy, and pandas. writing out a for loop every time you need to define some sort of list is tedious, luckily python has a built-in way to address this problem in just one line of code. for creating quick and easy numpy arrays, look no further than the arange and linspace functions. each one has their specific purpose, but the appeal here (instead of using range), is that they output numpy arrays, which are typically easier to work with for data science. so given a starting and stopping point, as well as a number of values, linspace will evenly space them out for you in a numpy array. let’s use the example of dropping a column for now: i don’t know how many times i wrote this line of code before i actually knew why i was declaring axis what i was. if you think about how this is i