### References
https://derwen.ai/docs/ptr/explain_summ/

In [1]:
import pandas as pd

In [17]:
df = pd.read_csv('cleaned_transcripts_v2.csv').drop('Unnamed: 0', axis=1)
df = df[df["duration"] > 300]
df.head()

Unnamed: 0,title,link,description,duration,podcast_id,transcript,cleaned_transcripts,cleaned_description
0,Letter of Unhappiness,https://www.npr.org/2022/03/30/1089668294/lett...,When Naira calls her parents back home in Russ...,950.0,1089668294,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,When Naira calls her parents back home in Russ...
1,The Culture Front,https://www.npr.org/2022/03/15/1086679997/the-...,When protecting a language is used as justific...,2436.0,1086679997,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,When protecting a language is used as justific...
2,Fighting Words In Ukraine,https://www.npr.org/2022/03/02/1083960666/figh...,Vladimir Putin joined the KGB at age 23. Ukrai...,1883.0,1083960666,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,Vladimir Putin joined the KGB at age 23. Ukrai...
3,May We Have This Dance?,https://www.npr.org/2021/12/22/1066965712/may-...,A jazz dance born in Harlem in the 1920s ends ...,2630.0,1066965712,"[<div aria-label=""Transcript"" class=""transcrip...",This is ROUGH TRANSLATION from NPR. I grew u...,A jazz dance born in Harlem in the 1920s ends ...
4,Moms In Translation,https://www.npr.org/2021/12/14/1064250980/moms...,An Irish journalist discovers she belongs in a...,2519.0,1064250980,"[<div aria-label=""Transcript"" class=""transcrip...",Hey. It's Gregory from ROUGH TRANSLATION. Ju...,An Irish journalist discovers she belongs in a...


In [19]:
#drop nulls in df
df = df[df['cleaned_description'].notna()]
df.reset_index(drop=True, inplace=True)

In [24]:
!pip install pytextrank
!spacy download en_core_web_sm

Collecting pytextrank
  Downloading pytextrank-3.2.3-py3-none-any.whl (30 kB)
Collecting graphviz>=0.13
  Downloading graphviz-0.19.1-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting icecream>=2.1
  Downloading icecream-2.1.2-py2.py3-none-any.whl (8.3 kB)
Collecting networkx[default]>=2.6
  Downloading networkx-2.7.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)


Installing collected packages: networkx, graphviz, colorama, icecream, pytextrank
Successfully installed colorama-0.4.4 graphviz-0.19.1 icecream-2.1.2 networkx-2.7.1 pytextrank-3.2.3
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
#write loop to add textrank to df
import spacy
import pytextrank
import scipy
from math import sqrt
from operator import itemgetter
import numpy as np
from rouge_score import rouge_scorer
import time
def run_textrank(df, lim_phrases, target_summ_length, summary_list, rouge_list, ranks_list):
    '''
    Fills 2 lists with textrank summaries and rouge scores.
    Args:
    - df with column ‘cleaned_transcripts’ as text and ‘cleaned_description’ as summary label
    - limit_phrases: number of top phrases to include in unit vector
    - target_summ_length: number of sentences to include in extractive summary
    - summary_list: an empty list to add summaries to
    - rouge_list: an empty list to add rouge scores to
    - ranks_list
    '''
    starttime = time.time()
    for i in range(len(df)):
        # example text
        text = df['cleaned_transcripts'][i]
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("textrank")
        doc = nlp(text)
        #Define sentence bounds
        sent_bounds = [ [s.start, s.end, set([])] for s in doc.sents ]
        #Create unit vector with phrase
        limit_phrases = lim_phrases
        phrase_id = 0
        unit_vector = []
        for p in doc._.phrases:
            unit_vector.append(p.rank)
            for chunk in p.chunks:
                for sent_start, sent_end, sent_vector in sent_bounds:
                    if chunk.start >= sent_start and chunk.end <= sent_end:
                        sent_vector.add(phrase_id)
                        break
            phrase_id += 1
            if phrase_id == limit_phrases:
                break
        #rank sentences based on euclidean distance from unit vector
        sent_rank = {}
        sent_id = 0
        for sent_start, sent_end, sent_vector in sent_bounds:
            sum_sq = 0.0
            for phrase_id in range(len(unit_vector)):
                if phrase_id not in sent_vector:
                    sum_sq += unit_vector[phrase_id]**2.0
            sent_rank[sent_id] = sqrt(sum_sq)
            sent_id += 1
        #sort sentences in order of least distance from unit vector
        ranks = sorted(sent_rank.items(), key=itemgetter(1))
        ranks_list += [ranks[:target_summ_length]]
        #get most important sentences in transcript for summary and record indices
        limit_sentences = target_summ_length
        indices = [i[0] for i in ranks[:limit_sentences]]
        #sort indices
        sorted_indices = list(np.sort(indices))
        sorted_indices
        #save sentences in dictionary
        sent_text = {}
        sent_id = 0
        for sent in doc.sents:
            sent_text[sent_id] = sent.text
            sent_id += 1
        num_sent = 0
        for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1)):
            #ic(sent_id, sent_text[sent_id])
            num_sent += 1
            if num_sent == limit_sentences:
                break
        #format generated summary
        final_summ = [sent_text[j] for j in sorted_indices]
        final_summ = ' '.join(final_summ)
        #add summary to list
        summary_list += [final_summ]
        #define reference for rouge evaluation
        reference = df['cleaned_description'][i]
        #calculate rouge scores
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2','rougeLsum'], use_stemmer=True)
        scores = scorer.score(final_summ, reference)
        rouge_list += [scores]
    endtime = time.time()
    print (endtime - starttime)

In [26]:
ext_summaries = []
ext_rouge = []
ext_ranks20 = []

run_textrank(df, 4, 20, ext_summaries, ext_rouge, ext_ranks20)

5097.180573940277


In [27]:
len(ext_summaries)
len(ext_rouge)

4180

In [None]:
in_top15 = []
for i in range(len(ext_ranks15)):
    switch = 0
    for j in range(15):
        if ext_ranks15[i][j][0] == 0 or ext_ranks15[i][j][0] ==1:
            switch = 1
        else:
            pass
    if switch == 1:
        in_top15 += [i]

In [None]:
len(in_top15)

In [28]:
df['ext_summaries'] = ext_summaries
df['ext_rouge'] = ext_rouge

In [29]:
# df.to_csv('extractive_summaries_v2.csv')

In [30]:
df.head()

Unnamed: 0,title,link,description,duration,podcast_id,transcript,cleaned_transcripts,cleaned_description,ext_summaries,ext_rouge
0,Letter of Unhappiness,https://www.npr.org/2022/03/30/1089668294/lett...,When Naira calls her parents back home in Russ...,950.0,1089668294,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,When Naira calls her parents back home in Russ...,You're listening to ROUGH TRANSLATION from N...,"{'rouge1': (0.6585365853658537, 0.095406360424..."
1,The Culture Front,https://www.npr.org/2022/03/15/1086679997/the-...,When protecting a language is used as justific...,2436.0,1086679997,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,When protecting a language is used as justific...,You're listening to ROUGH TRANSLATION from N...,"{'rouge1': (0.6944444444444444, 0.062189054726..."
2,Fighting Words In Ukraine,https://www.npr.org/2022/03/02/1083960666/figh...,Vladimir Putin joined the KGB at age 23. Ukrai...,1883.0,1083960666,"[<div aria-label=""Transcript"" class=""transcrip...",You're listening to ROUGH TRANSLATION from N...,Vladimir Putin joined the KGB at age 23. Ukrai...,"So watching the news out of Ukraine this week,...","{'rouge1': (0.625, 0.056818181818181816, 0.104..."
3,May We Have This Dance?,https://www.npr.org/2021/12/22/1066965712/may-...,A jazz dance born in Harlem in the 1920s ends ...,2630.0,1066965712,"[<div aria-label=""Transcript"" class=""transcrip...",This is ROUGH TRANSLATION from NPR. I grew u...,A jazz dance born in Harlem in the 1920s ends ...,"It was called Lindy Hop. LaTasha Barnes, who f...","{'rouge1': (0.5714285714285714, 0.042895442359..."
4,Moms In Translation,https://www.npr.org/2021/12/14/1064250980/moms...,An Irish journalist discovers she belongs in a...,2519.0,1064250980,"[<div aria-label=""Transcript"" class=""transcrip...",Hey. It's Gregory from ROUGH TRANSLATION. Ju...,An Irish journalist discovers she belongs in a...,Hey. It's Gregory from ROUGH TRANSLATION. In...,"{'rouge1': (0.5483870967741935, 0.057432432432..."
