<a href="https://colab.research.google.com/github/kvamsi7/ML-portfolio/blob/main/text-summarizer/text_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# get the CNN stories dataset
!wget https://github.com/Manning-LP-What-s-The-News/Starter-Repository/releases/download/v0.1.0/cnn_stories.tgz

# untar the dataset
!tar -xvf cnn_stories.tgz

In [6]:
from tqdm.notebook import tqdm
import string
import os
import pickle
import pandas as pd
import numpy as np

In [3]:
def load_doc(path):
  with open(path,'r',encoding='utf-8') as obj:
    text = obj.read()
  return text

def split_story(doc):
  index = doc.find('@highlight')

  story, highlights = doc[:index], doc[index:].split('@highlight')

  highlights = [h.strip() for h in highlights if len(h) > 0]

  return story,highlights


def load_stories(directory):
  stories = list()

  for name in tqdm(os.listdir(directory_path)):
    file_name = directory_path + '/' + name

    # load doc
    doc = load_doc(file_name)

    # split the story and highlights
    story, highlights = split_story(doc)

    stories.append({'highlights':highlights, 'story':story})

  return stories

def save_data(data,file_name):

  with open(file_name,'wb') as file:
    pickle.dump(data,file)

  print(f"Data has been saved to {file_name}")

In [4]:
directory_path = 'cnn/stories'
stories = load_stories(directory_path)
data_path = 'stories_list.pkl'
print('Loaded Stories %d '% len(stories))
save_data(stories,data_path)

  0%|          | 0/92579 [00:00<?, ?it/s]

Loaded Stories 92579 
Data has been saved to stories_list.pkl


In [5]:
stories[0]

{'highlights': ['Xinhua: Three dead, three children and four others wounded',
  'Xinhua: Police find two-foot (60-centimeter) knife that Fang said he used',
  'Motive for attack not immediately known',
  'The 26-year-old man tells police he was responsible'],
 'story': "Beijing, China (CNN) -- Police have detained a man accused of having killed at least three kindergarten students Tuesday in east China's Zibo City, Shandong Province, the city government said Wednesday, according to the state-run Xinhua News Agency.\n\nFang Jiantang, 26, was detained hours after the attack and has told police he was responsible, said the government in a statement.\n\nIn addition to the three fatalities, three other children and four teachers were wounded, it said.\n\nTwo of the teachers were in serious condition, it said.\n\nPolice found the two-foot (60-centimeter) knife that Fang said he used, the statement said.\n\nThe motive for the attack was not immediately known.\n\nThe attack is one of at least 

In [7]:
# stories = load(open('/content/stories_list.pkl','rb'))
# print('Loaded Stories %d' % len(stories))

In [10]:
def preprocess_text(seq):
  cleaned_txt = []


  # prepare a translation table to remove punctuation
  table = str.maketrans('','',string.punctuation)
  for line in seq:
    # strip source cnn if it exists
    index = line.find('(CNN) --')
    if index > -1:
      line = line[index + len('(CNN) --'):]

      # tokenize on white space
      line = line.split()

      # convert to lower case and remove punctuations and tokens with numbers
      line = [word.lower().translate(table) for word in line if word.isalpha()]

      # store as string
      cleaned_txt.append(' '.join(line))

  cleaned = [c for c in cleaned_txt if len(c) > 0]
  return cleaned

In [11]:
for example in tqdm(stories):
  example['story'] = preprocess_text(example['story'].split('\n'))
  example['highlights'] = preprocess_text(example['highlights'])

  0%|          | 0/92579 [00:00<?, ?it/s]

# Extracting Summaries from the stories with ROUGE Score

In [13]:
# !pip install -q Rouge

In [14]:
from rouge import Rouge
roug = Rouge()

In [15]:
def get_rouge_f1(references, sentence):
  score_ls = []

  for ans in references:
    scores = roug.get_scores(ans,sentence)
    score_ls.append(scores[0]['rouge-1']['f'])

  return max(score_ls)

In [16]:
def get_list_ans_each_story(story,references):
  scr = []
  hyp = []


  for i in range(0, len(story)):
    hypothesis = story[i]
    score = get_rouge_f1(references, hypothesis)

    hyp.append(hypothesis)
    scr.append(score)

  hyp1 = np.array(hyp)

  # sort the scores to get the indices
  scr1 = np.array(scr)
  scr2 = np.sort(scr)[::-1]
  ind = np.argsort(scr)[::-1]

  # take top 5
  ind1 = ind[0:5]
  list_ref = list(hyp1[ind1])

  return list_ref, scr2[:5]

In [23]:
dict_id_summary = {}
dict_id_score = {}

for s_id in tqdm(range(0, len(stories))):
  # story input
  story = stories[s_id]['story']

  references = stories[s_id]['highlights']
  references = references if len(references) else 'eos'

  list_ref, list_score = get_list_ans_each_story(story, references)

  dict_id_summary[s_id] = list_ref

  dict_id_score[s_id] = list_score


  0%|          | 0/92579 [00:00<?, ?it/s]

store the dictionaries created above in .pkl format

In [25]:
story_id = []
label_sent = []
sent_id = []
list_sent = []


for i in tqdm(range(0, len(stories))):

  list_ref = dict_id_summary[i]

  for j,story in enumerate(stories[i]['story']):

    ind = int(story in list_ref)

    label_sent.append(ind)

    list_sent.append(story)

    sent_id.append(j)

    story_id.append(i)



  0%|          | 0/92579 [00:00<?, ?it/s]

In [26]:
df_story_summary = pd.DataFrame()

df_story_summary['story_id'] = story_id
df_story_summary['sent_id'] = sent_id
df_story_summary['sentence'] = list_sent
df_story_summary['label_sent'] = label_sent


df_story_summary.head()

Unnamed: 0,story_id,sent_id,sentence,label_sent
0,0,0,police have detained a man accused of having k...,1
1,2,0,many including president have become fans of t...,1
2,3,0,the same obama administration that is under fi...,1
3,4,0,both alike in in fair geneva where we lay our,1
4,5,0,president obama will deliver a eulogy on sunda...,1


In [27]:
df_story_summary.to_pickle('dataframe_extractive.pkl')