## Import Packages

In [1]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import warnings
warnings.filterwarnings('ignore')

## Read the data 

In [2]:
filename = 'data/raw_da_qs.csv'

data_total = pd.read_csv(filename)

data_total.head()

Unnamed: 0,year,month,day,url,title,letterId,question_only
0,1985,1.0,1,proquest,WOMAN NEEDS HELP: HER BURDEN OF HOPELESSNESS I...,1,i have been in a bad marriage for 40 years. i ...
1,1985,1.0,1,proquest,WOMAN NEEDS HELP: HER BURDEN OF HOPELESSNESS I...,1,"this is for all newspaper carriers, mail carri..."
2,1985,1.0,2,proquest,LAMENT ABOUT OLD AGE AND SICKNESS IS ILL-ADVISED,1,our 16-year-old son recently was placed in a s...
3,1985,1.0,3,proquest,'NORMAL' WIDOW HAS AN EYE FOR THE GUYS,1,"i was a happy, respectable wife for 40 years, ..."
4,1985,1.0,4,proquest,IT'S WISE TO BE WARY WHEN A STRANGER COMES TO ...,1,you be the judge: last night about 7 p.m. i wa...


### Replace the month NAN with the median value

In [3]:
data_total.month.isna().sum(), np.isnan(data_total.month).median()

(3, 0.0)

In [4]:
def replace_month_nan(df = data_total):
    for i in range(len(df)):
        if np.isnan(df.month[i]):
            df.month[i] = np.isnan(df.month).median()
    return df

replace_month_nan()

data_total.month.isna().sum()

0

### Create the 'DATE' column

In [5]:
def get_date(df=data_total):
    df['date'] = ''*len(df)
    
    for row in range(len(df)):
        df['date'][row] = str(df['day'][row]) + '-' + str(df['month'][row]) + '-' + str(df['year'][row])
    
    return df[['url','question_only','date']]

data_total = get_date()

## Instantiate Model

In [6]:
data_total.head(3)

Unnamed: 0,url,question_only,date
0,proquest,i have been in a bad marriage for 40 years. i ...,01-1.0-1985
1,proquest,"this is for all newspaper carriers, mail carri...",01-1.0-1985
2,proquest,our 16-year-old son recently was placed in a s...,02-1.0-1985


In [7]:
#instantiate the model
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_bbc")
model = AutoModelForSeq2SeqLM.from_pretrained("google/roberta2roberta_L-24_bbc")

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


### Create summarization function

In [8]:
for i in range(2):
    text = data_total.question_only[i]
    print(text)
    
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    output_ids = model.generate(input_ids)[0]
    print("\t\t\t\t", tokenizer.decode(output_ids, skip_special_tokens=True, max_length=20))

i have been in a bad marriage for 40 years. i knew it was a mistake after the first year, but being a catholic, i accepted it as my cross. i bore eight children, hating every minute of it.
there was never enough money, so i started teaching school when my youngest was 4. i'm 60 now and feel like 85. i'm sick of marriage, sick of my family and sick of life in general.
suicide would cut off insurance and be hard on my family. is there a place where i can get some information on how to end my life and make it look like an accident?
every night i pray i won't wake up the next morning.
helpless in hell
				 " I want to end my life, I'm a mother, I'm a mother, I'm a mother, I'm a mother, I'm a mother. "
this is for all newspaper carriers, mail carriers and delivery people: when you see a dog, barking and growling in front of someone's house, turn right around and forget that house.
today, a delivery boy tried to get up to our front door when he was confronted by our dog on a chain, growling 

In [9]:
data_total = data_total.dropna()
data_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20034 entries, 0 to 20033
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            20034 non-null  object
 1   question_only  20034 non-null  object
 2   date           20034 non-null  object
dtypes: object(3)
memory usage: 626.1+ KB


In [25]:
import time 
from multiprocessing import Pool

data_total['question_summary'] = ''*len(data_total)

def summarize_questions(range_ab ,df = data_total):
    r1,r2 = range_ab[0],range_ab[1]
    if r2 is None :
        r2 = len(df)
    
    for row in range(r1,r2):
        input_ids = tokenizer.encode(df.question_only[row],
                                     return_tensors='pt', 
                                     padding=True, 
                                     truncation=True, 
                                     max_length=20, 
                                     add_special_tokens = True)
        output_ids = model.generate(input_ids)[0]
        res = tokenizer.decode(output_ids,skip_special_tokens=True)
        
        df['question_summary'][row] = res
        
        print(" Process %s waiting %s seconds" % (row, row+1))
        time.sleep(int(row+1))
        print(" Process %s Finished." % row)
    
    return df

In [27]:
a = list(range(0, 20100, 100))
b = list(range(100, 20100, 100))

In [28]:
ranges = list(zip(a, b))
ranges, len(ranges)

([(0, 100),
  (100, 200),
  (200, 300),
  (300, 400),
  (400, 500),
  (500, 600),
  (600, 700),
  (700, 800),
  (800, 900),
  (900, 1000),
  (1000, 1100),
  (1100, 1200),
  (1200, 1300),
  (1300, 1400),
  (1400, 1500),
  (1500, 1600),
  (1600, 1700),
  (1700, 1800),
  (1800, 1900),
  (1900, 2000),
  (2000, 2100),
  (2100, 2200),
  (2200, 2300),
  (2300, 2400),
  (2400, 2500),
  (2500, 2600),
  (2600, 2700),
  (2700, 2800),
  (2800, 2900),
  (2900, 3000),
  (3000, 3100),
  (3100, 3200),
  (3200, 3300),
  (3300, 3400),
  (3400, 3500),
  (3500, 3600),
  (3600, 3700),
  (3700, 3800),
  (3800, 3900),
  (3900, 4000),
  (4000, 4100),
  (4100, 4200),
  (4200, 4300),
  (4300, 4400),
  (4400, 4500),
  (4500, 4600),
  (4600, 4700),
  (4700, 4800),
  (4800, 4900),
  (4900, 5000),
  (5000, 5100),
  (5100, 5200),
  (5200, 5300),
  (5300, 5400),
  (5400, 5500),
  (5500, 5600),
  (5600, 5700),
  (5700, 5800),
  (5800, 5900),
  (5900, 6000),
  (6000, 6100),
  (6100, 6200),
  (6200, 6300),
  (6300, 6400

288 The Japanese government has announced it will ban the use of the word " moo " in the country's music and entertainment system.
 Process 288 waiting 289 seconds
 Process 206 Finished.
207 The BBC has been looking at the care system for the elderly in the US.
 Process 207 waiting 208 seconds


In [21]:
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

pool = ThreadPool()

with tqdm(total =) as pbar:
    for i in range(100):
        results = pool.map(summarize_questions, ranges)
    pool.close()
    pool.join()

  0%|          | 0/100 [00:00<?, ?it/s]

3200 I'm not sure who was involved in the battle of the Somme.
 Process 3200 waiting 3201 seconds
4800 Did you know that you have to be born in a different country to be a member of the public?
 Process 4800 waiting 4801 seconds
4000 The BBC has been looking at how I've been looking after your loved ones for more than a decade.
 Process 4000 waiting 4001 seconds
1600 It's not often that young people have to go through an accident.
 Process 1600 waiting 1601 seconds
800 One person has had to give up some of his Christmas gifts to try to get the people who have received them through through social media.
 Process 800 waiting 801 seconds
2400 It's that time of the year when people are worried about their children's health and what they want to do if they are killed.
 Process 2400 waiting 2401 seconds
0 " I don't know what I will do if I got married? "
 Process 0 waiting 1 seconds
 Process 0 Finished.
5600 If you're a fan of some of the world's most popular pop stars, then you'll know that

Process SpawnPoolWorker-553:
Process SpawnPoolWorker-510:
Process SpawnPoolWorker-512:
Process SpawnPoolWorker-509:
Process SpawnPoolWorker-552:
Process SpawnPoolWorker-511:
Process SpawnPoolWorker-625:
Process SpawnPoolWorker-588:
Process SpawnPoolWorker-615:
Process SpawnPoolWorker-590:
Process SpawnPoolWorker-589:
Process SpawnPoolWorker-521:
Process SpawnPoolWorker-525:
Process SpawnPoolWorker-513:
Process SpawnPoolWorker-514:
Process SpawnPoolWorker-628:
Process SpawnPoolWorker-617:
Process SpawnPoolWorker-616:
Process SpawnPoolWorker-591:
Process SpawnPoolWorker-526:
Process SpawnPoolWorker-527:
Process SpawnPoolWorker-624:
Process SpawnPoolWorker-592:
Process SpawnPoolWorker-535:
Process SpawnPoolWorker-536:
Process SpawnPoolWorker-533:
Process SpawnPoolWorker-515:
Process SpawnPoolWorker-614:
Process SpawnPoolWorker-626:
Process SpawnPoolWorker-555:
Process SpawnPoolWorker-520:
Process SpawnPoolWorker-534:
Process SpawnPoolWorker-554:
  0%|          | 0/100 [19:45:57<?, ?it/s]P

KeyboardInterrupt: 

 Process 205 Finished.


In [None]:

data_mod.head(3)