In [1]:
import re
import requests
from bs4 import BeautifulSoup
import os
from tqdm.notebook import tqdm
from dateutil import parser
import jsonlines
import datetime
import pandas as pd
import warnings
import string
import unicodedata
import pdfminer
import glob
warnings.simplefilter("ignore")
pd.set_option('max_colwidth', 350)


# Translation table that deletes every ASCII punctuation character when
# passed to ``str.translate``.
transtable = str.maketrans('', '', string.punctuation)

def to_string(s):
    """
    Convert ``s`` to text.

    :param s: any object.
    :return: ``str(s)``; if that conversion raises an ordinary exception,
        the result of ``s.encode('utf-8')`` instead (note: that fallback
        yields ``bytes``, not ``str``).
    """
    try:
        return str(s)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed by the fallback.
        return s.encode('utf-8')


def unicodeToAscii(s):
    """
    Strip combining marks from a string.

    The input is decomposed with NFD normalisation and every character
    whose Unicode category is 'Mn' (non-spacing mark) is dropped.

    input: string
    output: string with the 'Mn' characters removed
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Cap the displayed width of DataFrame cells at 100 characters.
pd.set_option('max_colwidth', 100)

# Date of this run as YYYY-MM-DD, echoed into the cell output.
todaysdate = format(datetime.datetime.today(), '%Y-%m-%d')
print(todaysdate)

# Translation table mapping every ASCII punctuation code point to None,
# i.e. ``str.translate`` deletes those characters.
transtable = {ord(ch): None for ch in string.punctuation}

def cln(i, extent=1):
    """
    String white space 'cleaner'.

    :param i: value to clean. Anything that is not a non-empty ``str`` is
        returned unchanged.
    :param extent: 1 --> every run of two or more white space characters is
        replaced by a single space (a lone white space character is left
        as it is); 2 --> removal of all white space.
    :return: the cleaned string, or ``i`` itself when it is not a non-empty
        string.
    :raises ValueError: if ``i`` is a non-empty string and ``extent`` is
        neither 1 nor 2 (this case used to return ``None`` silently).
    """
    if not isinstance(i, str) or i == "":
        return i

    if extent == 1:
        return re.sub(r"\s\s+", " ", i)
    if extent == 2:
        return re.sub(r"\s+", "", i)
    raise ValueError(f"extent must be 1 or 2, got {extent!r}")


def strip_punctuation(input_string):
    """Return ``input_string`` translated through the module-level ``transtable``."""
    stripped = input_string.translate(transtable)
    return stripped



def chunks(l, n):
    """
    Yield n sequential slices of l.

    The first ``len(l) % n`` slices hold one element more than the rest,
    so slice sizes differ by at most one.
    """
    base, extra = divmod(len(l), n)
    start = 0
    for idx in range(n):
        size = base + 1 if idx < extra else base
        yield l[start:start + size]
        start += size


def flatten(l):
    """
    Flatten one level of nesting.

    :param l: iterable of iterables.
    :return: list with the items of every inner iterable, in order.
    """
    return [item for sublist in l for item in sublist]

def get_match(text, rex):
    """
    Return ``re.findall`` results for ``rex`` in ``text``.

    When ``rex`` is a list, tuple or set, its members are joined with '|'
    inside one capturing group before searching; otherwise ``rex`` is used
    as the pattern directly.
    """
    pattern = rex
    if isinstance(rex, (list, tuple, set)):
        alternatives = '|'.join(rex)
        pattern = '(' + alternatives + ')'
    return re.findall(pattern, text)


def make_url_from_href(href, base='https://www.dhs.gov'):
    """
    Turn a scraped ``href`` into a full URL.

    :param href: link target; may be a missing-value marker.
    :param base: prefix for site-relative links.
    :return: ``None`` when ``str(href).lower()`` is one of 'nan', 'none',
        'np.nan', 'na'; ``href`` unchanged when it already names a host
        (contains 'www.', has a URL scheme, or is protocol-relative);
        otherwise ``base`` joined to ``href`` with exactly one slash.
    """
    # Local import so the function carries its own dependency.
    from urllib.parse import urlparse

    if str(href).lower() in ['nan', 'none', 'np.nan', 'na']:
        return None

    href = str(href)
    parsed = urlparse(href)
    # Previously only 'www.' marked a link as absolute, so a URL such as
    # 'https://dhs.gov/x.pdf' had ``base`` glued in front of it.
    if 'www.' in href or parsed.scheme or parsed.netloc:
        return href

    # Fragment-only / query-only links attach directly to the base.
    if href.startswith(('#', '?')):
        return base + href

    # Normalise the slash between base and path so that 'path' and '/path'
    # both work, with or without a trailing slash on ``base``.
    return base.rstrip('/') + '/' + href.lstrip('/')
        
def name_to_regex(name):
    """
    Build a case-insensitive, word-bounded regex for a name.

    :param name: the literal name to search for.
    :return: compiled pattern matching ``name`` and, when it differs, the
        variant of ``name`` with combining accents removed.
    """
    # The original called ``unidecode``, which this notebook never imports
    # (NameError on first call). The accent-stripped variant is derived with
    # the standard library instead, the same way ``unicodeToAscii`` does.
    decoded_name = ''.join(
        ch for ch in unicodedata.normalize('NFD', name)
        if unicodedata.category(ch) != 'Mn'
    )

    # Escape so that characters such as '.', '(' or '|' inside a name are
    # matched literally instead of being interpreted as regex syntax.
    escaped_name = re.escape(name)
    escaped_decoded = re.escape(decoded_name)

    if name != decoded_name:
        regex = fr'\b({escaped_name}|{escaped_decoded})\b'
    else:
        regex = fr'\b{escaped_name}\b'
    return re.compile(regex, flags=re.IGNORECASE)


def get_date(date_string):
    """
    Best-effort date extraction from free text.

    Only the part of ``date_string`` before the first occurrence of
    'update' is handed to ``parser.parse`` (fuzzy mode).

    :return: the parsed date, or ``None`` if anything raises an Exception.
    """
    try:
        leading_text = date_string.split('update')[0]
        return parser.parse(leading_text, fuzzy=True)
    except Exception:
        return None
    
def get_date_multiple_strings(row):
    """
    Date for a row: taken from ``row['text']``, falling back to
    ``row['href']`` when the text yields no (truthy) date.
    """
    return get_date(row['text']) or get_date(row['href'])

        
    

2022-06-11


In [2]:
# Folder that receives the metadata CSVs and the downloaded PDFs.
dir_path = 'NationalTerrorismAdvisories/'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of ``os.path.exists`` followed by ``os.mkdir``.
os.makedirs(dir_path, exist_ok=True)

```python
# Fetch the NTAS landing page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status stops an HTTP error page from
# being scraped as if it were the real listing.
response = requests.get('https://www.dhs.gov/national-terrorism-advisory-system', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every anchor whose href matches the advisory/publication pattern.
bulletin_links = soup.find_all('a', href=re.compile(r'.*/advisory*|.*/publication*'))
dfbulletins = pd.DataFrame(
    [{'text': a.text.strip(), 'href': a.attrs.get('href')} for a in bulletin_links],
    columns=['text', 'href'],
)
if dfbulletins.empty:
    # Fail with a clear message instead of an obscure KeyError further down.
    raise RuntimeError('No advisory/publication links found on the NTAS page')

dfbulletins['url'] = dfbulletins['href'].map(make_url_from_href)
# Date comes from the link text, falling back to the href.
dfbulletins['date'] = dfbulletins.apply(get_date_multiple_strings, axis=1)
dfbulletins['year'] = dfbulletins['date'].dt.year
# Keep only links for which a date could be parsed.
dfbulletins = dfbulletins[dfbulletins['year'].notnull()].reset_index(drop=True)
dfbulletins['language'] = dfbulletins['href'].map(lambda x: 'Translation' if 'translation' in x.lower() else 'English')
filename = 'NationalTerrorismAdvisories/meta-dfbulletins.csv'
dfbulletins.to_csv(filename, index=False)
```

In [3]:
# Reload the bulletin metadata from the CSV on disk.
filename = 'NationalTerrorismAdvisories/meta-dfbulletins.csv'
dfbulletins = pd.read_csv(filename)
# Preview the first rows as the cell output.
dfbulletins.head()


Unnamed: 0,text,href,url,date,year,language
0,"National Terrorism Advisory System Bulletin - June 7, 2022",/ntas/advisory/national-terrorism-advisory-system-bulletin-june-7-2022,https://www.dhs.gov/ntas/advisory/national-terrorism-advisory-system-bulletin-june-7-2022,2022-06-07,2022.0,English
1,Read this Bulletin translated into various languages,/publication/national-terrorism-advisory-system-bulletin-june-7-2022-translations,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-june-7-2022-translat...,2022-06-07,2022.0,Translation
2,"National Terrorism Advisory System Bulletin - February 07, 2022",/ntas/advisory/national-terrorism-advisory-system-bulletin-february-07-2022,https://www.dhs.gov/ntas/advisory/national-terrorism-advisory-system-bulletin-february-07-2022,2022-02-07,2022.0,English
3,Read this Bulletin translated into various languages,/publication/national-terrorism-advisory-system-bulletin-february-7-2022-translations,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-february-7-2022-tran...,2022-02-07,2022.0,Translation
4,"National Terrorism Advisory System Bulletin - November 10, 2021",/ntas/advisory/national-terrorism-advisory-system-bulletin-november-10-2021,https://www.dhs.gov/ntas/advisory/national-terrorism-advisory-system-bulletin-november-10-2021,2021-11-10,2021.0,English


```python
def get_table_from_url(url):
    """
    Scrape the attachment table of one translations page.

    :param url: page containing an HTML table of attachments.
    :return: DataFrame of the page's PDF links ('Attachment' text and
        'href') merged with the first HTML table on the page, with missing
        'Date' values filled from the attachment text via ``get_date`` and
        the source page recorded in 'orig_url'.
    :raises requests.HTTPError: if the page request fails.
    :raises ValueError: if the page has no ``<tbody>`` element.
    """
    # Local import: this function is defined before the cell that imports
    # StringIO at notebook level.
    from io import StringIO

    # Download once (with a timeout and a status check) and reuse the HTML
    # for both the table parser and BeautifulSoup; the original fetched the
    # same URL twice.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text

    df = pd.read_html(StringIO(html))[0]

    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        raise ValueError(f'No <tbody> element found at {url}')

    pdf_links = tbody.find_all('a', href=re.compile(r'.*pdf*|.*PDF*'))
    attachments = pd.DataFrame(
        [{'Attachment': t.text.strip(), 'href': t.attrs.get('href')} for t in pdf_links],
        columns=['Attachment', 'href'],
    )

    # merge() with no keys joins on the columns the two frames share.
    tempdf = attachments.merge(df)
    tempdf['Date'] = tempdf.apply(
        lambda row: get_date(row['Attachment']) if pd.isnull(row['Date']) else row['Date'],
        axis=1,
    )
    tempdf['orig_url'] = url
    return tempdf


# Read the bulletin metadata with pandas rather than splitting raw lines on
# ",": the 'text' field is quoted and contains commas (e.g. "... - June 7,
# 2022"), which shifts every later column under a naive split. This also
# closes the file handle, which the bare open() never did.
bulletins_meta = pd.read_csv(filename)
translation_page_urls = bulletins_meta.loc[bulletins_meta['language'] == 'Translation', 'url']

# One attachment table per translations page, stacked into a single frame.
urls_ = pd.concat(get_table_from_url(page_url) for page_url in translation_page_urls)

# Language labels searched for in the cleaned attachment text. Order matters:
# the alternatives are tried left to right at each position.
LANGUAGE_PATTERNS = [
    'arabic',
    'chinesefrench',
    'chinesesimplified',
    'chinesetraditional', 'chinese traditional', 'chinese simplified', 'simplified chinese', 'traditional chinese',
    'french',
    'korean',
    'portugese',
    'portuguese',
    'russian',
    'simplifiedchinese',
    'somali',
    'spanish',
    'tagalog',
    'thai',
    'traditionalchinese',
    'vietnamese',
]

# Attachment text -> accents stripped -> lower-cased, punctuation removed,
# white space collapsed -> first matching language label (None if no match).
urls_['lang'] = (
    urls_['Attachment']
    .map(unicodeToAscii)
    .map(lambda x: get_match(cln(strip_punctuation(x.lower())), rex=LANGUAGE_PATTERNS))
    .map(lambda x: x[0] if len(x) > 0 else None)
)

def normalize_lang(x):
    """
    Collapse the spelling variants of the Chinese labels.

    :param x: language label, or a missing value.
    :return: 'simplified-chinese' / 'traditional-chinese' when ``x``
        mentions 'chinese' together with that script; otherwise ``x``
        unchanged. Non-string input (None, NaN) is passed through instead
        of raising, and a Chinese label without a script qualifier is kept
        rather than turned into None.
    """
    if not isinstance(x, str):
        return x

    lowered = x.lower()
    if 'chinese' not in lowered:
        return x
    if 'simplified' in lowered:
        return 'simplified-chinese'
    if 'traditional' in lowered:
        return 'traditional-chinese'
    return x


# Re-parse the Date column so every entry goes through get_date.
urls_['Date'] = urls_['Date'].map(str).map(get_date)

urls_['lang'] = urls_['lang'].map(normalize_lang)

# Later cells build PDF file names from row['index'], but no 'index' column
# was ever written to this CSV (presumably it came from a reset_index() in a
# cell that no longer exists -- TODO confirm). Create a unique running index
# explicitly so a fresh Restart & Run All works.
if 'index' not in urls_.columns:
    urls_ = urls_.reset_index(drop=True).rename_axis('index').reset_index()

# os.path.join avoids the doubled slash of dir_path + '/...'.
urls_.to_csv(os.path.join(dir_path, 'meta-all-translation-pdfs.csv'), index=False)
```

```python
# Download every translated PDF listed in the metadata CSV.
urls_ = pd.read_csv(os.path.join(dir_path, 'meta-all-translation-pdfs.csv'))
files = urls_.to_dict('records')
for row in tqdm(files, total=len(files)):
    link_href = row['href']

    # File name layout: <index>_<Date>_<lang>.pdf (the CSV must provide an
    # 'index' column).
    link_filename = f"{row['index']}_{row['Date']}_{row['lang']}.pdf"

    pdf_res = requests.get(link_href, timeout=60)
    # Stop on HTTP errors so an error page is never written to disk under a
    # .pdf name.
    pdf_res.raise_for_status()

    with open(os.path.join(dir_path, link_filename), 'wb') as f:
        f.write(pdf_res.content)
``` 

```python
from io import StringIO

from PyPDF2 import PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def extract_information(pdf_path, verbose=False):
    """
    Read the page count and document metadata of a PDF.

    :param pdf_path: path of the PDF file.
    :param verbose: when True, print a short human-readable summary.
    :return: dict with 'pdf_path', 'num_pages' and one entry per metadata
        item, keyed by the item's name with punctuation stripped.
    """
    summary_fields = ('author', 'creator', 'producer', 'subject', 'title')

    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        # Pull everything out of the metadata object while the file is still
        # open (the original touched it after the ``with`` block had closed
        # the file). ``information`` may be None for a PDF without an info
        # dictionary -- TODO confirm against the PyPDF2 docs; getattr with a
        # default and the guard below keep that case from raising.
        summary = {field: getattr(information, field, None) for field in summary_fields}
        raw_items = list(information.items()) if information is not None else []

    ad = {}
    ad['pdf_path'] = pdf_path
    ad['num_pages'] = number_of_pages
    for k, val in raw_items:
        ad[strip_punctuation(k)] = val

    # Build the summary text only when it is going to be printed.
    if verbose:
        txt = f"""
    Information about {pdf_path}: 

    Author: {summary['author']}
    Creator: {summary['creator']}
    Producer: {summary['producer']}
    Subject: {summary['subject']}
    Title: {summary['title']}
    Number of pages: {number_of_pages}
    """
        print(txt)
    return ad

def _extract_pdf_text(pdf_path):
    """Return the text pdfminer extracts from all pages of ``pdf_path``."""
    output_string = StringIO()
    with open(pdf_path, 'rb') as in_file:
        # Deliberately NOT named ``parser``: this code runs at notebook
        # level, and rebinding the global ``parser`` would replace the
        # dateutil parser that get_date relies on (get_date would then
        # silently return None, because it swallows the resulting error).
        pdf_parser = PDFParser(in_file)
        doc = PDFDocument(pdf_parser)

        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return output_string.getvalue()


# One record per downloaded Russian PDF: metadata, file-name parts, full text.
ld = []
pdf_files = glob.glob('NationalTerrorismAdvisories/*russian*pdf')
for file in tqdm(pdf_files):
    d = extract_information(file)

    # File names follow <index>_<date>_<lang>.pdf. Drop the extension before
    # splitting so 'lang' does not end in '.pdf', and cap the split at two
    # so an underscore inside the language part cannot break the unpacking.
    stem = os.path.splitext(os.path.basename(file))[0]
    idx, date, lang = stem.split('_', 2)
    d['file_idx'] = idx
    d['bulletin_date'] = date
    d['lang'] = lang
    d['file'] = file

    d['text'] = _extract_pdf_text(file)
    ld.append(d)
```

In [89]:
# Use the fully-qualified option name: the bare pattern 'max_columns' also
# matches 'styler.render.max_columns' on newer pandas releases, and
# set_option refuses a pattern that matches more than one option.
pd.set_option('display.max_columns', 12)

# NOTE(review): `mergeddata` is assigned in a cell that appears further down
# in this notebook, so this cell fails on a fresh top-to-bottom run until
# that cell has been executed.
mergeddata[[ 'num_pages', 'CreationDate', 'Creator',
    'Producer', 'bulletin_date',  'lang', 'file',  'Title', 'Keywords', 'Subject', 'text',
       'href', 'orig_url']].sort_values('bulletin_date', ascending=False)

Unnamed: 0,num_pages,CreationDate,Creator,Producer,bulletin_date,lang,...,Title,Keywords,Subject,text,href,orig_url
2,2,D:20210129170043-05'00',Acrobat PDFMaker 20 for Word,Adobe PDF Library 20.12.73,2021-02-02,russian,...,NTAS Bulletin (Russian),,,Бюллетень \nНациональная система предупреждения о террористической угрозе \n\nwww.dhs.gov/adviso...,https://www.dhs.gov/sites/default/files/publications/21_0202_ntas-translation_russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-january-27-2021-tran...
5,3,D:20200110113539-05'00',Acrobat PDFMaker 19 for Word,Adobe PDF Library 19.12.66,2020-01-04,russian,...,"National Terrorism Advisory Security Bulletin, January 2020, Russian",,,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/20_0104_ntas_russian_0.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-january-4-2020-trans...
1,3,D:20190207040114Z,Acrobat PDFMaker 19 for Word,Adobe PDF Library 19.10.96,2019-01-18,russian,...,NTAS Update January 18 2019,"Terrorism, NTAS, DHS",National Terrorism Advisory System Bulletin,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/19_0118_NTAS-bulletin_Russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-january-18-2019-tran...
6,3,D:20180918201109Z,Acrobat PDFMaker 15 for Word,Adobe PDF Library 15.0,2018-09-14,russian,...,NTAS_Russian_FINAL,"Terrorism, NTAS, DHS",National Terrorism Advisory System Bulletin,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/18_0914_NTAS-Bulletin_Russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-september-14-2018-tr...
7,3,D:20180510105940-04'00',Acrobat PDFMaker 18 for Word,Adobe PDF Library 15.0,2018-05-09,russian,...,Национальная система предупреждения о,,,Национальная система предупреждения о \n\nтеррористической угрозе \n\nБюллетень \n\nwww.dhs.gov/...,https://www.dhs.gov/sites/default/files/publications/18_0509_NTAS_Russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-may-9-2018-translations
3,3,D:20171109155012-05'00',Acrobat PDFMaker 15 for Word,Adobe PDF Library 15.0,2017-11-09,russian,...,17_1109_NTAS_Russian,,,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/17_1109_NTAS_Russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-november-9-2017-tran...
8,3,D:20170516132536-04'00',PScript5.dll Version 5.2.2,Acrobat Distiller 15.0 (Windows),2017-05-15,russian,...,NTAS Bulletin 2017 (Russian),,,Национальная\tсистема\tпредупреждения\tо\tтеррористической\tугрозе \n\t\nБюллетень\t \n\nwww.dhs...,https://www.dhs.gov/sites/default/files/publications/NTAS-Bulletin_2017_Russian.pdf,https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-may-15-2017-translat...
9,3,D:20161115204816-05'00',Acrobat PDFMaker 15 for Word,Adobe PDF Library 15.0,2016-11-15,russian,...,NTAS - Russian,,,Национальная система предупреждения о террористической угрозе \n \n\nБюллетень \n \n\nwww.dhs.go...,https://www.dhs.gov/sites/default/files/publications/16_1115_NTAS_bulletin_RU.pdf,https://www.dhs.gov/publication/ntas-bulletin-november-15-2016-translations
4,3,D:20160617095130-04'00',Acrobat PDFMaker 11 for Word,Adobe PDF Library 11.0,2016-06-15,russian,...,NTAS Bulletin 2016 001 Russian,,,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/16_0615_NTAS_bulletin_RU.pdf,https://www.dhs.gov/publication/ntas-bulletin-june-15-2016-translations
0,2,D:20160201134704-05'00',Acrobat PDFMaker 11 for Word,Adobe PDF Library 11.0,2015-12-16,russian,...,,,,Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...,https://www.dhs.gov/sites/default/files/publications/15_1216_NTAS_bulletin_RU.pdf,https://www.dhs.gov/publication/ntas-bulletin-december-16-2015-translations


In [46]:
# Reload the PDF link metadata.
urls_ = pd.read_csv('NationalTerrorismAdvisories/meta-all-translation-pdfs.csv')


def _local_pdf_path(row):
    """Path under dir_path of the downloaded PDF that belongs to ``row``."""
    return f"{dir_path}{row['index']}_{row['Date']}_{row['lang']}.pdf"


urls_['pdf_path'] = urls_.apply(_local_pdf_path, axis=1)

# Attach the download URLs to the extracted PDF records, keeping every
# extracted record (left join on the local file path).
mergeddata = pd.DataFrame(ld).merge(
    urls_[['pdf_path', 'href', 'orig_url','Date']],
    how='left',
    on=['pdf_path'],
)

In [47]:
# Show the merged frame (extracted PDF records joined with their URLs).
mergeddata

num_pages                                                                                                          3
bulletin_date                                                                                             2020-01-04
text             Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advi...
href                                 https://www.dhs.gov/sites/default/files/publications/20_0104_ntas_russian_0.pdf
orig_url         https://www.dhs.gov/publication/national-terrorism-advisory-system-bulletin-january-4-2020-trans...
Date                                                                                                      2020-01-04
Name: 5, dtype: object

In [59]:
import pprint

# Show the third-newest bulletin (position 2 after sorting by bulletin_date,
# newest first) as a plain dict of the selected fields.
record_columns = ['text', 'num_pages', 'Keywords', 'Subject', 'Title',
                  'href', 'orig_url', 'Date']
newest_first = mergeddata.sort_values('bulletin_date', ascending=False)
newest_first[record_columns].to_dict('records')[2]

{'text': 'Национальная система предупреждения о террористической угрозе \n\nБюллетень \n\nwww.dhs.gov/advisories \n\nДАТА ВЫПУСКА: пятница, 18 января 2019 \n\nОбзор террористической угрозы на территории США \nС 2015 года Министерство национальной безопасности (МНБ) публикует в данном бюллетене сообщения о \nсохраняющейся террористической угрозе на территории США. США находятся в эпохальной схватке с \nтеррористами, которые стремятся совершать нападения на американский народ, нашу страну и наш образ жизни. \nИнформированность, бдительность и вовлеченность общественности входят в число важнейших ресурсов для \nвыявления потенциальных террористов и предотвращения терактов. \n\nСрок действия \nДата выпуска: 18 января 2019 года, 14.00 восточного времени \nДата истечения срока действия: 18 января 2019 года, 13.00 восточного времени \nПримечание: публикация продления срока действия задерживается в связи с временным отсутствием \nфедерального финансирования. \n\nДополнительная информация \n\n•

In [None]:
# from pdfminer.converter import PDFPageAggregator
# from pdfminer.layout import LAParams, LTFigure, LTTextBox
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
# from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
# from pdfminer.pdfparser import PDFParser
# for file in pdf_files[:1]:
#     d = {'file': file, } 
#     textl = []
#     stackl = []
#     with open(file, 'rb') as f:
#         parser = PDFParser(f)
#         doc = PDFDocument(parser)
#         pages = PDFPage.create_pages(doc)
#         d = doc.info
#         print(d[0])
#         for page in list(pages):
#             print(page)
#             rsrcmgr = PDFResourceManager()
#             device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
#             interpreter = PDFPageInterpreter(rsrcmgr, device)
#             interpreter.process_page(page)
#             layout = device.get_result()
#             text = ""
#             stack = []
#             for obj in layout:
#                 if isinstance(obj, LTTextBox):
#                     text += obj.get_text()
            
#                 elif isinstance(obj, LTFigure):
#                     stack += list(obj)
#             print(text)
#             textl.append(text)
            
#             break

In [71]:
# Remove the '.pdf' remnant left in 'lang' by the file-name split. The
# vectorised .str accessor (literal match, regex=False) leaves missing
# values alone, where the per-element lambda raised AttributeError.
mergeddata['lang'] = mergeddata['lang'].str.replace('.pdf', '', regex=False)

In [68]:
# Destination of the JSON Lines export. The default is the original
# hard-coded location, which only exists on the author's machine; set the
# RUSSIAN_NOTICES_JSONL environment variable to write somewhere else.
output_jsonl = os.environ.get(
    'RUSSIAN_NOTICES_JSONL',
    '/Users/user/FOLDER2CLEANOLDICLOUD/Desktop/PycharmProjects/BMSC-GA-4493-Spring2021/lab12/russian-notices.jsonl',
)
with jsonlines.open(output_jsonl, 'w') as fout:
    # One JSON object per merged record.
    fout.write_all(mergeddata.to_dict('records'))

In [None]:
# PDF links of the rows whose language is 'russian'.
urls_.loc[urls_['lang'] == 'russian', 'href']