# Get new headlines from Daily Mail and save to one file

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import os
import glob

1. [Get new headlines](#Get-new-headlines)
    1. [Parameters for Daily Mail search]()
    1. [Get new Daily Mail search pages]()
    1. Extract headlines using Beautiful Soup
    1. Format dates
    1. Save new headline data
1. [Save all headlines to one file and delete duplicates](#Save-all-headlines-to-one-file-and-delete-duplicates)

## Get new headlines

### Parameters for Daily Mail search

In [162]:
# custom
page_size = 50
start_page = 400
end_page = 425

In [163]:
# default
endpoint = 'https://www.dailymail.co.uk/home/search.html'
params = {
    'offset': '0',
    'size': str(page_size),
    'sel': 'site',
    'searchPhrase': '',
    'sort': 'recent',
    'channel': 'tvshowbiz',
    'type': 'article',
    'days': 'all'
}

### Get new Daily Mail search pages

In [164]:
page_content = []
for page in range(start_page, end_page):
    params['offset'] = str(page * page_size)
    new_r = requests.get(endpoint, params=params)
    page_content.append(new_r.text)
    time.sleep(1)

In [165]:
# number of new records
(end_page - start_page) * page_size

1250

### Extract headlines using Beautiful Soup

In [166]:
article_bs = []
for html in page_content:
    bs = BeautifulSoup(html)
    div_sch_result = bs.find_all('div', class_='sch-result')
    article_bs = article_bs + div_sch_result

In [167]:
headline_list = []
for bs_obj in article_bs:
    h3 = bs_obj.find('h3', class_='sch-res-title')
    h4 = bs_obj.find('h4')
    a = h4.find('a').text if h4.find('a') != None else h4.text
    split = h4.text.split(' -')
    date = split[1] if len(split) > 1 else h4.text

    obj = {
        'headline_text': h3.text,
        'article_url': h3.find('a')['href'],
        'article_preview': bs_obj.find('p', class_='sch-res-preview').text,
        'date': date,
        'author': a
    }
    headline_list.append(obj)

### Format dates

In [168]:
df = pd.DataFrame(headline_list)
df = df.drop_duplicates()
df['date'] = df['date'].str.strip()

In [169]:
def replace(old, new):
    return df['date'].str.replace(old, new)

In [170]:
for i in [['\n', ''], ['th', ''], ['nd',''], ['rd',''], ['st', ''],['Augu ', 'August ']]:
    df['date'] = replace(i[0], i[1])

In [171]:
df['date'] = pd.to_datetime(df['date'], format='%B %d %Y, %I:%M:%S %p')
df = df.sort_values('date')

In [172]:
start = str(df['date'].min())
end = str(df['date'].max())

### Save new headline data

In [173]:
start, end

('2022-01-26 12:25:27', '2022-02-02 01:12:40')

In [174]:
df.to_csv('saved_headlines/%s to %s.csv' % (start, end), index=False)

## Save all headlines to one file and delete duplicates

In [175]:
files = glob.glob('saved_headlines/*.csv')

In [176]:
files

['saved_headlines/2022-04-18 22:34:33 to 2022-05-13 15:43:28.csv',
 'saved_headlines/2022-02-02 01:12:40 to 2022-02-14 18:35:11.csv',
 'saved_headlines/2022-02-14 18:38:57 to 2022-02-27 11:09:40.csv',
 'saved_headlines/2022-01-30 04:33:49 to 2022-02-01 01:22:44.csv',
 'saved_headlines/2022-02-01 01:25:16 to 2022-02-05 22:32:44.csv',
 'saved_headlines/2022-02-27 11:09:40 to 2022-04-18 22:34:33.csv',
 'saved_headlines/2022-01-20 04:18:14 to 2022-01-26 12:19:01.csv',
 'saved_headlines/2022-01-22 22:06:19 to 2022-01-26 00:24:44.csv',
 'saved_headlines/2022-01-26 12:25:27 to 2022-02-02 01:12:40.csv']

In [177]:
df = pd.concat([pd.read_csv(file) for file in files])

In [178]:
len(df)

24491

In [179]:
df = df.drop_duplicates()

In [180]:
len(df)

22497

In [181]:
df = df.sort_values('date')

In [182]:
start = str(df['date'].min())
end = str(df['date'].max())

In [183]:
start, end

('2022-01-20 04:18:14', '2022-05-13 15:43:28')

In [184]:
# replace old main file
!rm -r complete_headline_list/*

zsh:1: no matches found: complete_headline_list/*


In [185]:
df.to_csv('complete_headline_list/%s to %s.csv' % (start, end), index=False)