In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import os
import glob
import string
import re

Save all headlines to one file and delete duplicates

In [3]:
# Collect the path of every CSV stored directly under saved_headlines/.
# glob makes no ordering guarantee, so the list is not chronological.
files = glob.glob('saved_headlines/*.csv')

In [4]:
# Show which files were matched (each name encodes the date range it covers).
files

['saved_headlines/2018-11-13 10:29:15 to 2020-02-05 09:38:59.csv',
 'saved_headlines/2022-01-27 18:40:50 to 2022-01-30 22:02:24.csv',
 'saved_headlines/2021-10-04 06:49:57 to 2022-01-14 02:18:40.csv',
 'saved_headlines/2021-12-31 12:10:16 to 2022-01-20 16:47:50.csv',
 'saved_headlines/2020-09-24 21:29:05 to 2021-10-04 06:57:50.csv',
 'saved_headlines/2020-02-05 09:14:28 to 2020-09-24 21:37:29.csv',
 'saved_headlines/2022-01-25 11:19:20 to 2022-01-27 18:34:12.csv',
 'saved_headlines/2022-01-20 16:53:47 to 2022-01-25 22:00:55.csv']

In [5]:
# Read each matched CSV with pandas' default settings and stack the frames
# vertically. Each frame keeps its own row labels, so the combined index is
# not unique.
df = pd.concat(list(map(pd.read_csv, files)))

In [6]:
# Row count of the combined frame, before duplicates are removed.
len(df)

256028

In [7]:
# Remove rows that are identical in every column, keeping the first
# occurrence of each (drop_duplicates defaults). Rows that differ in any
# column are all retained.
df = df.drop_duplicates()

In [8]:
# Row count after de-duplication.
len(df)

253293

In [9]:
# Order rows by the 'date' column, ascending (the sort_values default).
# NOTE(review): the CSVs are read without parse_dates, so 'date' is presumably
# a string column and this is a lexicographic sort; with the
# 'YYYY-MM-DD HH:MM:SS' format seen in the output that matches chronological
# order - confirm if the format ever changes.
df = df.sort_values('date')

In [10]:
# Smallest and largest 'date' values, converted to strings so they can be
# interpolated into the output file name.
start, end = (str(bound) for bound in (df['date'].min(), df['date'].max()))

In [11]:
# Show the date range covered by the combined data.
start, end

('2018-11-13 10:29:15', '2022-01-30 22:02:24')

In [13]:
# Make sure the output directory exists first: to_csv does not create missing
# parent directories and raises OSError when the folder is absent (e.g. on a
# fresh checkout).
os.makedirs('complete_headline_list', exist_ok=True)

# Write the de-duplicated, date-sorted headlines to a file named after the
# date range it covers. index=False leaves the row labels out of the file.
df.to_csv('complete_headline_list/%s to %s.csv' % (start, end), index=False)

Clean data

In [14]:
# Character class that matches any single ASCII punctuation character.
# re.escape keeps characters such as ']' , '^' and '\' literal inside the class.
regex = re.compile('[' + re.escape(string.punctuation) + ']')

In [15]:
def cleaner(df, col, new_col, pattern=None):
    """Add a normalised copy of a text column to ``df``.

    The text in ``col`` is lower-cased, every match of ``pattern`` is removed,
    and leading/trailing whitespace is stripped. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place: ``new_col``
        is added, or overwritten if it already exists.
    col : str
        Name of the existing text column to clean.
    new_col : str
        Name of the column that receives the cleaned text.
    pattern : str or compiled regex, optional
        Regular expression whose matches are deleted. Defaults to a character
        class of all ASCII punctuation (``string.punctuation``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if pattern is None:
        # Built here rather than read from a notebook-level variable so the
        # function does not depend on another cell having been run first.
        pattern = re.compile('[%s]' % re.escape(string.punctuation))

    df[new_col] = (
        df[col]
        .str.lower()
        # regex=True must be explicit: pandas >= 2.0 defaults to regex=False
        # and raises ValueError when given a compiled pattern in that mode.
        .str.replace(pattern, '', regex=True)
        # Strip last so whitespace exposed by removed punctuation goes too.
        .str.strip()
    )
    return df

In [16]:
# Add 'headline_clean': the headline text lower-cased, with punctuation
# removed and surrounding whitespace stripped.
df = cleaner(df, col='headline_text', new_col='headline_clean')

In [17]:
# Add 'preview_clean': the article preview lower-cased, with punctuation
# removed and surrounding whitespace stripped.
df = cleaner(df, col='article_preview', new_col='preview_clean')

In [19]:
# Preview the cleaned frame newest-first. The sorted result is only displayed,
# not assigned, so df itself keeps its ascending date order.
df.sort_values('date', ascending=False)

Unnamed: 0,headline_text,article_url,article_preview,date,author,headline_clean,preview_clean
499,New Zealand actor Pete Smith dead at 63: Star ...,/tvshowbiz/article-10457837/New-Zealand-actor-...,"New Zealand actor Pete Smith, who starred in d...",2022-01-30 22:02:24,Mary Mrad For Daily Mail Australia,new zealand actor pete smith dead at 63 star o...,new zealand actor pete smith who starred in dr...
498,Bindi Irwin says she has 'never worried so muc...,/tvshowbiz/article-10457949/Bindi-talks-mother...,"'I've never worried so much in my life, becaus...",2022-01-30 21:52:56,Jo Scrimshire For Daily Mail Australia,bindi irwin says she has never worried so much...,ive never worried so much in my life because i...
497,Glamorous Australian fashion designer Lïllïan ...,/tvshowbiz/article-10457841/L-ll-Khallouf-welc...,Top Australian fashion designer Lïllïan Khallo...,2022-01-30 21:29:19,Monique Friedlander For Daily Mail Australia,glamorous australian fashion designer lïllïan ...,top australian fashion designer lïllïan khallo...
496,Dancing On Ice: Paul Gascoigne breaks down in ...,/tvshowbiz/article-10457813/Paul-Gascoigne-bre...,The proud dad has been struggling to hold back...,2022-01-30 21:27:56,Laura Parkin For Mailonline,dancing on ice paul gascoigne breaks down in t...,the proud dad has been struggling to hold back...
495,"Howard Hesseman, who starred in WKRP in Cincin...",/tvshowbiz/article-10457913/Howard-Hesseman-st...,"Howard Hesseman, best known for his starring r...",2022-01-30 21:10:20,Ashley Hume For Dailymail.Com,howard hesseman who starred in wkrp in cincinn...,howard hesseman best known for his starring ro...
...,...,...,...,...,...,...,...
4,'Do you understand? He got arrested! He went t...,/tvshowbiz/article-6383519/He-got-arrested-Wal...,"In April, champion mixed martial artist Conor ...",2018-11-13 11:00:13,Nick Hadley For Daily Mail Australia,do you understand he got arrested he went to j...,in april champion mixed martial artist conor m...
3,Scarlett Moffatt displays sunburned legs as sh...,/tvshowbiz/article-6383693/Scarlett-Moffatt-lo...,She will spend the next five weeks Down Under ...,2018-11-13 10:53:52,Jessica Green For Mailonline,scarlett moffatt displays sunburned legs as sh...,she will spend the next five weeks down under ...
2,Sashay away! Sam Smith performs a sassy dance ...,/tvshowbiz/article-6383583/Sam-Smith-dances-Sy...,Sam Smith is currently having the time of his ...,2018-11-13 10:45:44,Joshua Fox For Daily Mail Australia,sashay away sam smith performs a sassy dance r...,sam smith is currently having the time of his ...
1,Cara Delevingne asked Princess Eugenie for per...,/tvshowbiz/article-6383789/Cara-Delevingne-ask...,The 26-year-old admitted she text her long-tim...,2018-11-13 10:38:14,Ciara Farmer For Mailonline,cara delevingne asked princess eugenie for per...,the 26yearold admitted she text her longtime f...
