In [1]:
import json
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, timedelta


In [2]:
# Empty frame that fixes the column order for the article records loaded below.
df = pd.DataFrame(
    data=[],
    columns=[
        'PubDate',
        'URL',
        'byline',
        'headline',
        'standfirst',
        'body',
    ],
)


In [3]:

# Read one JSON file per calendar day (articles/YYYY-MM-DD.json) and flatten
# every article into a row of `df`.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end: appending with df.loc[len(df.index)] re-allocates the frame on every
# article. Rebinding `df` here also makes the cell idempotent, so re-running
# it no longer appends a second copy of every article.
start_date = date(2020, 4, 1)
end_date = date(2020, 10, 31)
columns = ['PubDate', 'URL', 'byline', 'headline', 'standfirst', 'body']


def _html_to_text(fragment):
    """Return the text content of an HTML fragment with all markup removed."""
    return BeautifulSoup(fragment, features='html.parser').get_text()


records = []
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = 'articles/' + datestr + '.json'

    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding (which is not UTF-8 everywhere) to decode the article text.
    with open(fname, encoding='utf-8') as f:
        data = json.load(f)

    for article in data:
        # An article without a 'fields' entry yields empty text columns
        # instead of raising KeyError.
        fields = article.get('fields') or {}

        records.append({
            'PubDate': article.get('webPublicationDate', ""),
            'URL': article.get('webUrl', ""),
            'headline': fields.get('headline', ""),
            # standfirst and body hold HTML; keep only their text.
            'standfirst': _html_to_text(fields['standfirst']) if 'standfirst' in fields else "",
            'byline': fields.get('byline', ""),
            'body': _html_to_text(fields['body']) if 'body' in fields else "",
        })

# Passing `columns` fixes the column order and keeps all six columns even
# when no article was loaded.
df = pd.DataFrame(records, columns=columns)


In [4]:
# Sanity check: (rows, columns) of the frame after loading every daily file.
df.shape

(3067, 6)

In [6]:
def is_rumour_mill(row):
    """Tell whether a row's URL contains the substring 'transfer-rumours'.

    Parameters
    ----------
    row : mapping-like
        Anything subscriptable by 'URL' whose value is a string, e.g. a
        DataFrame row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True when 'transfer-rumours' occurs anywhere in the URL.
    """
    url = row['URL']
    # str.find returns -1 when the substring is absent.
    position = url.find('transfer-rumours')
    return position != -1

In [12]:
# NOTE(review): df_sample is not defined in any cell visible here; confirm
# how it is built (presumably a slice of df) so the notebook runs on a fresh
# kernel.
#
# assign() returns a new frame with the extra column, so nothing is written
# into a slice of another DataFrame. Assigning the column directly on
# df_sample is what raised pandas' SettingWithCopyWarning.
df_sample = df_sample.assign(
    is_rumour_mill=df_sample.apply(is_rumour_mill, axis=1)
)


In [13]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table; print() falls back to the wrapped plain-text repr.
df_sample

                   PubDate                                                URL  \
2300  2020-09-15T08:15:38Z  https://www.theguardian.com/football/2020/sep/...   
2301  2020-09-15T07:00:19Z  https://www.theguardian.com/football/blog/2020...   
2302  2020-09-16T21:14:34Z  https://www.theguardian.com/football/live/2020...   
2303  2020-09-16T21:12:19Z  https://www.theguardian.com/football/2020/sep/...   
2304  2020-09-16T19:07:48Z  https://www.theguardian.com/football/2020/sep/...   
2305  2020-09-16T18:26:22Z  https://www.theguardian.com/football/2020/sep/...   
2306  2020-09-16T17:51:57Z  https://www.theguardian.com/football/2020/sep/...   
2307  2020-09-16T15:02:17Z  https://www.theguardian.com/football/2020/sep/...   
2308  2020-09-16T14:57:06Z  https://www.theguardian.com/football/2020/sep/...   
2309  2020-09-16T14:18:17Z  https://www.theguardian.com/football/2020/sep/...   
2310  2020-09-16T12:22:08Z  https://www.theguardian.com/football/2020/sep/...   
2311  2020-09-16T11:48:11Z  

In [14]:
# Column count should now be one more than df's original six (the new flag).
df_sample.shape

(30, 7)

In [15]:
# Check the dtype pandas inferred for the flag column produced by apply().
df_sample['is_rumour_mill'].dtype

dtype('bool')

In [16]:
# Flag every article in the full frame: is_rumour_mill is called once per row.
df['is_rumour_mill'] = df.apply(is_rumour_mill, axis='columns')

In [17]:
# Row count must be unchanged; only the is_rumour_mill column was added.
df.shape

(3067, 7)

In [18]:
# Preview the first five rows, including the new is_rumour_mill column.
df.head()

Unnamed: 0,PubDate,URL,byline,headline,standfirst,body,is_rumour_mill
0,2020-04-01T20:16:54Z,https://www.theguardian.com/football/2020/apr/...,David Conn,Premier League tells PFA players will have to ...,Deferrals of wages not cuts wanted by football...,The Premier League and EFL have urged the foot...,False
1,2020-04-01T18:44:06Z,https://www.theguardian.com/football/blog/2020...,Paul MacInnes,Premier League clubs fail the smell test by fu...,Belts are being tightened across the UK but it...,Under the shadow of coronavirus new rules are ...,False
2,2020-04-01T15:57:46Z,https://www.theguardian.com/football/2020/apr/...,Simon Burnton,"The Premier League, pay cuts and the plight of...",Sign up now! Sign up now! Sign up now? Sign up...,HOT SHIITAKE There was consternation among Pre...,False
3,2020-04-01T15:13:23Z,https://www.theguardian.com/football/2020/apr/...,Suzanne Wrack,Women's Euro 2021 in England postponed by a ye...,Move widely anticipated amid coronavirus pande...,"The women’s European Championship, which was d...",False
4,2020-04-01T14:48:34Z,https://www.theguardian.com/football/2020/apr/...,Ben Fisher,Bournemouth manager Eddie Howe takes 'signific...,Three other senior employees doing the sameBou...,Eddie Howe has become the first Premier League...,False
