In [11]:
import json
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, timedelta


In [3]:
# Empty frame with the six per-article columns used throughout the notebook.
df = pd.DataFrame([], columns = ['PubDate', 'URL', 'byline', 'headline', 'standfirst', 'body'])


In [4]:

# Read one JSON file per day ('articles/YYYY-MM-DD.json') for the whole date
# window and flatten every article into one row of `df`.
start_date = date(2020, 4, 1)
end_date = date(2020, 10, 31)  # inclusive: the range below adds one day


def _html_to_text(fields, key):
    """Return the plain text of the HTML fragment fields[key], or "" if the key is absent."""
    if key not in fields:
        return ""
    return BeautifulSoup(fields[key], features='html.parser').get_text()


records = []
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = 'articles/' + datestr + '.json'

    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding when decoding the article text.
    with open(fname, encoding='utf-8') as f:
        data = json.load(f)

    for article in data:
        # Tolerate an article without a 'fields' object instead of raising
        # KeyError; every missing value becomes the empty string.
        fields = article.get('fields', {})
        records.append({
            'PubDate': article.get('webPublicationDate', ""),
            'URL': article.get('webUrl', ""),
            'headline': fields.get('headline', ""),
            'standfirst': _html_to_text(fields, 'standfirst'),
            'byline': fields.get('byline', ""),
            'body': _html_to_text(fields, 'body'),
        })

# Build the frame once from the collected records. Appending with
# df.loc[len(df.index)] re-allocates on every row (quadratic overall) and made
# the cell non-idempotent: re-running it appended every article a second time.
df = pd.DataFrame(records, columns=['PubDate', 'URL', 'byline', 'headline', 'standfirst', 'body'])


In [6]:
# Sanity check: (number of articles, number of columns).
df.shape

(3067, 6)

In [7]:
# Surname used for the exploratory searches.
# NOTE(review): no visible cell reads this variable — find_mentions hard-codes
# the same string — confirm whether it is still needed.
player_string = 'Havertz'


In [8]:
# Confirm the column names of the article frame.
df.columns


Index(['PubDate', 'URL', 'byline', 'headline', 'standfirst', 'body'], dtype='object')

In [49]:
# 100-row positional window (rows 2500..2599) for eyeballing the data.
df_sample = df.iloc[2500:2600]

In [10]:
# Display the sample.
# NOTE(review): the saved output shows rows 0-9 plus an 'Unnamed: 0' column,
# which does not match a df[2500:2600] slice — it looks like stale output from
# an earlier run; re-execute to confirm.
df_sample

Unnamed: 0,PubDate,URL,byline,headline,standfirst,body
0,2020-04-01T20:16:54Z,https://www.theguardian.com/football/2020/apr/...,David Conn,Premier League tells PFA players will have to ...,Deferrals of wages not cuts wanted by football...,The Premier League and EFL have urged the foot...
1,2020-04-01T18:44:06Z,https://www.theguardian.com/football/blog/2020...,Paul MacInnes,Premier League clubs fail the smell test by fu...,Belts are being tightened across the UK but it...,Under the shadow of coronavirus new rules are ...
2,2020-04-01T15:57:46Z,https://www.theguardian.com/football/2020/apr/...,Simon Burnton,"The Premier League, pay cuts and the plight of...",Sign up now! Sign up now! Sign up now? Sign up...,HOT SHIITAKE There was consternation among Pre...
3,2020-04-01T15:13:23Z,https://www.theguardian.com/football/2020/apr/...,Suzanne Wrack,Women's Euro 2021 in England postponed by a ye...,Move widely anticipated amid coronavirus pande...,"The women’s European Championship, which was d..."
4,2020-04-01T14:48:34Z,https://www.theguardian.com/football/2020/apr/...,Ben Fisher,Bournemouth manager Eddie Howe takes 'signific...,Three other senior employees doing the sameBou...,Eddie Howe has become the first Premier League...
5,2020-04-01T14:46:00Z,https://www.theguardian.com/football/2020/apr/...,Paul MacInnes and David Conn,Uefa puts all football on hold but could targe...,"International games, including Euro 2020 play-...",Uefa has called off June’s round of internatio...
6,2020-04-01T12:53:56Z,https://www.theguardian.com/football/2020/apr/...,Paul Campbell,Football quiz: Premier League managers when t...,Who did not enjoy fish? Who stopped the bus? W...,\n\n\n\n\nWhich current Premier League manager...
7,2020-04-01T12:41:28Z,https://www.theguardian.com/football/2020/apr/...,Guardian sport and PA Media,Premier League clubs accused of 'moral vacuum'...,Politicians angry at cuts to non-playing staff...,Premier League football has been accused of op...
8,2020-04-01T12:00:20Z,https://www.theguardian.com/football/2020/apr/...,Nick Ames,'Our players are afraid': Nicaraguan football ...,Diriangén are unhappy games continue in a nati...,"On Wednesday night the players of Diriangén, t..."
9,2020-04-01T10:00:18Z,https://www.theguardian.com/football/football-...,Ben Fisher,Hull's Angus MacDonald on surviving cancer: 'I...,"The defender, ready to return when football st...",“In my head that was me hanging my boots up – ...


In [15]:
# Body text of the article at position 2595, used as a scratch example below.
some_text = df['body'].iloc[2595]

In [16]:
# Print the example article body in full for manual inspection.
print(some_text)

There was a little over an hour until kick-off against Atlético Madrid on Wednesday evening and they hadn’t reached the ground yet, but Huesca’s players were already stretched and changed. One after another they came down the stairs or out the lift and into the foyer at a small hotel in the town centre. Outside, the bus had pulled up in front of the park, engine running. They strolled past the disinfected rug, the hand gel dispensers and out into the sunshine, climbing on board in full kit and trainers. When they got to Alcoraz, most went straight on the pitch. They weren’t allowed to go anywhere else.
A few minutes later Atlético arrived, Luis Suárez heading in wearing bright yellow football socks and shorts, a training top and a small rucksack, like a schoolboy going to a game. The scene is repeated across Spain, some changing into their boots on the grass. A couple of hours after Atlético turned up, 382km away Real Madrid’s squad were walking to their meeting with Valladolid togethe

[9061]


'Atletico'

In [21]:
# Character offsets of every occurrence of 'virus' in the example article.
virus_pattern = re.compile('virus')
v = [match.start() for match in virus_pattern.finditer(some_text)]


[1551, 2362, 7011]


In [63]:
# find_mentions = lambda row: [ m.start() for m in re.finditer('virus', row['body'])]

def find_mentions(row, i, player='Havertz', window=60):
    """Locate every occurrence of a player's name in an article body.

    Args:
        row: Mapping-like article record (e.g. a DataFrame row) whose 'body'
            entry is the article text.
        i: Identifier of the article (callers pass the DataFrame index); it is
            copied unchanged into every result tuple.
        player: Literal text to search for (not a regex). Defaults to 'Havertz'.
        window: Number of characters of context kept on each side of the
            start of the match. Defaults to 60.

    Returns:
        A list of ``(i, start, snippet)`` tuples, one per occurrence, where
        ``start`` is the character offset of the match in the body and
        ``snippet`` is the text from ``window`` characters before ``start`` to
        ``window`` characters after it. Empty list when there is no match.
    """
    body = row['body']
    mentions = []
    for m in re.finditer(re.escape(player), body):
        # Clamp at 0: a negative slice start counts from the END of the
        # string, so a match within the first `window` characters used to
        # produce an empty or unrelated snippet.
        lo = max(0, m.start() - window)
        mentions.append((i, m.start(), body[lo:m.start() + window]))
    return mentions

In [67]:
# Gather the mention tuples returned by find_mentions for every article in df,
# tagging each with the article's index label.
havertz_mentions = []
for idx, row in df.iterrows():
    havertz_mentions += find_mentions(row, idx)
    
# print(len(havertz_mentions))
# print(havertz_mentions)

In [66]:
# Quick check of the 'YYYY-MM-DD' text used in the article file names.
start_date = date(2020, 4, 1)
datestr = start_date.isoformat()  # for a date this is the same text as strftime('%Y-%m-%d')
print(datestr)

2020-04-01


In [68]:
def is_rumour_mill(row):
    """Return True when the article's 'URL' contains the text 'transfer-rumours'."""
    url = row['URL']
    return 'transfer-rumours' in url

In [69]:
# Flag the articles whose URL contains 'transfer-rumours'.
# Uses pandas' vectorised literal substring test instead of a Python-level
# row-by-row apply; the result is the same boolean column as long as every
# URL is a string (the loader stores "" when the URL is missing).
df['is_rumour_mill'] = df['URL'].str.contains('transfer-rumours', regex=False)
df.shape

(3067, 7)

In [70]:
# Confirm that 'is_rumour_mill' is now among the columns.
df.columns

Index(['PubDate', 'URL', 'byline', 'headline', 'standfirst', 'body',
       'is_rumour_mill'],
      dtype='object')

In [81]:
# First 100 rows of the article frame, by position.
data_sample = df.iloc[:100]

In [86]:
# Parse 'PubDate' and keep only the calendar date (time of day is dropped).
# NOTE: this overwrites the column in place, so the original timestamp strings
# are no longer available in df after this cell has run.
df['PubDate'] = pd.to_datetime(df['PubDate']).dt.date

In [98]:
# Articles per publication date: one row per distinct 'PubDate', with the
# number of rows for that date in 'counts'.
date_series = df.groupby('PubDate').size().reset_index(name='counts')


In [92]:
# Load the player lists from 'players.json'. Note that this rebinds `fname`,
# which the article-loading cell also uses.
fname = 'players.json'

with open(fname) as f:
    player_set = json.load(f)
    
player_set

{'bought': {'Kai Havertz': '2020-04-01',
  'Timo Werner': '2020-06-18',
  'Ben Chilwell': '2020-08-26',
  'Hakim Ziyech': '2020-02-13',
  'Edouard Mendy': '2020-09-24',
  'Thiago Silva': '2020-08-28'},
 'not bought': ['Sergio Reguilon',
  'Jadon Sancho',
  'Moussa Dembele',
  'Jan Oblak',
  'Kalidou Koulibaly',
  'Raphael Varane',
  'Nicolas Tagliafico',
  'Declan Rice',
  'Dean Henderson']}

In [151]:
def _last_name(full_name):
    """Return everything after the first word of `full_name`.

    'Kai Havertz' -> 'Havertz', 'Virgil van Dijk' -> 'van Dijk'. A single-word
    name is returned unchanged and an empty name yields "" (the previous
    `split()[1]` raised IndexError for both, and cut multi-word surnames
    down to their first word).
    """
    parts = full_name.split(maxsplit=1)
    return parts[-1] if parts else ""


# One record per player, keyed by full name:
#   'last_name'     - see _last_name
#   'transfer_date' - the date string from player_set['bought'], or False for
#                     players listed under 'not bought'
player_info = {}
for name, transfer_date in player_set['bought'].items():
    player_info[name] = {'last_name': _last_name(name), 'transfer_date': transfer_date}

for name in player_set['not bought']:
    player_info[name] = {'last_name': _last_name(name), 'transfer_date': False}

print(len(player_info))
player_info

15


{'Kai Havertz': {'last_name': 'Havertz', 'transfer_date': '2020-04-01'},
 'Timo Werner': {'last_name': 'Werner', 'transfer_date': '2020-06-18'},
 'Ben Chilwell': {'last_name': 'Chilwell', 'transfer_date': '2020-08-26'},
 'Hakim Ziyech': {'last_name': 'Ziyech', 'transfer_date': '2020-02-13'},
 'Edouard Mendy': {'last_name': 'Mendy', 'transfer_date': '2020-09-24'},
 'Thiago Silva': {'last_name': 'Silva', 'transfer_date': '2020-08-28'},
 'Sergio Reguilon': {'last_name': 'Reguilon', 'transfer_date': False},
 'Jadon Sancho': {'last_name': 'Sancho', 'transfer_date': False},
 'Moussa Dembele': {'last_name': 'Dembele', 'transfer_date': False},
 'Jan Oblak': {'last_name': 'Oblak', 'transfer_date': False},
 'Kalidou Koulibaly': {'last_name': 'Koulibaly', 'transfer_date': False},
 'Raphael Varane': {'last_name': 'Varane', 'transfer_date': False},
 'Nicolas Tagliafico': {'last_name': 'Tagliafico', 'transfer_date': False},
 'Declan Rice': {'last_name': 'Rice', 'transfer_date': False},
 'Dean Hender

In [150]:
# del player_info['Moussa Dembele']
# print(len(player_info))
# player_info

In [105]:
def matches_chelsea(player, row):
    """Return True when an article mentions both `player` and 'Chelsea'.

    The 'standfirst' and 'body' entries of `row` are both searched, using
    exact, case-sensitive substring matching.
    """
    searchable = (row['standfirst'], row['body'])
    mentions_player = any(player in text for text in searchable)
    mentions_club = any('Chelsea' in text for text in searchable)
    return mentions_player and mentions_club

def matches_chelsea_in_rumour_mill(player, row):
    """Return True when a rumour-mill article mentions both `player` and 'Chelsea'.

    Args:
        player: Text to look for (exact, case-sensitive substring).
        row: Mapping-like article record with 'standfirst', 'body' and
            'is_rumour_mill' entries.

    Returns:
        True only if `player` and 'Chelsea' each occur in the standfirst or
        the body AND row['is_rumour_mill'] is truthy; otherwise False.
    """
    # The intermediate results used to be assigned to misspelled names
    # (`mathches_player` / `mathches_club`) while the return statement read
    # the correctly spelled ones, so every call raised NameError.
    matches_player = player in row['standfirst'] or player in row['body']
    matches_club = 'Chelsea' in row['standfirst'] or 'Chelsea' in row['body']
    return bool(matches_player and matches_club and row['is_rumour_mill'])


In [131]:
# Row predicate for df.apply(..., axis=1).
# NOTE: the lambda looks up the global `player` each time it is CALLED, not
# when it is defined, so rebinding `player` later changes which name
# player_matches_chelsea searches for.
player = "Moussa Dembele"
player_matches_chelsea = lambda row: matches_chelsea(player, row)

In [132]:
# Articles for which player_matches_chelsea is True (current `player` plus 'Chelsea').
# NOTE(review): the saved output for this name is empty; the match is an exact
# substring test, so presumably the articles spell the name differently
# (e.g. with accents) — TODO confirm.
dembele_df = df[df.apply(player_matches_chelsea, axis=1)]

In [133]:
# Display the matching articles (the saved output shows only the header row).
dembele_df

Unnamed: 0,PubDate,URL,byline,headline,standfirst,body,is_rumour_mill


In [108]:
# havertz_df[:5]['body']

594      9.41pm BST   That’s it from me. Thanks for r...
661    Chelsea are close to signing Timo Werner after...
702    They arrived talking about the influence one y...
713    THE GERMAN CUP IS BACK, BABY!\nWho fancies a b...
715    The former Real Sociedad, Vissel Kobe and Real...
Name: body, dtype: object

In [111]:
# for i in range(5):
#     print()
#     print(havertz_df.iloc[i]['body'])

In [112]:
# NOTE(review): `havertz_df` is not assigned in any visible cell, so this line
# fails on a fresh kernel (Restart & Run All) unless it is defined elsewhere —
# confirm and restore the defining cell.
havertz_df.shape

(118, 7)

In [153]:
# For every tracked player, store under player_record['player df'] the
# articles whose standfirst or body contains the player's full name AND whose
# standfirst or body contains 'Chelsea' (exact, case-sensitive substrings).
#
# The masks are computed here directly instead of through the
# `player_matches_chelsea` lambda: that lambda only worked because it reads
# whatever the global `player` happens to be at call time, and it re-tested
# 'Chelsea' for every row of every player.
mentions_chelsea = (df['standfirst'].str.contains('Chelsea', regex=False)
                    | df['body'].str.contains('Chelsea', regex=False))  # loop-invariant
for key, player_record in player_info.items():
    player = key  # still rebound, as before, for any cell that reads the global
    mentions_player = (df['standfirst'].str.contains(player, regex=False)
                       | df['body'].str.contains(player, regex=False))
    player_df = df[mentions_player & mentions_chelsea]
    player_record['player df'] = player_df


In [154]:
# player_info['Moussa Dembele']
# (rows, columns) of the articles stored for this player.
player_info['Moussa Dembele']['player df'].shape

(0, 7)

In [146]:
# Print each tracked player with the number of articles stored for them.
for key, player_record in player_info.items():
    player = key
    n_articles = len(player_record['player df'])
    print(player, n_articles)


Kai Havertz 118
Timo Werner 136
Ben Chilwell 106
Hakim Ziyech 101
Edouard Mendy 24
Thiago Silva 69
Sergio Reguilon 5
Jadon Sancho 66
Moussa Dembele 0
Jan Oblak 15
Kalidou Koulibaly 22
Raphael Varane 0
Nicolas Tagliafico 0
Declan Rice 67
Dean Henderson 25


In [218]:
# Player list: only columns 10-12 of the CSV are read and its first three
# lines are skipped. The loop below expects those columns to be named 'Name',
# 'Club at Dec 20 2020' and 'Nationality'.
footballers = pd.read_csv("GuardianList.csv", usecols = [10, 11, 12], skiprows=3)

# Clubs for which a player's 'English Club' flag is set to True.
english_clubs = {
    "Manchester City",
    "Liverpool",
    "Tottenham Hotspur",
    "Chelsea",
    "Leicester City",
    "Arsenal",
    "Everton",
    "Manchester United",
    "Wolverhampton Wanderers",
}


def _text_or_blank(value):
    """Return a CSV cell as a stripped string; non-string cells (e.g. NaN) become ""."""
    return value.strip() if isinstance(value, str) else ""


# name -> {'Club', 'English Club', 'Nationality'}. Rows without a name are
# skipped. Values are stripped because at least one name in the saved output
# appears to carry a trailing space, which would make an exact-substring search
# miss occurrences followed by punctuation, and would keep a padded club name
# from matching english_clubs.
all_footballers_dict = {}
for d in footballers.to_dict(orient='records'):
    name = _text_or_blank(d['Name'])
    if not name:
        continue

    club = _text_or_blank(d['Club at Dec 20 2020'])
    all_footballers_dict[name] = {
        'Club': club,
        'English Club': club in english_clubs,
        'Nationality': _text_or_blank(d['Nationality']),
    }

print(len(all_footballers_dict))
all_footballers_dict

439


{'Robert Lewandowski': {'Club': 'Bayern Munich',
  'English Club': False,
  'Nationality': 'Poland'},
 'Lionel Messi': {'Club': 'Barcelona',
  'English Club': False,
  'Nationality': 'Argentina'},
 'Cristiano Ronaldo': {'Club': 'Juventus',
  'English Club': False,
  'Nationality': 'Portugal'},
 'Kevin De Bruyne': {'Club': 'Manchester City',
  'English Club': True,
  'Nationality': 'Belgium'},
 'Sadio Mané': {'Club': 'Liverpool',
  'English Club': True,
  'Nationality': 'Senegal'},
 'Erling Braut Haaland': {'Club': 'Borussia Dortmund',
  'English Club': False,
  'Nationality': 'Norway'},
 'Kylian Mbappé': {'Club': 'Paris St-Germain',
  'English Club': False,
  'Nationality': 'France'},
 'Mohamed Salah': {'Club': 'Liverpool',
  'English Club': True,
  'Nationality': 'Egypt'},
 'Neymar': {'Club': 'Paris St-Germain',
  'English Club': False,
  'Nationality': 'Brazil'},
 'Virgil van Dijk': {'Club': 'Liverpool',
  'English Club': True,
  'Nationality': 'Netherlands'},
 'Joshua Kimmich': {'Cl

In [219]:
# For every footballer, store under player_record['player df'] the articles
# whose standfirst or body contains the footballer's name AND whose standfirst
# or body contains 'Chelsea' (exact, case-sensitive substrings).
#
# The masks are computed here directly instead of through the
# `player_matches_chelsea` lambda: that lambda only worked because it reads
# whatever the global `player` happens to be at call time, and it re-tested
# 'Chelsea' for every row of every footballer.
mentions_chelsea = (df['standfirst'].str.contains('Chelsea', regex=False)
                    | df['body'].str.contains('Chelsea', regex=False))  # loop-invariant
for key, player_record in all_footballers_dict.items():
    player = key  # still rebound, as before, for any cell that reads the global
    mentions_player = (df['standfirst'].str.contains(player, regex=False)
                       | df['body'].str.contains(player, regex=False))
    player_df = df[mentions_player & mentions_chelsea]
    player_record['player df'] = player_df

In [214]:
# all_footballers_dict["Lionel Messi"]
# Spot-check one footballer's stored articles, then show how many footballers
# are in the dictionary.
print(all_footballers_dict["Wissam Ben Yedder"]['player df'])
len(all_footballers_dict)
# all_footballers_dict

         PubDate                                                URL  \
1661  2020-08-02  https://www.theguardian.com/football/2019/aug/...   

              byline                                           headline  \
1661  Guardian sport  Premier League, Bundesliga, La Liga and Serie ...   

                                             standfirst  \
1661  Keep tabs of the top scorers in Europe’s major...   

                                                   body  is_rumour_mill  
1661  Premier League 23 Jamie Vardy (Leicester) 22 P...           False  


439

In [216]:

# Print each footballer with the number of articles stored for them.
for key, player_record in all_footballers_dict.items():
    n_articles = len(player_record['player df'])
    print(key, n_articles)


Robert Lewandowski 21
Lionel Messi 55
Cristiano Ronaldo 35
Kevin De Bruyne 48
Sadio Mané 43
Erling Braut Haaland 4
Kylian Mbappé 20
Mohamed Salah 34
Neymar 28
Virgil van Dijk 50
Joshua Kimmich 11
Karim Benzema 8
Sergio Ramos 12
Thomas Müller 19
Manuel Neuer 11
Serge Gnabry 13
Thiago Alcântara 37
Trent Alexander-Arnold 42
Romelu Lukaku 17
Harry Kane 72
Bruno Fernandes 64
Son Heung-min 36
Alphonso Davies 10
Ciro Immobile 7
Zlatan Ibrahimovic 12
Alisson 43
Raheem Sterling 62
Jan Oblak 15
Jadon Sancho 66
Timo Werner 136
David Alaba 11
Alejandro ‘Papu’ Gomez 0
Jordan Henderson 59
Andrew Robertson 6
Jamie Vardy 59
Pierre-Emerick Aubameyang 62
João Félix 2
Angel Di María 0
Paulo Dybala 8
Leon Goretzka 5
Marcus Rashford 94
Toni Kroos 4
Roberto Firmino 28
Lautaro Martínez  5
Ansu Fati 4
Thibaut Courtois 2
Kingsley Coman 11
Marc-André ter Stegen 1
Marquinhos 3
Kai Havertz 118
Fabinho 38
Dayot Upamecano 7
Lucas Ocampos 3
Thiago Silva 69
N'Golo Kanté 4
Casemiro 4
Duván Zapata 1
Luis Suárez 27
Jack