## Import packages

In [2]:
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import datetime
import pandas as pd
import numpy as np
import csv
import re
from textblob import TextBlob
import schedule
import string
import preprocessor as p
import os
import time
import nltk
import matplotlib
from twitter_scrape import TweetMiner
from datetime import date
import warnings
import requests

warnings.filterwarnings('ignore')

## Tweets collection

In [3]:
miner = TweetMiner(result_limit = 500) #Mine tweet. 
uk_tweets = miner.mine_user_tweets(user='DHSCgovUK', max_pages=10)

In [4]:
def get_tweets_dataframe(mined_tweets):
    """Take mined tweets as args, return a tweets dataframe."""
    tweets_df= pd.DataFrame(mined_tweets)
    tweets_df = tweets_df[tweets_df['text'].str.contains("died")] # Only include coronavirus update  
    tweets_df = tweets_df[['screen_name', 'created_at', 'text']].reset_index(drop=True) # Keep relevant columns 
    tweets_df['date'] = tweets_df.created_at.dt.strftime('%Y-%m-%d') 
    tweets_df['time'] = tweets_df.created_at.dt.strftime('%H:%M:%S')
    tweets_df = tweets_df.drop(columns=['created_at'])
    return tweets_df
uk_df = get_tweets_dataframe(uk_tweets)
uk_df

Unnamed: 0,screen_name,text,date,time
0,DHSCgovuk,"As of 9am 1 May, there have been 1,023,824 tes...",2020-05-01,16:15:02
1,DHSCgovuk,"As of 9am 30 April, there have been 901,905 te...",2020-04-30,17:57:49
2,DHSCgovuk,"As of 9am 29 April, there have been 818,539 te...",2020-04-29,17:08:02
3,DHSCgovuk,"As of 9am 28 April, there have been 763,387 te...",2020-04-28,18:50:05
4,DHSCgovuk,"As of 9am 27 April, there have been 719,910 te...",2020-04-27,18:07:03
...,...,...,...,...
62,DHSCgovuk,. @CMO_England has confirmed a third patient i...,2020-03-08,21:13:58
63,DHSCgovuk,UPDATE on coronavirus (#COVID19) testing in th...,2020-03-08,14:00:01
64,DHSCgovuk,UPDATE on coronavirus (#COVID19) testing in th...,2020-03-07,14:37:32
65,DHSCgovuk,.@CMO_England has confirmed a second patient i...,2020-03-06,21:40:13


## Data cleaning

In [6]:
number_regex = '\d+(?:,\d+)*'
# number_regex = '[0-9]{1,3}(,[0-9]{3})*(\.[0-9]+)?'
url_regex = '(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+'
def get_info_dataframe(tweets_df, number_regex, url_regex): 
    """Take tweets dataframe as args, return a dataframe with extracted info."""
    tweets_df = tweets_df[tweets_df['text'].str.contains("died")] # Only include coronavirus update  
    info_df = tweets_df.loc[(tweets_df['date'] > '2020-04-05')]# Only include tweets with latest info
    info_df['test_cum'] = info_df.text.str.findall(number_regex).str[2] # Cumulative test 
    info_df['test_daily'] = info_df.text.str.findall(number_regex).str[3] # Daily test
    info_df['ppl_test_cum'] = info_df.text.str.findall(number_regex).str[5] # Cumulative people
    info_df['ppl_confirmed_cum'] = info_df.text.str.findall(number_regex).str[6] # Cumnulative confirmed
    info_df['death_cum'] = info_df.text.str.findall(number_regex).str[9] # Cumulative death
    info_df['url'] = info_df.text.str.findall(url_regex).apply(''.join) # Source url
    info_df.dropna(inplace = True) # Filter out tweets that are not the updates
    return info_df
info_df = get_info_dataframe(uk_df, number_regex, url_regex)

Unnamed: 0,screen_name,text,date,time,test_cum,test_daily,ppl_test_cum,ppl_confirmed_cum,death_cum,url
0,DHSCgovuk,"As of 9am 1 May, there have been 1,023,824 tes...",2020-05-01,16:15:02,1023824,122347,762279,177454,27510,https://t.co/cbZ2M02TqS
1,DHSCgovuk,"As of 9am 30 April, there have been 901,905 te...",2020-04-30,17:57:49,901905,81611,687369,171253,26771,https://t.co/thSbLqfexF
2,DHSCgovuk,"As of 9am 29 April, there have been 818,539 te...",2020-04-29,17:08:02,818539,52429,632794,165221,26097,https://t.co/Qw1GB5s3Wc
3,DHSCgovuk,"As of 9am 28 April, there have been 763,387 te...",2020-04-28,18:50:05,763387,43563,599339,161145,21678,https://t.co/eZ9lecAlpg
4,DHSCgovuk,"As of 9am 27 April, there have been 719,910 te...",2020-04-27,18:07:03,719910,37024,569768,157149,21092,https://t.co/pZ73hu9GFJ
6,DHSCgovuk,"As of 9am 26 April, 669,850 tests have conclud...",2020-04-26,15:57:48,669850,29058,543413,152840,20732,https://t.co/3itql8uBPb
8,DHSCgovuk,"As of 9am 25 April, 640,792 tests have conclud...",2020-04-25,14:19:34,640792,28760,517836,148377,20319,https://t.co/5HLhOFWdlu
9,DHSCgovuk,"As of 9am 24 April, 612,031 tests have conclud...",2020-04-24,14:13:49,612031,28532,444222,143464,19506,https://t.co/ixQBaugnGh
10,DHSCgovuk,"As of 9am 23 April, 583,496 tests have conclud...",2020-04-23,13:56:11,583496,23560,425821,138078,18738,https://t.co/0quyQMCheo
11,DHSCgovuk,"As of 9am 22 April, 559,935 tests have conclud...",2020-04-22,13:47:57,559935,22814,411192,133495,18100,https://t.co/gFFpwZe1gl


In [90]:
def get_numeric_dataframe(df): 
    """Take dataframe as args, return a dataframe in numeric format."""
    df = df.stack().str.replace(',', '').unstack() # Convert numbers in number format
    df = df.apply(pd.to_numeric, errors='ignore') 
    return df
info_df = get_numeric_dataframe(info_df)

Unnamed: 0,screen_name,text,date,time,test_cum,test_daily,ppl_test_cum,ppl_confirmed_cum,death_cum,url
0,DHSCgovuk,As of 9am 1 May there have been 1023824 tests ...,2020-05-01,16:15:02,1023824,122347,762279,177454,27510,https://t.co/cbZ2M02TqS
1,DHSCgovuk,As of 9am 30 April there have been 901905 test...,2020-04-30,17:57:49,901905,81611,687369,171253,26771,https://t.co/thSbLqfexF
2,DHSCgovuk,As of 9am 29 April there have been 818539 test...,2020-04-29,17:08:02,818539,52429,632794,165221,26097,https://t.co/Qw1GB5s3Wc
3,DHSCgovuk,As of 9am 28 April there have been 763387 test...,2020-04-28,18:50:05,763387,43563,599339,161145,21678,https://t.co/eZ9lecAlpg
4,DHSCgovuk,As of 9am 27 April there have been 719910 test...,2020-04-27,18:07:03,719910,37024,569768,157149,21092,https://t.co/pZ73hu9GFJ
6,DHSCgovuk,As of 9am 26 April 669850 tests have concluded...,2020-04-26,15:57:48,669850,29058,543413,152840,20732,https://t.co/3itql8uBPb
8,DHSCgovuk,As of 9am 25 April 640792 tests have concluded...,2020-04-25,14:19:34,640792,28760,517836,148377,20319,https://t.co/5HLhOFWdlu
9,DHSCgovuk,As of 9am 24 April 612031 tests have concluded...,2020-04-24,14:13:49,612031,28532,444222,143464,19506,https://t.co/ixQBaugnGh
10,DHSCgovuk,As of 9am 23 April 583496 tests have concluded...,2020-04-23,13:56:11,583496,23560,425821,138078,18738,https://t.co/0quyQMCheo
11,DHSCgovuk,As of 9am 22 April 559935 tests have concluded...,2020-04-22,13:47:57,559935,22814,411192,133495,18100,https://t.co/gFFpwZe1gl


In [80]:
def get_daily_df(info_df):
    """Take extracted info dataframe as args, return a dataframe with daily stats."""
    info_df['ppl_tested_daily'] = info_df.ppl_test_cum.diff(periods = -1).fillna(0).astype(np.int64)
    info_df['death_daily'] = info_df.death_cum.diff(periods=-1).fillna(0).astype(np.int64)
    info_df['ppl_confirmed_daily'] = info_df.ppl_confirmed_cum.diff(periods = -1).fillna(0).astype(np.int64)
    info_df['ppl_percentage_daily'] = info_df['ppl_confirmed_daily']/info_df['ppl_tested_daily']
    info_df['ppl_confirmed_case_change'] = info_df.ppl_confirmed_daily.diff(periods = -1).fillna(0).astype(np.int64)
    info_df['ppl_death_change_number'] = info_df.death_daily.diff(periods = -1).fillna(0).astype(np.int64)
    info_df['ppl_confirmed_change'] = info_df.ppl_confirmed_daily.pct_change(periods=-1)
    info_df['death_change'] = info_df.death_daily.pct_change(periods=-1)
    info_df['ppl_tested_change'] = info_df.ppl_tested_daily.pct_change(periods=-1)
    info_df['ppl_confirmed_rate'] = info_df['ppl_confirmed_daily']/info_df['ppl_tested_daily']
    daily_df = info_df.drop(columns=['test_cum', 'ppl_test_cum', 'ppl_confirmed_cum', 'death_cum'])
    return daily_df
daily_df = get_daily_df(info_df)

In [81]:
def get_cum_df(info_df):
    """Take extracted info dataframe as args, return a dataframe with cumulative stats."""
    info_df['death_rate'] = info_df['death_cum']/info_df['ppl_confirmed_cum']
    cum_df = info_df[['test_cum', 'ppl_test_cum', 'ppl_confirmed_cum','death_cum', 'death_rate','url']]
    return cum_df
cum_df = get_cum_df(info_df)

In [82]:
# Varaibles for daily mesage
latest_date = daily_df.date.iloc[0] 
ppl_tested_today = format(daily_df.ppl_tested_daily.iloc[0], ',')
ppl_tested_change = round(daily_df.ppl_tested_change.iloc[0]*100, 1)
ppl_confirmed_case_change = format(daily_df.ppl_confirmed_case_change.iloc[0], ',')
ppl_confirmed_change = round(daily_df.ppl_confirmed_change.iloc[0]*100, 1)
ppl_confirmed_today = format(daily_df.ppl_confirmed_daily.iloc[0], ',')
ppl_confirmed_yesterday = format(daily_df.ppl_confirmed_daily.iloc[1], ',')
ppl_confirmed_rate = round(daily_df.ppl_confirmed_rate.iloc[0]*100, 1)
death_case_change = format(daily_df.ppl_death_change_number.iloc[0], ',')
death_change = round(daily_df.death_change.iloc[0]*100, 1)
death_today =format(daily_df.death_daily.iloc[0], ',')
death_yesterday =format(daily_df.death_daily.iloc[1], ',')

# Varaibles for cumulative message
ppl_test_cum = format(cum_df.ppl_test_cum.iloc[0], ',')
ppl_confirmed_cum = format(cum_df.ppl_confirmed_cum.iloc[0], ',')
death_cum = format(cum_df.death_cum.iloc[0], ',')
death_rate = round(cum_df.death_rate.iloc[0]*100, 1)
url = cum_df.url.iloc[0]




In [84]:
daily_testing = 'On {0}, {1} people are tested, representing a {2}% change. '.format(latest_date, \
                ppl_tested_today, ppl_tested_change)
daily_confirmed = 'People who have been tested positive today changed by {0} ({1}%) to {2} (yesterday: {3}). The positive rate is {4}%. '.format(ppl_confirmed_case_change, \
                ppl_confirmed_change, ppl_confirmed_today, ppl_confirmed_yesterday, ppl_confirmed_rate)
daily_today = 'Death toll today changed by {0} ({1}%) to {2} (yesterday: {3}).'.format(death_case_change,
                death_change, death_today, death_yesterday)
cum_confirmed = 'Cumulatively, {0} people are tested, of which {1} are tested positive. '.format(ppl_test_cum, ppl_confirmed_cum)
cum_death = 'The death toll is {0} and the death rate is {1}%. {2}'.format(death_cum, death_rate, url)
daily_msg = daily_testing + daily_confirmed + daily_today
cum_msg = cum_confirmed + cum_death
daily_msg

'On 2020-05-01, 74,910 people are tested, representing a 37.3% change. People who have been tested positive today changed by 169 (2.8%) to 6,201 (yesterday: 6,032). The positive rate is 8.3%. Death toll today changed by 65 (9.6%) to 739 (yesterday: 674).'

In [85]:
daily_testing = 'On {0}, 73,191 people are tested, representing a 34.1% change. '.format(latest_date)
daily_confirmed = 'People who have been tested positive today changed by 169 (2.8%) to 6,201 (yesterday: 6032). The positive rate is 8.5%. '
daily_today = 'Death toll today changed by 65 (9.6%) to 739 (yesterday: 674).'
cum_confirmed = 'Cumulatively, 762,279 people are tested, of which 177,454 are tested positive. '
cum_death = 'The death toll is 27,510 and the death rate is 15.5%. {0}'.format(url)
daily_msg = daily_testing + daily_confirmed + daily_today
cum_msg = cum_confirmed + cum_death

In [87]:
daily_msg

'On 2020-05-01, 73,191 people are tested, representing a 34.1% change. People who have been tested positive today changed by 169 (2.8%) to 6,201 (yesterday: 6032). The positive rate is 8.5%. Death toll today changed by 65 (9.6%) to 739 (yesterday: 674).'

In [153]:
# Get the lowest in x days for both death toll and confirmed cases
# daily_df.dropna(inplace = True)
# daily_df['death_rank'] = daily_df.death_daily.rank()
# if daily_df.death_rank.iloc[0] == 1:
#     death_lowest = daily_df.death_rank.max()
# else:
#     death_second = daily_df.death_rank.iloc[0] - 1 
#     death_second_lowest = daily_df.loc[daily_df['death_rank'] == death_second].date.iloc[0]
# death_lowest_date

'2020-04-09'

In [47]:
target_channel = '@globalcoronavirusupdates'
def telegram_bot_sendtext(bot_message, target_channel):
        bot_token = '1157266597:AAEAEJN67IFaAHuCpl43dqCg_GlBq2xKPyo'
        bot_chatID = target_channel
        send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage?chat_id=' + bot_chatID + '&parse_mode=Markdown&text=' + bot_message
        response = requests.get(send_text)


In [41]:
output_df = pd.read_csv('/Users/linusnhh/Desktop/local/Python/twitter/project_tweet/output/uk_coronavirus_stats.csv')
if (date.today().strftime('%Y-%m-%d') == latest_date) == False:
    print ('Official data has not been updated yet')
if (date.today().strftime('%Y-%m-%d') != output_df.date.iloc[0]) == True: 
    print ('Latest data has already been written.')
if date.today().strftime('%Y-%m-%d') == latest_date and date.today().strftime('%Y-%m-%d') != output_df.date.iloc[0]:
    print('Sending message...')
    telegram_bot_sendtext(daily_msg, target_channel)
    telegram_bot_sendtext(cum_msg, target_channel)
    daily_df.to_csv('/Users/linusnhh/Desktop/local/Python/twitter/project_tweet/output/uk_coronavirus_stats.csv', index = False)
    print('Data has been updated.')

Official data has not been updated yet
Latest data has already been written.


In [48]:
telegram_bot_sendtext(daily_msg, target_channel)
telegram_bot_sendtext(cum_msg, target_channel)
# daily_df.to_csv('/Users/linusnhh/Desktop/local/Python/twitter/project_tweet/output/uk_coronavirus_stats.csv', index = False)

In [92]:
daily_df.to_csv('/Users/linusnhh/Desktop/local/Python/twitter/project_tweet/output/uk_coronavirus_stats.csv', index = False)