## Import packages

In [151]:
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import datetime
import pandas as pd
import numpy as np
import csv
import re
from textblob import TextBlob
import string
import preprocessor as p
import os
import time
import nltk
from twitter_scrape import TweetMiner
from datetime import date
import warnings
warnings.filterwarnings('ignore')

## Tweets collection

In [152]:
miner = TweetMiner(result_limit = 999) #Mine tweet. 
mined_tweets = miner.mine_user_tweets(user='DHSCgovUK', max_pages=10)
mined_tweets_df= pd.DataFrame(mined_tweets)

## Data cleaning

In [153]:
virus_df = mined_tweets_df[mined_tweets_df['text'].str.contains("died")] # Only include coronavirus update  
virus_df = virus_df[['screen_name', 'created_at', 'text']].reset_index(drop=True) # Keep relevant columns. 
virus_df['date'] = virus_df.created_at.dt.strftime('%Y-%m-%d') # Get date. 
virus_df['time'] = virus_df.created_at.dt.strftime('%H:%M:%S') # Get time. 

In [154]:
update_df = virus_df.loc[(virus_df['date'] > '2020-04-05')] # Work on the latest format.
update_df['test_cum'] = update_df.text.str.findall('\d+(?:,\d+)?').str[2] # Toral number 
update_df['test_daily'] = update_df.text.str.findall('\d+(?:,\d+)?').str[3]
update_df['ppl_test_cum'] = update_df.text.str.findall('\d+(?:,\d+)?').str[5]
update_df['ppl_confirmed_cum'] = update_df.text.str.findall('\d+(?:,\d+)?').str[6]
update_df['death_cum'] = update_df.text.str.findall('\d+(?:,\d+)?').str[9]

In [169]:
vis_df = update_df.drop(columns=['text', 'created_at', 'time'])
vis_df= vis_df.stack().str.replace(',', '').unstack()
vis_df = vis_df.apply(pd.to_numeric, errors='ignore')
vis_df['ppl_tested_daily'] = vis_df.ppl_test_cum.diff(periods = -1).fillna(0).astype(np.int64)
vis_df['death_daily'] = vis_df.death_cum.diff(periods=-1).fillna(0).astype(np.int64)
vis_df['ppl_confirmed_daily'] = vis_df.ppl_confirmed_cum.diff(periods = -1).fillna(0).astype(np.int64)
vis_df['ppl_percentage_daily'] = vis_df['ppl_confirmed_daily']/vis_df['ppl_tested_daily']
vis_df['death_rate'] = vis_df['death_cum']/vis_df['ppl_confirmed_cum']
vis_df['ppl_confirmed_change'] = vis_df.ppl_confirmed_daily.pct_change(periods=-1)
vis_df['death_change'] = vis_df.death_daily.pct_change(periods=-1)

In [170]:
today = vis_df.date.iloc[0]
yesterday = vis_df.date.iloc[1]
ppl_test_cum = format(vis_df.ppl_test_cum.iloc[0], ',')
ppl_confirmed_daily = format(vis_df.ppl_confirmed_daily.iloc[0], ',')
ppl_tested_daily = format(vis_df.ppl_tested_daily.iloc[0], ',')
yesterday_cases = format(vis_df.ppl_confirmed_daily.iloc[1], ',')
ppl_confirmed_change = round(vis_df.ppl_confirmed_change.iloc[0]*100, 1)
death_daily = format(vis_df.death_daily.iloc[0], ',')
death_rate = round(vis_df.death_rate.iloc[0]*100, 1)
death_cum = format(vis_df.death_cum.iloc[0], ',')
death_change = round(vis_df.death_change.iloc[0]*100, 1)

In [171]:
vis_df

Unnamed: 0,screen_name,date,test_cum,test_daily,ppl_test_cum,ppl_confirmed_cum,death_cum,ppl_tested_daily,death_daily,ppl_confirmed_daily,ppl_percentage_daily,death_rate,ppl_confirmed_change,death_change
0,DHSCgovuk,2020-04-16,417649,18665,327608,103093,13729,13839,861,4617,0.333622,0.133171,0.003041,0.131406
1,DHSCgovuk,2020-04-15,398916,15994,313769,98476,12868,11170,761,4603,0.412086,0.130671,-0.123572,-0.021851
2,DHSCgovuk,2020-04-14,382650,14982,302599,93873,12107,11879,778,5252,0.442125,0.128972,0.209581,0.085077
3,DHSCgovuk,2020-04-13,367667,14506,290720,88621,11329,8346,717,4342,0.520249,0.127837,-0.178896,-0.027137
4,DHSCgovuk,2020-04-12,352974,18000,282374,84279,10612,12776,737,5288,0.413901,0.125915,0.01051,-0.196292
5,DHSCgovuk,2020-04-11,334974,18091,269598,78991,9875,12993,917,5233,0.402755,0.125014,-0.397189,-0.064286
6,DHSCgovuk,2020-04-10,316836,19116,256605,73758,8958,13184,980,8681,0.65845,0.121451,0.998389,0.112372
7,DHSCgovuk,2020-04-09,298169,16095,243421,65077,7978,10713,881,4344,0.405489,0.122593,-0.208887,-0.060768
8,DHSCgovuk,2020-04-08,282074,14682,232708,60733,7097,19527,938,5491,0.2812,0.116856,0.511007,0.193384
9,DHSCgovuk,2020-04-07,266694,14006,213181,55242,6159,4344,786,3634,0.836556,0.111491,inf,inf


In [177]:
print ('On {0}, {1} people are tested.'.format(today, ppl_tested_daily))
print('Cumulatively, {0} people have been tested of which {1} tested positive.'.format(ppl_test_cum, ppl_confirmed_daily))
print ('There is a {0}% change, comparing to {1} cases yesterday.'.format(ppl_confirmed_change, yesterday_cases))
print ('Death toll increased by {0} to {1}, a {2}% change. The death rate is {3}%.'.format(death_daily, death_cum, death_change,death_rate))

On 2020-04-16, 13,839 people are tested.
Cumulatively, 327,608 people have been tested of which 4,617 tested positive.
There is a 0.3% change, comparing to 4,603 cases yesterday.
Death toll increased by 861 to 13,729, a 13.1% change. The death rate is 13.3%.


In [179]:
death_df = vis_df[['death_cum', 'death_rate', 'death_daily', 'death_change', ]]

In [None]:
death_df.