In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **PART 1**

1. Counting the total number of tweets, describing how I dealt with duplicates or other anomalies in the data set.
2. Plotting a time-series of the number of tweets by day using the whole dataset and comment on what you see.
3. Using a box and whisker diagram, compare the average number of tweets on the weekdays in the dataset to the numbers for weekend days. Are there statistically significant differences between the number of tweets on weekdays and weekends? 
4. Plot the average number of tweets at each hour of the day for weekdays and weekends and comment. You should have two plots where the x-axis is time of day (from midnight to midnight) and the y-axis shows the number of tweets.

# Data Extraction and reduction:
The Twitter data files contain a lot of data that is not needed to complete the tasks. To make the data usable within the available RAM (16 GB), only the required fields are extracted into smaller JSON files. Once extracted, these smaller files can be loaded into a dataframe for further analysis. 

Each ID has been taken as a unique tweet, this will consider:
* Same tweets done by the same user multiple times, as it still shows the activity.
* Same tweets done by the different user, it could be bots or avid followers, which again shows the real world activity.
* Retweets. That is, the same tweet echoed by different users. 

Also below considerations have been taken in account:

* The tweets which are considered ***duplicates*** are the data lines with same IDs, which will be removed in the following steps.
* And the tweets with no ID at all have not been included in the count and have been considered as ***anomalies*** in the data. 

## Task 1.2: Total number of tweets 
The smaller jsons have been downloaded and now will be used to make a dataframe which can be used to perform analysis. The first step of the analysis would be to calculate the number of tweets, before it can be done the ***duplicates*** with same ID will be removed. 

In [None]:
# Full path of every raw tweet file under the dataset directory, in sorted order.
paths = sorted(
    os.path.join(directory, file)
    for directory, _, files in os.walk('../input/twitter-task')
    for file in files
)

# Data for Task 1
1. ID
2. Created at
3. language

In [None]:
import json

# Reduce every raw line-delimited JSON file to the three fields Task 1 needs
# (tweet ID, creation time, language) and write one small JSON file per input.
for i, path in enumerate(paths, start=1):
    data = {'id': [], 'created_at': [], 'tweet_lang': []}

    with open(path) as try_file:
        for line in try_file:
            if not line.strip():
                continue  # tolerate blank lines between records
            jsonfile = json.loads(line)

            # Look for the ID among the record's own top-level keys. A substring
            # test on the raw line also matches an 'id_str' nested inside another
            # object, after which the lookup below would raise KeyError.
            if 'id_str' not in jsonfile:
                continue  # records without an ID are the anomalies left out of the count

            data['id'].append(jsonfile['id_str'])
            data['created_at'].append(jsonfile['created_at'])
            data['tweet_lang'].append(jsonfile['lang'])

    # Task 1 output gets its own 'jsonq1' prefix: it used to be written as
    # 'jsonq3<i>.json', the exact names the Task 3 extraction writes as well,
    # so one run silently overwrote the other.
    name = './jsonq1' + str(i) + '.json'
    df1 = pd.DataFrame(data)
    df1.to_json(name, orient='records', date_format='iso')


In [10]:
# Collect the reduced Task 1 JSON files, skipping the unrelated 'random.csv'.
rjpaths = sorted(
    os.path.join(directory, file)
    for directory, _, files in os.walk('../input/q1-json')
    for file in files
    if file != 'random.csv'
)

# Read every file, then concatenate once. DataFrame.append re-copied the whole
# accumulated frame on each iteration and no longer exists in pandas 2.0.
# convert_dates=False leaves 'created_at' unparsed; later cells slice it as text.
frames = [pd.read_json(rjpath, orient='records', convert_dates=False) for rjpath in rjpaths]
df = pd.concat(frames) if frames else pd.DataFrame()
del frames

In [5]:
# Load one CSV extract that still carries the tweet text. Every listed column is
# requested as a string; parse_dates additionally asks for 'created_at' to be parsed.
string_columns = ['id', 'created_at', 'text', 'tweet_lang',
                  'tweet_coordinates', 'country', 'username', 'user_mentions']
df1 = pd.read_csv(
    '../input/csvwithtext/csvwtxt10.csv',
    index_col=0,
    parse_dates=['created_at'],
    infer_datetime_format=True,
    dtype=dict.fromkeys(string_columns, 'str'),
)
df1

In [7]:
# Keep the first row seen for each tweet ID, then show (rows, columns).
df1 = df1.drop_duplicates(subset='id', keep='first')
df1.shape

In [11]:
# Store IDs as strings so duplicates are matched on their exact text.
df['id'] = df['id'].astype(str)

In [12]:
# (rows, columns) of the combined frame before duplicate IDs are removed.
df.shape

In [13]:
# Drop rows that repeat an ID (the first occurrence is kept); the row count
# shown afterwards is the total number of tweets.
df = df.drop_duplicates(subset='id', keep='first')
df.shape

***Total number of tweets:*** After removing 22219 duplicate IDs the final number of tweets came out to be ***24780899***. 

# Task 1.3: Tweets by day
The dataframe we have contains columns we do not need and so selective columns will be taken to analyse the per day tweet count. Once that is done we can count tweets per day grouping dataframe columns against "created_at" column.

In [93]:
# The first ten characters of 'created_at' are "<weekday> <month> <day>"
# (later cells compare them with values such as 'Sun' and 'Mar 17').
df['date'] = df['created_at'].astype(str).str.slice(stop=10)
dd = df['date'].value_counts()

# One row per day: the day label, its tweet count and the three-letter weekday.
dfplot = dd.rename_axis('date').to_frame(name='count').reset_index()
dfplot['day'] = dfplot['date'].astype(str).str.slice(stop=3)
dfplot

In [94]:
# Label each day 'Weekend' (Sat/Sun) or 'Weekday' (anything else).
dayofweek = ['Weekend' if day in ('Sun', 'Sat') else 'Weekday' for day in dfplot['day']]
dfplot['dow'] = dayofweek
dfplot

In [100]:
# Cut the leading weekday ("Sun ") off the label, then sort the rows by what is left.
# NOTE(review): not idempotent - re-running this cell slices four more characters off.
dfplot['date'] = dfplot['date'].astype(str).str.slice(start=4)
dfplot = dfplot.sort_values('date').reset_index(drop=True)


In [102]:
# Display the per-day table transposed (one column per day).
dfplot.T

In [103]:
# Line chart of tweets per day, with the mean daily count drawn as a red reference line.
plot = dfplot.reset_index().plot.line(
    x='date',
    y='count',
    marker='o',
    figsize=(20, 7),
    x_compat=True,
    title='Daily tweet count in Europe for March 2020',
)
ylabel = plot.set_ylabel('tweet count')
daily_mean = dfplot['count'].mean()
meanine = plot.axhline(y=daily_mean, color='r', linestyle='-', label='average tweets')
#plt.savefig('#Daily_tweet_count_Europe_March2020.png', bbox_inches='tight')

***Comments*:** The number of tweets increases after the first 10 days of the month, and the last 18 days all have an above-average number of tweets. The tipping point of the increase seems to be 11th March. 

# Task 1.4: Tweets by language.
Finding number of tweets by language will require the same approach as finding the tweets by date. This time the group by action will be performed on 'tweet_lang' column.

Language info: According to twitter documentation twitter recognises 70 languages. These languages will be mapped against their encoder and will be translated to human recognisable words. Also the encoder 'und' is used for tweets which are not understandable or just contain symbols, emojis or tags. 

***BCP47*** library will be imported to translate language names from the encoded tags.

In [106]:
# Number of tweets per language tag, as a two-column frame.
# The rename targets the count column labelled 0 after reset_index.
# NOTE(review): on pandas versions that already name that column 'count' the rename is a no-op.
dflang = (
    df.value_counts('tweet_lang')
    .reset_index()
    .rename(columns={0: 'count'})
)

In [112]:
# %pip (rather than "! pip") installs into the environment of the running kernel.
%pip install bcp47

In [114]:
import bcp47

# Tags the bcp47 library does not resolve; the names were added manually by
# referring to the Twitter API guide.
twitter_only_tags = {
    'ckb': 'Sorani Kurdish',
    'ht': 'Haitian Creole',
    'in': 'Indonesian',
    'iw': 'Hebrew',
    'tl': 'Tagalog',
    'und': 'Undetermined',
}


def decode_language(tag):
    """Return a readable name for a language tag.

    bcp47.tags is tried first, then the manual names above; a tag known to
    neither is returned unchanged so it stays visible in the table.
    """
    name = bcp47.tags.get(tag)
    if name is None:
        name = twitter_only_tags.get(tag, tag)
    return name


# A dictionary lookup replaces langlist.index(tag), which raised ValueError
# whenever one of the manually handled tags did not occur in the data.
langlist = [decode_language(tag) for tag in dflang['tweet_lang']]
dflang['decoded_lang'] = langlist

In [115]:
# Display the per-language counts together with the decoded language names.
dflang

In [117]:

# Bar chart of the tweet count for every decoded language.
langplot = dflang.plot.bar(
    x='decoded_lang',
    y='count',
    title='Popular languages on twitter in Europe',
    figsize=[17, 8],
)
#plt.savefig('#Languages_used_bargraph.png', bbox_inches='tight')

***Comments***: A few languages are used much more than the others. Looking at the bar chart, one can tell that the geography in question is Europe, as all the clearly dominant languages are European. English, not surprisingly, is the most common language in the tweets. Tweets with the language tag 'und' ('undetermined') are also fairly common, which suggests that a large number of people tweet only emojis or tags.   

# Task 1.5: Number of tweets during weekdays vs weekends 



In [118]:
# Quick pandas box plot of the daily tweet counts, one box per 'dow' group.
dfplot.boxplot(column=['count'], by= 'dow')

In [121]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the font scale BEFORE the figure is created. It used to be set after
# the box plot was drawn, so only the title (added later) picked it up while
# the axis and tick labels kept the old size.
sns.set(font_scale=2)

# Distribution of daily tweet counts, one box for weekdays and one for weekends.
fig, ax = plt.subplots(figsize=[20,10])
sns.boxplot(x="dow", y="count", data=dfplot, width=0.3, ax=ax)
ax.set_title('Tweet count distribution on weekdays and weekends')
fig.savefig('#weekdayweekendDistribution.png', bbox_inches='tight')

***Comments:*** People seem to tweet more during the weekends.

# Task 1.6: Finding tweets by hour to determine busiest and least busy.

In [123]:
# Characters 11-12 of 'created_at' are taken as the two-digit hour of the tweet.
df['hour'] = df['created_at'].astype(str).str.slice(11, 13)

In [134]:
# Total number of tweets recorded for each hour value.
df_hour = df.value_counts('hour').to_frame(name='tweet_count')

In [135]:
# Turn 'hour' back into a column and order the rows by it, with a clean 0..n-1 index.
df_hour = (
    df_hour.reset_index()
    .sort_values('hour')
    .reset_index(drop=True)
)
df_hour

In [136]:
# Average per day: the totals are divided by 31, the number of days in March
# (the month named in the plot titles).
DAYS_IN_MARCH = 31
df_hour['average_tweet_per_hour'] = df_hour['tweet_count'] / DAYS_IN_MARCH
df_hour

In [137]:
# Quick pandas bar chart of the average tweet count for each hour.
df_hour.plot.bar(x='hour', y='average_tweet_per_hour')

In [138]:
# Seaborn bar chart of the average tweet count per hour, saved as a PNG.
fig, ax = plt.subplots(figsize=[20, 10])
sns.barplot(x="hour", y="average_tweet_per_hour", data=df_hour, ax=ax)
ax.set_title('Average Tweet count distribution per hour for March 2020')
fig.savefig('#hourlyDistribution.png', bbox_inches='tight')

***Comments***: The number of tweets per hour peaks during the evening, between 5 pm and 10 pm. This aligns with the daily routine of working men and women: most tweets are posted after office hours. We can also observe a steep dip in the count during the small hours, which indicates the obvious: the majority of users are asleep.

In [141]:
# Release the Part 1 frames to free RAM before the next part starts.
del df, df_hour, dfplot, dflang

# Part 2. Mapping 
1. Drawing a map of Europe showing the location of the GPS-tagged tweets - these are tweets which have a “coordinates” field in the metadata. 
2. Explaining any patterns I observed.


**Data for task 2**

* ID
* Coordinates


In [None]:
import json
import numpy as np
import pandas as pd
import os

# Reduce every raw file to the GPS-tagged tweets only, keeping the ID, the
# coordinate pair and, where a place is attached, its country name and code.
for i, path in enumerate(paths, start=1):
    data = {'id': [], 'tweet_coordinates': [], 'country': [], 'country_code': []}

    with open(path) as try_file:
        for line in try_file:
            if not line.strip():
                continue  # tolerate blank lines between records
            jsonfile = json.loads(line)

            # Inspect the parsed record, not the raw text: a nested 'id_str'
            # satisfied the old substring test, and a record without a
            # 'coordinates' key raised KeyError instead of being skipped.
            geo = jsonfile.get('coordinates')
            if 'id_str' not in jsonfile or geo is None:
                continue

            data['id'].append(jsonfile['id_str'])
            data['tweet_coordinates'].append(geo['coordinates'])

            place = jsonfile.get('place')
            if place is not None:
                data['country'].append(place['country'])
                data['country_code'].append(place['country_code'])
            else:
                # No place attached: keep the row, mark the country as missing.
                data['country'].append(np.nan)
                data['country_code'].append(np.nan)

    name = './jsonq2' + str(i) + '.json'
    df1 = pd.DataFrame(data)
    df1.to_json(name, orient='records', date_format='iso')

In [1]:
import json
import numpy as np
import pandas as pd
import os

# Every reduced Task 2 JSON file, in sorted order.
rjpaths = sorted(
    os.path.join(directory, file)
    for directory, _, files in os.walk('../input/q2-json')
    for file in files
)

# Read all files, then concatenate once. DataFrame.append re-copied the whole
# accumulated frame on each iteration and no longer exists in pandas 2.0.
frames = [pd.read_json(rjpath, orient='records', convert_dates=False) for rjpath in rjpaths]
df = pd.concat(frames) if frames else pd.DataFrame()
del frames

In [2]:
# Preview the first rows of the combined geo-tagged data.
df.head()

In [3]:
# Keep the first row per tweet ID, use the ID as the index (removing it from
# the columns) and drop every row that still has a missing value.
df = (
    df.drop_duplicates(subset='id')
    .set_index('id')
    .dropna()
)
df.head()

In [5]:
# Column 0 holds the coordinate pairs; element 0 of each pair is stored as the
# longitude and element 1 as the latitude.
coordinate_pairs = df.iloc[:, 0]
longarr = [pair[0] for pair in coordinate_pairs]
latarr = [pair[1] for pair in coordinate_pairs]

In [6]:
# Attach the two coordinate components as their own columns.
df = df.assign(long=longarr, lat=latarr)
df.head()

***Comment:*** Because names of some countries are written in local languages, using country codes to get the names of the countries.

In [7]:
# Inspect one raw value: first row, second column.
df.iloc[0,1]

In [8]:
import pycountry


def _country_name(code):
    """Map a country code to the name pycountry holds for it.

    Returns NaN when the code is None, and the code itself when pycountry
    has no entry for it.
    """
    if code is None:
        return np.nan
    match = pycountry.countries.get(alpha_2=code)
    return code if match is None else match.name


# Decode the code of every row through pycountry.
countries = [_country_name(code) for code in df['country_code']]

# Kosovo ('XK') was not present in pycountry, so it is named manually.
df['decoded_C'] = ['Kosovo' if name == 'XK' else name for name in countries]

df = df.dropna()
df.head()

In [11]:
# [lat, long] pairs, one per tweet, used for the map markers.
locationlist = df[['lat', 'long']].to_numpy().tolist()

# Task 2.1: Plotting a map.



In [12]:
# Three data sets are needed for the map:
# 1. the individual tweet locations, to pinpoint the tweets,
# 2. the number of tweets for each country, for the choropleth,
# 3. the GeoJSON with the boundaries of each country.

# Second data set: rows per country. After the drop only 'country_code' is
# left as a data column, so it carries the count.
# NOTE(review): drop(0) removes the first row after reset_index - confirm that
# row is meant to be excluded.
df_tweets = (
    df.groupby('decoded_C')
    .count()
    .drop(columns=['tweet_coordinates', 'country', 'long', 'lat'])
    .reset_index()
    .drop(0)
)
df_tweets

In [13]:
# The ten countries with the smallest values in the count column.
df_tweets.sort_values('country_code').head(10)

In [None]:
# Path of the GeoJSON file with the country boundaries for the choropleth.
# It is assigned here because the name was otherwise first defined in a later
# cell, which made this cell fail with NameError on a fresh kernel.
worldGeo = "../input/world-countries-geo-json/world-countries.geo.json"
worldGeo

In [14]:
import folium

# Base map plus a choropleth shaded by the per-country tweet count; countries
# are matched to the GeoJSON features through "feature.properties.name".
worldGeo = "../input/world-countries-geo-json/world-countries.geo.json"
m = folium.Map(location=[51.1657, 10.4515], tiles="cartodbpositron", zoom_start=3)

choropleth = folium.Choropleth(
    geo_data=worldGeo,
    data=df_tweets,
    columns=["decoded_C", "country_code"],
    key_on="feature.properties.name",
    name="choropleth",
    legend_name="Tweet count in each country",
    fill_color="RdPu",
    fill_opacity=0.7,
    line_opacity=0.2,
)
choropleth.add_to(m)

folium.LayerControl().add_to(m)

In [15]:
# Render the map inline.
m

In [16]:
from folium.plugins import MarkerCluster

# One marker per tweet location, grouped in a cluster layer on the map.
marker_cluster = MarkerCluster().add_to(m)
for location in locationlist:
    folium.Marker(location).add_to(marker_cluster)


In [17]:
# Adds one more marker at a hard-coded location to the cluster layer.
# NOTE(review): the purpose of this single point is not evident here - confirm it is still wanted.
folium.Marker([41.54902754, 2.10348329]).add_to(marker_cluster)

In [None]:
# Write the interactive map to an HTML file in the working directory.
m.save("./index.html")


In [19]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from matplotlib.patches import Circle
# Plot styling for the cells that follow: style sheet, font size and default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 20, 'figure.figsize': (20, 10)})

In [20]:
# Static map of the GPS-tagged tweets on a plate carree projection.
ax = plt.axes(projection=ccrs.PlateCarree())
ax.stock_img()
# plot each tweet location as a red dot
ax.plot(df.long, df.lat, 'ro', transform=ccrs.PlateCarree())
# add coastlines for reference
ax.coastlines(resolution='50m')
# NOTE(review): set_global() is immediately followed by set_extent(), and the
# extent lists 20 before -20 for the x limits - confirm the intended window.
ax.set_global()
ax.set_extent([20, -20, 45,60])
# The string literal below is disabled code kept for reference; as a bare
# expression it has no effect.
'''def get_radius(freq):
    if freq < 50:
        return 0.5
    elif freq < 200:
        return 1.2
    elif freq < 1000:
        return 1.8
# plot count of tweets per location
for i,x in locations.iteritems():
    ax.add_patch(Circle(xy=[i[2], i[1]], radius=get_radius(x), color='blue', alpha=0.6, transform=ccrs.PlateCarree()))'''
plt.show()

# Part 3. Users 
1. Making a histogram of tweets per user with number of users on the y-axis and number of tweets they make on the x-axis. Discuss the distribution that you see. All the users in the data set should be included! 
2. Find the top-5 users by total number of tweets. Do you think any are automated accounts (aka. bots)? Justify your answer.
3. Find the 5 users who receive the most mentions and comment on this.
4. Calculate how often users in the UK, France, Germany, Italy and Turkey mention users in each of the other 4 countries. You should compute 25 numbers e.g. UK mentions UK, UK mentions France, France mentions UK etc. Comment on any patterns you observe. 

# Data for Task 3
1. ID
2. Created at
3. Users
4. User mentions

In [None]:
import json

# Reduce every raw file to the fields Task 3 needs: tweet ID, creation time,
# the author's screen name and the screen names mentioned in the tweet.
for i, path in enumerate(paths, start=1):
    data = {'id': [], 'created_at': [], 'username': [], 'user_mentions': []}

    with open(path) as try_file:
        for line in try_file:
            if not line.strip():
                continue  # tolerate blank lines between records
            jsonfile = json.loads(line)

            # Test the parsed record's own keys: a nested 'id_str' satisfied
            # the old substring test and the lookup below raised KeyError.
            if 'id_str' not in jsonfile:
                continue

            data['id'].append(jsonfile['id_str'])
            data['created_at'].append(jsonfile['created_at'])
            data['username'].append(jsonfile['user']['screen_name'])

            # Iterate over the mentions directly; the index variable used
            # before shadowed the file counter `i`.
            mentions = jsonfile['entities']['user_mentions']
            if len(mentions) == 0:
                data['user_mentions'].append(np.nan)  # NaN marks "no mentions"
            else:
                data['user_mentions'].append([mention['screen_name'] for mention in mentions])

    name = './jsonq3' + str(i) + '.json'
    df1 = pd.DataFrame(data)
    df1.to_json(name, orient='records', date_format='iso')


In [None]:
import pandas as pd
import os

In [21]:
# Start from an empty frame and list every reduced Task 3 JSON file in sorted order.
df = pd.DataFrame()
rjpaths = sorted(
    os.path.join(directory, file)
    for directory, _, files in os.walk('../input/q3-json')
    for file in files
)

In [22]:
# Read all files, then concatenate once. DataFrame.append re-copied the whole
# accumulated frame on each iteration and no longer exists in pandas 2.0.
frames = [pd.read_json(rjpath, orient='records', convert_dates=False) for rjpath in rjpaths]
# Rows `df` already holds stay in front of the new chunks, as with append.
if not df.empty:
    frames.insert(0, df)
if frames:
    df = pd.concat(frames)
del frames

**Task 3.1** 
Number of tweets per user bar graph

In [24]:
# Keep the first row seen for each tweet ID.
df = df.drop_duplicates(subset='id', keep='first')

In [28]:
# Per-username count of non-missing values in every other column.
df_usertweets = df.groupby('username').count()

In [29]:
# Keep only the username and its ID count, renamed to 'Tweet_count'.
df_usertweets = (
    df_usertweets.reset_index()
    .drop(columns=['created_at', 'user_mentions'])
    .rename(columns={'id': 'Tweet_count'})
)
df_usertweets.head()

In [30]:
# Users with fewer than 5 tweets.
df_usertweets.loc[df_usertweets['Tweet_count'].lt(5)]

***Comments:*** The total number of users who tweeted that month is **1111170**. Looking at the plot, a huge majority of users posted fewer than 5 tweets in the month. 

In [32]:
import seaborn as sns

# Histogram of tweets per user in bins of 5, limited to 0-150 tweets on the x-axis.
f, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=df_usertweets, x='Tweet_count', binwidth=5, ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('Number of tweets per user')
ax.set_xlim(0, 150)
#plt.savefig('#3.1_tweets_per_user.png', bbox_inches = 'tight')

In [33]:
# Same histogram with bins of 10, x-axis up to 2000 tweets and a logarithmic
# y-axis so the sparse tail of heavy users stays visible.
f, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=df_usertweets, x='Tweet_count', binwidth=10, ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('Number of tweets per user')
ax.set_xlim(0, 2000)
ax.set_yscale('log')
f.savefig('#3.1_tweets_per_user.png', bbox_inches='tight')

In [34]:
# Linear-scale view in bins of 5, cropped to 0-300 tweets and at most 10000 users.
# NOTE(review): this saves to the same file name as the log-scale figure and
# therefore replaces it - confirm that is intended.
f, ax = plt.subplots(figsize=(15, 7))
sns.histplot(data=df_usertweets, x='Tweet_count', binwidth=5, ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('Number of tweets per user')
ax.set_xlim(0, 300)
ax.set_ylim(0, 10000)
f.savefig('#3.1_tweets_per_user.png', bbox_inches='tight')

***Comments***: A minority of users post the majority of the tweets!

**Task 3.2**
Hunting for the top 10 users

In [36]:
# The ten users with the highest tweet counts.
df_usertweets.sort_values('Tweet_count', ascending=False).head(10)

* WhatsOnOLIO = valid food sharing app, automated
* KorayDavulcu2 and thaiselenags = possibly spamming or automated account, account doesn't exist 
* MathieuRonsard = possibly automated account, low followers and all the tweets redirect to a suspicious link.
* infosrv: Account suspended by twitter,
* AnimalsHolbox: 0 followers, 0 following, 2.3 million tweets, possibly spammer
* HoraCatalana: Automated account posts time every 5 minutes in Catalan
* _BB_RADIO_MUSIC: German radio posts all the songs it plays, possibly automated
* RadioTeddyMusic: Another german radio station, posts all the songs it plays, possibly automated
* haykakan_top: possibly automated


**Task 3.3** User mentions data.

In [None]:
import numpy as np

In [39]:
namelist = df['user_mentions'].values.tolist()

# Flatten the per-tweet mention lists into one list of screen names.
# Tweets without mentions are stored as missing values, not lists; they are
# skipped here because iterating over them raised TypeError.
flat_list = [
    item
    for sublist in namelist
    if isinstance(sublist, list)
    for item in sublist
]

In [41]:
from collections import Counter, OrderedDict

# Tally how many times each screen name was mentioned, then turn the tally
# into a one-column frame indexed by screen name.
count = Counter(flat_list)
df_mentionCount = pd.DataFrame.from_dict(count, orient='index', columns=['MentionCount'])
df_mentionCount

In [42]:
import matplotlib.pyplot as plt

# Histogram of mentions per user in bins of 10, limited to 0-100 mentions.
f, ax = plt.subplots(figsize=[15, 7])
sns.histplot(data=df_mentionCount, x='MentionCount', binwidth=10, ax=ax)
ax.set_xlim(0, 100)
ax.set_ylabel('Number of Users')
ax.set_title('User mentions per user during March 2020')

In [43]:
# Same histogram up to 2500 mentions with a logarithmic y-axis, saved as a PNG.
f, ax = plt.subplots(figsize=[15, 7])
sns.histplot(data=df_mentionCount, x='MentionCount', binwidth=10, ax=ax)
ax.set_xlim(0, 2500)
ax.set_yscale('log')
#plt.ylim(0,700)
ax.set_ylabel('Number of Users')
ax.set_title('User mentions per user during March 2020')
f.savefig('#3.3_user_mentions per_user.png', bbox_inches='tight')

***Comment:*** Fewer than 700 of the **1111170** total users received more than 200 mentions. 

**Task 3.4**
Some of the highly mentioned users and why?


In [44]:
# The fifteen most-mentioned screen names.
df_mentionCount.sort_values('MentionCount', ascending=False).head(15)

***Comments:*** Ah! Famous people!

In [46]:
# Alternatively: take the ten most common entries straight from the Counter,
# as a list of (name, count) tuples and as an ordered mapping.
tupleResult = count.most_common(10)
jsonResult = OrderedDict(tupleResult)
tupleResult

# Part 4. Events 
1. Identify 3 days with unusually high activity in 3 different countries of your choosing. For example you could choose one day in the UK, one in France and one in Turkey. Describe and justify how you identify ‘unusual’ days. 
2. Characterise each of these three days. Exactly how you do this is up to you, but for example you could:
Display some indicative Tweets.
Make a word cloud from the tweet text.
Plot tweets locations on a map.
Validate your conclusions with some other source of data e.g. government or news reports.

# Data for Task 4
* ID
* Created at
* Country: England, Ireland, 
* Text

In [None]:
import json

# Reduce every raw file to the tweets placed in the United Kingdom or Ireland,
# keeping the ID, creation time, single-line text and country.
for i, path in enumerate(paths, start=1):
    data = {'id': [], 'created_at': [], 'text': [], 'country': []}

    with open(path) as try_file:
        for line in try_file:
            if not line.strip():
                continue  # tolerate blank lines between records
            jsonfile = json.loads(line)

            # Inspect the parsed record, not the raw text: a nested 'id_str'
            # satisfied the old substring test, and a record without a 'place'
            # key raised KeyError instead of being skipped.
            place = jsonfile.get('place')
            if 'id_str' not in jsonfile or place is None:
                continue
            if place['country'] not in ('United Kingdom', 'Ireland'):
                continue

            data['id'].append(jsonfile['id_str'])
            data['created_at'].append(jsonfile['created_at'])
            data['text'].append(jsonfile['text'].replace('\n', ' '))
            data['country'].append(place['country'])

    name = './jsonq4' + str(i) + '.json'
    df1 = pd.DataFrame(data)
    df1.to_json(name, orient='records', date_format='iso')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import os

# Start from an empty frame and list every reduced Task 4 JSON file in sorted order.
df = pd.DataFrame()
rjpaths = sorted(
    os.path.join(directory, file)
    for directory, _, files in os.walk('../input/q4-json')
    for file in files
)

In [4]:
# Read all files, then concatenate once. DataFrame.append re-copied the whole
# accumulated frame on each iteration and no longer exists in pandas 2.0.
frames = [pd.read_json(rjpath, orient='records', convert_dates=False) for rjpath in rjpaths]
# Rows `df` already holds stay in front of the new chunks, as with append.
if not df.empty:
    frames.insert(0, df)
if frames:
    df = pd.concat(frames)
del frames

# Task 4.1
Plotting unusual days for UK and Ireland

First plotting number of tweets per day for each country and checking spikes. From the previous analysis we observed that number of tweets during the weekends are more than on weekdays. Keeping this information in mind, we can check weekdays with high number of tweets. 

In [5]:
# Characters 4-9 of 'created_at' give the "<month> <day>" label, e.g. 'Mar 17'.
df['date'] = df['created_at'].str.slice(4, 10)

# Text preprocessing:

* making all text lowercase
* removing hyperlinks
* removing user tags
* removing stemming words: like, likes, liking, 
* removing non alpha numerics

In [7]:
# Normalise the tweet text for the word clouds:
#   1. lower-case everything,
#   2. blank out @user tags,
#   3. blank out hyperlinks,
#   4. blank out every character that is not a digit, a-z, a space or a tab,
#   5. collapse runs of spaces into one.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, which would leave the text untouched.
# Raw strings keep '\w' and '\S' from being read as invalid string escapes.
df['processed_text'] = (
    df['text'].str.lower()
    .str.replace(r'(@[a-z0-9]+)\w+', ' ', regex=True)
    .str.replace(r'(http\S+)', ' ', regex=True)
    .str.replace(r'([^0-9a-z \t])', ' ', regex=True)
    .str.replace(r' +', ' ', regex=True)
)

In [8]:
# Preview the frame with the new 'date' and 'processed_text' columns.
df.head()

In [9]:
# Split the tweets by country.
df_UK = df.loc[df['country'].eq('United Kingdom')]
df_IR = df.loc[df['country'].eq('Ireland')]

In [10]:
# Tweets per day in the UK; the ID count becomes 'Tweet_count_UK'.
df_UK_count = (
    df_UK.groupby('date')
    .count()
    .drop(columns=['created_at', 'text', 'country'])
    .reset_index()
    .rename(columns={'id': 'Tweet_count_UK'})
)


In [11]:
# Tweets per day in Ireland; the ID count becomes 'Tweet_count_ireland'.
df_IR_count = (
    df_IR.groupby('date')
    .count()
    .drop(columns=['created_at', 'text', 'country'])
    .reset_index()
    .rename(columns={'id': 'Tweet_count_ireland'})
)


In [12]:
# Rename a 'Tweet_count' column if one is present.
# NOTE(review): the cells that build these frames already rename 'id' to the
# final names, so these two renames look like no-ops - confirm and remove.
df_IR_count = df_IR_count.rename(columns = {'Tweet_count': 'Tweet_count_ireland'})
df_UK_count = df_UK_count.rename(columns = {'Tweet_count': 'Tweet_count_UK'})

In [13]:
# Preview the per-day counts for Ireland.
df_IR_count.head()

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two stacked bar charts sharing the date axis: Ireland on top, UK below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 15), sharex=True)
sns.barplot(data=df_IR_count, x='date', y='Tweet_count_ireland', ax=ax1, palette="crest")
sns.barplot(data=df_UK_count, x='date', y='Tweet_count_UK', ax=ax2, palette="light:b")
ticks = plt.setp(ax2.get_xticklabels(), rotation=45)
ax1.set_title('Tweet Frequency in Ireland and UK during March 2020')
# savefig's keyword is bbox_inches; the 'bbox' keyword used before is not a
# savefig option, so the tight cropping was never applied.
plt.savefig('#4.1_Tweet_frequency_comparision.png', bbox_inches='tight')

# Task 4.2: Characterise 3 days

* Ireland: March 17 Tuesday
* Ireland: March 12 Thursday --X March 27 Friday
* UK : March 20 Friday


In [15]:
# Processed text of the tweets from Ireland on 17 March.
in_ireland = df['country'].eq("Ireland")
on_mar_17 = df['created_at'].str.slice(4, 10).eq('Mar 17')
df_IR_17 = df.loc[in_ireland & on_mar_17, 'processed_text']
df_IR_17

In [16]:
# Processed text for Ireland on 27 March and for the UK on 20 March.
day_label = df['created_at'].str.slice(4, 10)
df_IR_27 = df.loc[df['country'].eq("Ireland") & day_label.eq('Mar 27'), 'processed_text']
df_UK_20 = df.loc[df['country'].eq("United Kingdom") & day_label.eq('Mar 20'), 'processed_text']

In [17]:
# Processed text of the tweets from Ireland on 12 March.
df_IR_12 = df.loc[df['country'].eq("Ireland") & df['created_at'].str.slice(4, 10).eq('Mar 12'), 'processed_text']

In [18]:
# Processed text of the tweets from the UK on 17 March. The filter previously
# selected "Ireland", which made this a copy of df_IR_17 under a UK name.
df_UK_17 = df.loc[(df['country'] == "United Kingdom") & (df['created_at'].str[4:10] == 'Mar 17'), 'processed_text']

In [19]:
# Processed text of the tweets from the UK on 26 March.
df_UK_26 = df.loc[df['country'].eq("United Kingdom") & df['created_at'].str.slice(4, 10).eq('Mar 26'), 'processed_text']

In [22]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Join all processed UK tweets of 26 March into one long string.
text = " ".join(df_UK_26.tolist())

# Default stop words plus filler words that would otherwise dominate the cloud.
stopwords = set(STOPWORDS) | {
    "s", "t", "will", "thank", "day", "m", "one", "good", "now", "thanks", "time", "amp", "re", "u", "people",
    "ireland", "new", "got", "know", "going", "go", "much",
}

cloud = WordCloud(
    stopwords=stopwords,
    collocations=False,
    width=2000,
    height=1000,
    background_color='white',
).generate(text)

f, ax = plt.subplots(figsize=[20, 10])
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
#plt.savefig('26maruk.png', bbox_inches = 'tight')

In [None]:
#plt.savefig('#4.2_Ireland_12_Mar.png', bbox = 'tight')

In [25]:
# Stop-word set for the Ireland cloud: the defaults plus common filler words.
stopwords = set(STOPWORDS) | {
    "s", "t", "will", "thank", "well", "day", "m", "one", "good", "now", "thanks", "time", "amp", "re", "u", "people",
    "ireland",
}


In [26]:
# Word cloud of the processed tweets from Ireland on 12 March.
text = " ".join(df_IR_12.tolist())
cloud = WordCloud(
    stopwords=stopwords,
    collocations=False,
    width=2000,
    height=1000,
    background_color='white',
).generate(text)

f, ax = plt.subplots(figsize=[20, 10])
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
#plt.savefig('#4.2_Ireland_12_Mar.png', bbox_inches = 'tight')

In [29]:
import numpy as np

# Load the UK outline image as a numpy array, to be used as the word-cloud mask.
mask_image = Image.open("../input/q4-images/UK_outline_edited.png")
UK_mask = np.array(mask_image)


In [31]:
def transform_format(val):
    """Return 255 when ``val`` equals 0; return any other value unchanged."""
    return 255 if val == 0 else val

In [32]:
# Transform your mask into a new one that will work with the function:
transformed_UK_mask = np.ndarray((UK_mask.shape[0],UK_mask.shape[1]), np.int32)
print(transformed_UK_mask)
for i in range(len(UK_mask)):
    for j in range (len(transformed_UK_mask[i])):
        transformed_UK_mask[i][j] = list(map(transform_format, UK_mask[i]))
print(transformed_UK_mask)

In [34]:
# Stop-word set for the masked UK cloud: the defaults plus common filler words.
stopwords = set(STOPWORDS)
stopwords.update(["s", "t", "will", "thank","day","m","one","good","now", "thanks", "time", "amp", "re", "u", "people"
                 ,"ireland", "need", "going", "work", "think", "know", "today", "love", "great", "hope", "ve", "go", 
                  "see"])

# Word cloud of the UK tweets of 26 March, drawn inside the UK outline mask.
text2 = " ".join(tweet for tweet in df_UK_26.values)
f, ax = plt.subplots(figsize= [20,10])
plt.imshow(WordCloud(stopwords = stopwords,  collocations = True, width=2000,height=1000,
                     background_color = 'white', mask=UK_mask, contour_width=3, contour_color = 'firebrick').generate(text2), interpolation='bilinear' )
plt.axis('off')
# savefig's keyword is bbox_inches; the 'bbox' keyword used before is not a
# savefig option, so the tight cropping was never applied.
plt.savefig('#4.2_UK_26_Mar.png', bbox_inches = 'tight')