In [None]:
# Import the libraries needed for the analysis of the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
np.random.seed(2020)
import nltk
nltk.download('punkt') # one time execution
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the tweet dataset from a CSV file (relative path) and preview its first three rows.
data = pd.read_csv("covid19_tweets.csv")
data.head(3)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False


In [None]:
# Number of rows and columns in the loaded frame.
data.shape

(179108, 13)

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

## Data cleaning

In [None]:
def get_mention(txt):
    """Return the @-mentions found in ``txt`` as one comma-separated string.

    The text is split on any whitespace (spaces, tabs, newlines), so a mention
    that starts a new line is picked up as well. Tokens are kept verbatim,
    including any punctuation attached to them.

    Parameters
    ----------
    txt : str
        Tweet text to scan.

    Returns
    -------
    str
        Mentions joined with ", " in order of appearance; "" when there are none.
    """
    # str.split() without an argument splits on runs of any whitespace and
    # never yields empty tokens, so startswith can be applied directly.
    mentions = [token for token in txt.split() if token.startswith("@")]
    return ", ".join(mentions)

# Store the result of get_mention for every tweet in a new 'mention' column.
data["mention"] = data.text.apply(get_mention)

In [None]:
def remove_link_email(txt):
    """Strip ellipses, links, @-tokens and punctuation from ``txt``.

    Steps, in order:
      1. remove literal "..." sequences,
      2. remove anything starting with "http" up to the next whitespace,
      3. remove every whitespace-delimited token containing "@" (e-mail
         addresses and @-handles) together with one trailing whitespace,
      4. remove every remaining character that is neither a word character
         nor whitespace.

    Parameters
    ----------
    txt : str
        Tweet text.

    Returns
    -------
    str
        The stripped text; whitespace is otherwise left as it was.
    """
    txt = txt.replace("...", "")
    txt = re.sub(r"http\S+", "", txt)
    # This has to be a regex substitution: str.replace would look for the
    # pattern as literal text and therefore never remove anything.
    txt = re.sub(r"\S*@\S*\s?", "", txt)
    txt = re.sub(r"[^\w\s]", "", txt)
    return txt

# Replace the text column with the output of remove_link_email; the original
# tweet text is no longer available in `data` after this cell.
data.text = data.text.apply(remove_link_email)

In [None]:
# Keep only tokens made entirely of ASCII characters (rough non-English filter)
def clean_non_english(txt):
    """Lower-case ``txt`` and keep only tokens made entirely of ASCII characters.

    Runs of non-word characters are collapsed to a single space, the text is
    lower-cased and tokenized with ``word_tokenize``, and every token that
    contains a character with code point >= 128 is dropped.

    Parameters
    ----------
    txt : str
        Text to filter. Any non-string value (e.g. NaN) yields ``np.nan``.

    Returns
    -------
    str or float
        The kept tokens, each followed by a single space, or ``np.nan`` for
        non-string input.
    """
    # Explicit type guard instead of a bare `except:` so that genuine
    # failures (e.g. missing tokenizer data) surface instead of silently
    # turning the whole column into NaN.
    if not isinstance(txt, str):
        return np.nan

    txt = re.sub(r"\W+", " ", txt).lower()
    ascii_tokens = [
        token
        for token in word_tokenize(txt)
        if all(ord(ch) < 128 for ch in token)
    ]
    # Trailing space after every token is kept on purpose (existing output format).
    return "".join(token + " " for token in ascii_tokens)

# Derive the 'english_text' column by applying clean_non_english to the text column.
data["english_text"] = data.text.apply(clean_non_english)

In [None]:
# Remove English stop words from the filtered text
def clean_text(english_txt):
    """Remove stop words from ``english_txt``.

    The text is tokenized with ``word_tokenize`` and every token found in the
    module-level ``stop_words`` set is dropped.

    Parameters
    ----------
    english_txt : str
        Text to filter. Any non-string value (e.g. NaN) yields ``np.nan``.

    Returns
    -------
    str or float
        The kept tokens, each followed by a single space, or ``np.nan`` for
        non-string input.
    """
    # Explicit type guard instead of a bare `except:` so unexpected errors
    # are not silently converted to NaN.
    if not isinstance(english_txt, str):
        return np.nan

    kept_tokens = [
        token for token in word_tokenize(english_txt) if token not in stop_words
    ]
    # Trailing space after every token is kept on purpose (existing output format).
    return "".join(token + " " for token in kept_tokens)

# Derive the 'cleaned_text' column by applying clean_text to 'english_text'.
data["cleaned_text"] = data.english_text.apply(clean_text)

In [None]:
# Count missing values per column.
data.isnull().sum()

user_name               0
user_location       36771
user_description    10286
user_created            0
user_followers          0
user_friends            0
user_favourites         0
user_verified           0
date                    0
text                    0
hashtags            51334
source                 77
is_retweet              0
mention                 0
english_text            0
cleaned_text            0
dtype: int64

In [None]:
def clean_tag(txt):
    """Normalize a stringified hashtag list.

    Removes every single quote and square bracket from ``txt`` and lower-cases
    the result, e.g. ``"['COVID19', 'Corona']"`` becomes ``"covid19, corona"``.

    Parameters
    ----------
    txt : str
        Hashtag string. Any non-string value (e.g. NaN) yields ``np.nan``.

    Returns
    -------
    str or float
        The normalized string, or ``np.nan`` for non-string input.
    """
    # Explicit type guard instead of a bare `except:`.
    if not isinstance(txt, str):
        return np.nan
    # Delete the three wrapper characters in a single pass.
    return txt.translate(str.maketrans("", "", "'[]")).lower()

# Derive the 'cleaned_tags' column by applying clean_tag to the hashtags column.
data["cleaned_tags"] = data.hashtags.apply(clean_tag)

In [None]:
from collections import Counter

# Flatten every tweet's ", "-separated tag string into one list of tags.
# Missing values are removed with dropna(): NaN cannot be filtered with
# `!= np.nan`, because NaN never compares equal to anything and that test
# is therefore always True.
lst = [
    tag
    for tags in data.cleaned_tags.dropna()
    for tag in tags.split(", ")
]

# Tally the tags and show the ten most frequent ones.
x = Counter(lst)
x.most_common(10)

[('covid19', 100312),
 ('coronavirus', 10197),
 ('pandemic', 1625),
 ('covid', 1299),
 ('india', 1193),
 ('corona', 1162),
 ('trump', 1101),
 ('lockdown', 963),
 ('coronaviruspandemic', 882),
 ('covid_19', 828)]

In [None]:
def get_len_hashtag(txt):
    """Return the number of comma-separated entries in ``txt``.

    Parameters
    ----------
    txt : str
        Hashtag string, e.g. ``"['a', 'b']"``. Any non-string value
        (e.g. NaN) yields ``np.nan``.

    Returns
    -------
    int or float
        ``len(txt.split(","))`` for strings, ``np.nan`` otherwise.
    """
    # Explicit type guard instead of a bare `except:`.
    if not isinstance(txt, str):
        return np.nan
    return len(txt.split(","))

# Derive the 'len_hashtag' column by applying get_len_hashtag to the raw hashtags column.
data["len_hashtag"] = data.hashtags.apply(get_len_hashtag)

In [None]:
# Drop every row that has a missing value in any of the listed columns.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
required_columns = [
    "user_description",
    "user_location",
    "hashtags",
    "cleaned_text",
    "text",
    "english_text",
    "cleaned_tags",
]
data = data.dropna(subset=required_columns)

In [None]:
# Preview the first three remaining rows.
data.head(3)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,mention,english_text,cleaned_text,cleaned_tags,len_hashtag
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,diane3443 wdunlap realDonaldTrump Trump never ...,['COVID19'],Twitter for Android,False,"@diane3443, @wdunlap, @realDonaldTrump",diane3443 wdunlap realdonaldtrump trump never ...,diane3443 wdunlap realdonaldtrump trump never ...,covid19,1.0
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,brookbanktv The one gift COVID19 has give me i...,['COVID19'],Twitter for iPhone,False,@brookbanktv,brookbanktv the one gift covid19 has give me i...,brookbanktv one gift covid19 give appreciation...,covid19,1.0
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July Media Bulletin on Novel CoronaVirusUp...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False,"@DrSyedSehrish, @airnewsalerts, @ANI…",25 july media bulletin on novel coronavirusupd...,25 july media bulletin novel coronavirusupdate...,"coronavirusupdates, covid19",2.0


In [None]:
# Draw a random subset of at most 10,000 rows, renumber the rows from 0 and
# drop the 'source' column. An explicit random_state (same value as the global
# NumPy seed set in the first cell) makes the sample independent of how many
# random draws happened earlier in the session; min() avoids a ValueError if
# fewer rows than requested are available.
data = (
    data
    .sample(n=min(10000, len(data)), random_state=2020)
    .reset_index(drop=True)
    .drop(columns=['source'])
)
print(data.shape)
data.head(3)

(25000, 17)


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,mention,english_text,cleaned_text,cleaned_tags,len_hashtag
0,Ageing Well B&H,Brighton & Hove,We help people aged 50+ in Brighton & Hove fin...,2019-08-21 09:38:48,267,347,569,False,2020-08-11 07:30:27,So much fun to be hard Dont miss out Email pau...,"['Brighton', 'Hove', 'zoom']",False,,so much fun to be hard dont miss out email pau...,much fun hard dont miss email paulacarterimpac...,"brighton, hove, zoom",3.0
1,The Hindu - Chennai,"Chennai, India",The official twitter account of The Hindu's re...,2012-06-20 11:24:09,90393,307,269,True,2020-07-28 06:28:28,At least 25 remand prisoners lodged in Peruran...,['Thoothukudi'],False,,at least 25 remand prisoners lodged in peruran...,least 25 remand prisoners lodged perurani dist...,thoothukudi,1.0
2,#COVID19: Stay at home,"Accra, Ghana",Fact-checker | Journalist @Citi973 covering en...,2010-08-22 16:49:27,3103,623,6951,False,2020-07-27 06:52:33,Talk to a psychologist today\n\nCOVID19 corona...,"['COVID19', 'coronavirus', 'COVIDー19']",False,,talk to a psychologist today covid19 coronavirus,talk psychologist today covid19 coronavirus,"covid19, coronavirus, covidー19",3.0


In [None]:
# Re-check the number of missing values per column.
data.isnull().sum()

user_name           0
user_location       0
user_description    0
user_created        0
user_followers      0
user_friends        0
user_favourites     0
user_verified       0
date                0
text                0
hashtags            0
is_retweet          0
mention             0
english_text        0
cleaned_text        0
cleaned_tags        0
len_hashtag         0
dtype: int64

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes repeated runs return the same language for the same text.
DetectorFactory.seed = 0

def detect_lang(txt):
    """Return the language code langdetect assigns to ``txt``.

    Parameters
    ----------
    txt : str
        Text to classify. Any non-string value (e.g. NaN) yields ``np.nan``.

    Returns
    -------
    str or float
        The value returned by ``langdetect.detect`` (e.g. ``"en"``), or
        ``np.nan`` when the input is not a string or langdetect cannot
        classify it.
    """
    if not isinstance(txt, str):
        return np.nan
    try:
        return detect(txt)
    except LangDetectException:
        # Raised by langdetect when it cannot classify the text;
        # other errors are allowed to propagate.
        return np.nan

# Store the result of detect_lang on each row's cleaned_text in a new 'language' column.
data["language"] = data.cleaned_text.apply(detect_lang)

In [None]:
# Keep only the rows detected as English. reset_index() returns a new frame
# (rather than mutating the filtered slice in place) and stores the previous
# row labels in an 'index' column.
new_data = data[data.language == "en"].reset_index()

In [None]:
# Preview the first three rows of the English-only frame.
new_data.head(3)

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,mention,english_text,cleaned_text,cleaned_tags,len_hashtag,language
0,0,Ageing Well B&H,Brighton & Hove,We help people aged 50+ in Brighton & Hove fin...,2019-08-21 09:38:48,267,347,569,False,2020-08-11 07:30:27,So much fun to be hard Dont miss out Email pau...,"['Brighton', 'Hove', 'zoom']",False,,so much fun to be hard dont miss out email pau...,much fun hard dont miss email paulacarterimpac...,"brighton, hove, zoom",3.0,en
1,1,The Hindu - Chennai,"Chennai, India",The official twitter account of The Hindu's re...,2012-06-20 11:24:09,90393,307,269,True,2020-07-28 06:28:28,At least 25 remand prisoners lodged in Peruran...,['Thoothukudi'],False,,at least 25 remand prisoners lodged in peruran...,least 25 remand prisoners lodged perurani dist...,thoothukudi,1.0,en
2,2,#COVID19: Stay at home,"Accra, Ghana",Fact-checker | Journalist @Citi973 covering en...,2010-08-22 16:49:27,3103,623,6951,False,2020-07-27 06:52:33,Talk to a psychologist today\n\nCOVID19 corona...,"['COVID19', 'coronavirus', 'COVIDー19']",False,,talk to a psychologist today covid19 coronavirus,talk psychologist today covid19 coronavirus,"covid19, coronavirus, covidー19",3.0,en


In [None]:
# Remove the 'index' column. Reassigning the result instead of dropping in
# place avoids pandas' SettingWithCopyWarning on a frame derived by filtering.
new_data = new_data.drop(columns=['index'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Write the English-only tweets to a CSV file (relative path). With to_csv's
# default settings the row index is written as an unnamed first column.
# NOTE(review): consider index=False if downstream readers do not need it — confirm.
new_data.to_csv('english_tweets.csv')