In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import html
import numpy as np

# Cleaning Tweet

In [3]:
df = pd.read_csv('../data_collecting/raw_data_all_lang.csv', lineterminator='\n')
df.head()

Unnamed: 0,tweetId,author_id,tweet,lang,created_at
0,1497456832754774016,1255947273604755457,Apparently large cap gold producers are not re...,en,2022-02-26 06:21:44+00:00
1,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00
2,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00
3,1497456814941618178,1361681804458156037,Awesome project.Looking so sustainable. Hope i...,en,2022-02-26 06:21:40+00:00
4,1497456809950408707,1431136428520468480,CJI: While adjudicating the claims of intellec...,en,2022-02-26 06:21:39+00:00


In [4]:
df.applymap(np.isreal).groupby(['author_id']).size()

author_id
True    246696
dtype: int64

In [5]:
df.loc[df['author_id'] == "en"]

Unnamed: 0,tweetId,author_id,tweet,lang,created_at


In [6]:
df = df.dropna()

In [7]:
# check is all the data in english
tweet_by_lang = df.groupby("lang")["tweet"].count()
print(tweet_by_lang)

lang
am          1
ar        183
bg          2
bn         49
ca        212
cs         20
cy         33
da         62
de        763
dv          1
el         82
en     230391
es       1450
et         95
eu          4
fa         25
fi         41
fr        802
gu         26
hi        349
ht         70
hu         26
in        887
is         21
it        265
iw         56
ja       1022
km          3
kn          7
ko        104
lo          6
lt         12
lv          9
ml         36
mr         77
my          1
ne         23
nl        287
no         18
or          2
pa          2
pl         85
ps          1
pt        170
ro         83
ru         23
si          3
sl          8
sr          3
sv         57
ta         28
te          4
th         93
tl        692
tr        441
uk          2
und      7212
ur         33
vi          5
zh        228
Name: tweet, dtype: int64


In [8]:
# need to use a stopword library
stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]


def clean_tweet(tweet):
    temp = tweet.lower()
    temp = temp.replace('\n', ' ')
    temp = html.unescape(temp)
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)  # removing handle 
    temp = re.sub("#[A-Za-z0-9_]+","", temp)  # removing hastags
    temp = re.sub(r'http\S+', '', temp) # removing any link
    temp = re.sub('[()!?]', ' ', temp)  # remove the punct 
    temp = re.sub('\[.*?\]',' ', temp)  # remove any weird symbol
    temp = re.sub("[^a-z0-9]"," ", temp) # remove any numbers
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords] 
    temp = " ".join(word for word in temp)
    return temp

In [9]:
df['clean_tweet'] = [clean_tweet(tw) for tw in df['tweet']]

In [10]:
df.head(10)

Unnamed: 0,tweetId,author_id,tweet,lang,created_at,clean_tweet
0,1497456832754774016,1255947273604755457,Apparently large cap gold producers are not re...,en,2022-02-26 06:21:44+00:00,apparently large cap gold producers are not re...
1,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00,aastha agarbatti is made ayurvedic herbs flowe...
2,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana
3,1497456814941618178,1361681804458156037,Awesome project.Looking so sustainable. Hope i...,en,2022-02-26 06:21:40+00:00,awesome project looking so sustainable hope it...
4,1497456809950408707,1431136428520468480,CJI: While adjudicating the claims of intellec...,en,2022-02-26 06:21:39+00:00,cji while adjudicating claims intellectual pro...
5,1497456799842136064,2854122349,@moefcc #ClimateChange should be treated as pa...,en,2022-02-26 06:21:37+00:00,should be treated as patriotism save country r...
6,1497456788706263048,1344586433139535873,Made an example of what the GUI Band would loo...,en,2022-02-26 06:21:34+00:00,made example what gui band would look like out...
7,1497456720175841280,1384811535139147777,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:18+00:00,aastha agarbatti is made ayurvedic herbs flowe...
8,1497456719550717952,1256882116614336512,@Sonnenmensh Fair enough. I should rather have...,en,2022-02-26 06:21:17+00:00,fair enough i should rather have said that mon...
9,1497456708717006850,1288418230743126016,Thought for the Day \nFor Best Solution to You...,en,2022-02-26 06:21:15+00:00,thought day best solution your solar requireme...


In [11]:
df.to_csv('clean_data_all_lang.csv', index=False)

In [12]:
len(df)

246696

# Cleaning Location
1. remove all the links, emojis
2. use Spacy to identify the a location 
3. use geopy to get the coordinate of the location and the country

In [15]:
loc_df = pd.read_csv('../data_collecting/raw_data_w_location.csv')
print(len(loc_df))
loc_df.head()

304396


Unnamed: 0.1,Unnamed: 0,tweetId,author_id_x,tweet,lang,created_at,clean_tweet,author_id_y,user_loc
0,0,1497456832754774016,1255947273604755457,Apparently large cap gold producers are not re...,en,2022-02-26 06:21:44+00:00,apparently large cap gold producers are not re...,1255947273604755457,
1,1,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00,aastha agarbatti is made ayurvedic herbs flowe...,1372451500669030400,
2,2,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00,aastha agarbatti is made ayurvedic herbs flowe...,1372451500669030400,
3,3,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00,aastha agarbatti is made ayurvedic herbs flowe...,1372451500669030400,
4,4,1497456822529130496,1372451500669030400,Aastha Agarbatti is made of ayurvedic herbs an...,en,2022-02-26 06:21:42+00:00,aastha agarbatti is made ayurvedic herbs flowe...,1372451500669030400,


In [16]:
duplicate = loc_df[loc_df.duplicated()]
duplicate

Unnamed: 0.1,Unnamed: 0,tweetId,author_id_x,tweet,lang,created_at,clean_tweet,author_id_y,user_loc


In [17]:
len(loc_df)

304396

In [18]:
# clean out None Rows
loc_df['user_loc'].isnull().sum()

93029

In [19]:
clean_loc_df = loc_df.dropna(axis=0)
print(f'{len(loc_df)}')
print(len(clean_loc_df))
clean_loc_df.head()

304396
211367


Unnamed: 0.1,Unnamed: 0,tweetId,author_id_x,tweet,lang,created_at,clean_tweet,author_id_y,user_loc
10,10,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana,287655824,Nairobi
11,11,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana,287655824,Nairobi
12,12,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana,287655824,Nairobi
13,13,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana,287655824,Nairobi
14,14,1497456816434860032,287655824,@Belive_Kinuthia Save the environment...do e-c...,en,2022-02-26 06:21:41+00:00,save environment do e cards bwana,287655824,Nairobi


In [20]:
# remove emojies, numbers, non english words
import re

text = 'This is a smiley face \U0001f602'
print(text) # with emoji

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

print(deEmojify(text))

This is a smiley face 😂
This is a smiley face 


In [21]:
clean_loc_df['user_loc'] = clean_loc_df['user_loc'].apply(lambda row :deEmojify(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
clean_loc_df = clean_loc_df[~clean_loc_df.user_loc.str.contains('#')] 

In [23]:
clean_loc_df['user_loc'] = clean_loc_df['user_loc'].apply(lambda x: re.sub(r'http\S+', '', x)) #return na value if list is empty

In [25]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = "Apple is looking at buying U.K. startup for $1 billion"

def isLocation(doc):
    result = ''
    doc = nlp(doc)
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            result = ent.text  
    return result

print(isLocation(doc))

U.K.


In [26]:
# entity extraction with spacy to only get country
clean_loc_df['extracted_user_loc'] = clean_loc_df['user_loc'].apply(lambda x: isLocation(x) if isLocation(x) else None) #return na value if list is empty

In [27]:
clean_loc_df['extracted_user_loc'].iloc[:3]

10    Nairobi
11    Nairobi
12    Nairobi
Name: extracted_user_loc, dtype: object

In [29]:
# use batches for the geolocators
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [None]:
# use geo filtering to figure out the country
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


geolocator = Nominatim(user_agent="my-app")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3)
clean_loc_df['location'] = clean_loc_df['extracted_user_loc'].apply(geocode, language='en')
clean_loc_df['coordinates'] = clean_loc_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
clean_loc_df['country'] = clean_loc_df['location'].apply(lambda loc: loc[0].split(',')[-1] if loc else None)

In [None]:
clean_loc_df

Unnamed: 0.1,Unnamed: 0,tweetId,author_id,tweet,lang,created_at,clean_tweet,user_loc,extracted_user_loc,location,coordinates,country
1,1,1490905219286245376,291609629,"Hong Kong to tighten social-distancing rules, ...",en,2022-02-08 04:27:58+00:00,hong kong tighten social distancing rules stan...,Singapore,Singapore,"(Singapore, (1.357107, 103.8194992))","(1.357107, 103.8194992, 0.0)",Singapore
3,3,1490905176953454597,2675645600,@jenhada @airallianceHOU How does the communit...,en,2022-02-08 04:27:48+00:00,how does community hold valero accountable tox...,"Wonderland, Michigan",Michigan,"(Michigan, United States, (43.6211955, -84.682...","(43.6211955, -84.6824346, 0.0)",United States
4,4,1490905163321987073,829956505851998209,@AUThackeray is noise pollution not harmful fo...,en,2022-02-08 04:27:45+00:00,is noise pollution not harmful environment,Santa Cruz west,Santa Cruz,"(Santa Cruz de Tenerife, Canary Islands, 38004...","(28.469648, -16.2540884, 0.0)",Spain
5,5,1490905158313967618,402452107,Putting a drink in a fridge/freezer “cools” it...,en,2022-02-08 04:27:44+00:00,putting drink fridge freezer cools it down ess...,TX,,"(None, Torino, Piemont, Italy, (44.933143, 7.5...","(44.933143, 7.540121, 0.0)",Italy
11,11,1490905141805199367,1279647779753611264,we do know what NFTs are. They're called scams...,en,2022-02-08 04:27:40+00:00,we do know what nfts are theyre called scams t...,Ætheria,Ætheria,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4110,4110,1490895747574685697,1582316359,4. Try building out a full stack dapp\nIn addi...,en,2022-02-08 03:50:20+00:00,4 try building out full stack dapp addition so...,"Cochin, India",India,"(India, (22.3511148, 78.6677428))","(22.3511148, 78.6677428, 0.0)",India
4111,4111,1490895747574685697,1582316359,4. Try building out a full stack dapp\nIn addi...,en,2022-02-08 03:50:20+00:00,4 try building out full stack dapp addition so...,"Cochin, India",India,"(India, (22.3511148, 78.6677428))","(22.3511148, 78.6677428, 0.0)",India
4112,4112,1490895743703339013,1582316359,3. Get comfortable with the Remix IDE\nIt's re...,en,2022-02-08 03:50:19+00:00,3 get comfortable with remix ide its really ea...,"Cochin, India",India,"(India, (22.3511148, 78.6677428))","(22.3511148, 78.6677428, 0.0)",India
4113,4113,1490895743703339013,1582316359,3. Get comfortable with the Remix IDE\nIt's re...,en,2022-02-08 03:50:19+00:00,3 get comfortable with remix ide its really ea...,"Cochin, India",India,"(India, (22.3511148, 78.6677428))","(22.3511148, 78.6677428, 0.0)",India


In [None]:
clean_loc_df.to_csv('clean_data_all_lang_loc.csv', index=False)