# Objective:

- Find frequency of hashtags.
- Explore `date`, `language`, and `text`.
- Remove languages that have low frequency.
- Change `date` type to `datetime`.
- Clean `text` for sentimental analysis, tokenization, and remove stopwords (eng).

# Set up:

In [None]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
import os
os.chdir('/content/drive/MyDrive/NLP project /data')

# Libraries:

In [None]:
import pandas as pd
import plotly.express as px
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
df = pd.read_csv('tweets.csv')

# Exploratory Data Analysis:

## Language

In [32]:
df['language'].value_counts()

en     11072
qht     1808
qme     1356
ja       401
es       350
und      319
hi       203
zh        73
in        69
ko        68
pt        52
et        45
it        42
tl        37
th        36
fr        26
fa        18
vi        17
de        11
tr        11
ca        10
pl        10
ml         9
nl         9
ru         8
da         7
mr         5
ar         4
pa         4
sv         4
ht         4
te         3
cs         3
fi         2
sl         2
lv         2
ta         2
or         1
ne         1
ro         1
eu         1
hu         1
uk         1
cy         1
is         1
lt         1
no         1
Name: language, dtype: int64

In [38]:
fig = px.histogram(df, x="language",
                   title = ("Tweet languages histogram"),
                   text_auto=True).update_layout(bargap=0.2)

fig.update_xaxes(tickangle=-90)                   
fig.show()

In [None]:
df.drop(df[df['language'] != 'en'].index, inplace = True)

**COMMENT:** I decided to drop all non-english tweets because of low frequency.

## Hashtags:

In [None]:
#lower tweet contents in order to find all hashtags
lower_tweet_content = ''
for i in df.text:
    lower_tweet_content += i.lower()

#dictionary that contains hashtags and their count
hashtag_dict = defaultdict(int)

hashtag = re.findall(r'\#\w+', lower_tweet_content)
for i in hashtag:
    hashtag_dict[i] +=1

#sort descending order
hashtag_dict = dict(sorted(hashtag_dict.items(), key=lambda item: item[1],reverse=True))

#to DataFrame for visualization
hash_df = pd.DataFrame(hashtag_dict.items(), columns=['hashtag', 'count'])

In [None]:
hash_df = hash_df[hash_df['count'] > 50] #there are too many hashtags, so I decide to pick the ones with more than 50 counts

In [29]:
#visualization
fig = px.bar(hash_df, x='hashtag', y='count',
                   title = ("Hashtags frequency plot"),
                   labels={'hashtag': 'Hashtag', 
                           'count':'Count'},
                   text_auto=True).update_layout(bargap=0.2,font=dict(size=10))

fig.update_xaxes(tickangle=-45)
fig.show()

## Date:

In [None]:
df['date'] = df['date'].astype("datetime64")

In [None]:
df.sort_values(by='date').head(10)

Unnamed: 0,date,tweet_id,text,user_name,reply_count,retweet_count,like_count,language
16111,2021-11-25 02:17:18,1463693245997731843,@globaltimesnews Commitment to spread #Chinese...,KarnekarAnand,0,0,0,en
16110,2021-11-25 02:18:15,1463693483034566658,Why is this happening to #USA?\nIt is happenin...,HKChandnani,0,0,0,en
16109,2021-11-25 03:36:36,1463713202336776192,@globaltimesnews #China is the enemy of entire...,vivek_nana,0,0,1,en
16107,2021-11-25 05:56:04,1463748301111496705,@globaltimesnews No. these are majorly affecte...,hrishi4778,0,0,1,en
16106,2021-11-25 06:12:25,1463752416365408259,In #Philadelphia today 😞\nSeeing racial slurs ...,bulbulnyc,0,0,0,en
16105,2021-11-25 06:26:02,1463755839999602693,@globaltimesnews The problem with China is tha...,3170Anupam,0,0,5,en
16103,2021-11-25 06:52:38,1463762536138821637,Perfect shit colour for shitty army of #CCP wa...,BigDaddy310631,0,0,0,en
16102,2021-11-25 08:03:21,1463780333334065160,#COVID #ChineseVirus is still there,anmol0707,0,0,0,en
16101,2021-11-25 08:20:27,1463784636123811847,Considering the raise in #ChineseVirus cases i...,anmol0707,0,0,0,en
16100,2021-11-25 08:22:44,1463785210646974465,"Country after country dependent on Pfizer, Mod...",Sh07509514,0,0,2,en


In [None]:
# Tweets date histogram

dates_count = df['date'].dt.date.value_counts().sort_index()


fig = px.line(df, 
        x = dates_count.index, y = dates_count,
        title = ("Tweet counts over time on the topic of Asian hate speech"),
        labels={'x': 'Timeline', 
                'y':'Frequency'})
fig.show()

## Text:

In [None]:
stemmer = PorterStemmer() #stemming

In [None]:
df = df.drop_duplicates('text')

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
def tweet_cleaning(text):
    text = text.lower() #lower cases for sentimental analysis
    text = re.sub(r'https?:\/\/\S+','',text) #remove hyperlinks
    text = re.sub(r'#','',text) #remove hashtag symbols
    text = re.sub(r'@[a-z0-9]+','',text) #remove @mentions
    text = text = re.sub(r'[^^a-zA-Z0-9\s]','',text) #remove punctuations/special characters
    text = re.sub(r'(RT|rt|FAV|fav|VIA|via)', '', text) #remove all retweets, faves reference in content
    text_tokens = word_tokenize(text)
    text = [w for w in text_tokens if not w in stop_words] #keep words in tokenization, remove stopwords
    return " ".join(text)

In [None]:
df['text'] = df['text'].apply(tweet_cleaning)

In [None]:
df['text'].iloc[1]

'horrifying paly grew bloomington amp large number foreign students university made feel child town welcoming multiculturalism horrifying problem everywhere country stopasianhate stopaapihate'

In [None]:
df.head(10)

Unnamed: 0,date,tweet_id,text,user_name,reply_count,retweet_count,like_count,language
0,2023-01-15 16:19:28,1614658536038105088,hate crime needs charged suchstories like nati...,speedkitty,0,0,0,en
1,2023-01-15 15:52:49,1614651830130118659,horrifying paly grew bloomington amp large num...,ellen_adair,0,0,8,en
2,2023-01-15 15:32:53,1614646811352002560,another prime example media using antiasian am...,AllLeahWrote,1,0,11,en
4,2023-01-15 14:38:58,1614633243533545473,according cou documents white woman told polic...,queenkv,0,0,2,en
7,2023-01-15 10:09:35,1614565451274809349,msm quiet asian hate billie davis 56 admitted ...,FirstThemNEWS,7,50,78,en
10,2023-01-15 06:11:32,1614505546257960961,thank sharing lest think antiasian hate decrea...,dorisfchang,0,0,0,en
12,2023-01-15 05:00:00,1614487542769795073,hate asian american pacific islander aapi popu...,Respond2Racism,0,0,1,en
13,2023-01-15 04:02:16,1614473015227547649,stories media afraid discuss cover ask represe...,AsianSocialNet,0,1,2,en
15,2023-01-15 00:54:43,1614425814560567296,thanks hosting screening inspiring look linsan...,PhilTing,1,1,4,en
16,2023-01-15 00:23:59,1614418082230931457,good night tweeps getvaccinated getboosted sta...,kitchen5203,0,0,0,en


# Exporting data to .CSV file:

In [None]:
# df.to_csv('/content/drive/MyDrive/NLP project /data/cleaned_tweets.csv', encoding='utf-8', index=False)