In [1]:
import chart_studio, emoji, emot, os, re, stylecloud

import chart_studio.plotly as py
import pandas as pd
import plotly.graph_objs as go

from bertopic import BERTopic
from collections import Counter
from datetime import timedelta
from emosent import get_emoji_sentiment_rank
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy.stats import zscore
from umap import UMAP
from whatstk import WhatsAppChat, FigureBuilder


In [2]:
chart_studio.tools.set_credentials_file(username=os.getenv('CHART_STUDIO_USERNAME'),
                                        api_key=os.getenv('CHART_STUDIO_API_KEY'))

# Data preparation

In [3]:
chat = WhatsAppChat.from_source(filepath='whatsapp_chat.txt', hformat='%d/%m/%Y, %I:%M %p - %name:').df

Text cleaning

In [4]:
def clean_text(text):
    text = text.replace('<Media omitted>', '').replace('This message was deleted', '').replace('\n', ' ').strip()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+','', text)
    text = re.sub(r'\s+',' ', text)
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'([a-zA-Z])\1\1','\\1', text)
    return text.lower()

chat['clean_msg'] = chat['message'].apply(clean_text)

# replace slang words with their formal equivalents
lexicon_df = pd.read_csv('lexicon/kamus-alay/colloquial-indonesian-lexicon.csv')
lexicon_dict = dict(zip(lexicon_df.slang, lexicon_df.formal))
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([lexicon_dict.get(word, word) for word in x.split()]))

# remove stop words
with open('lexicon/ID-Stopwords/id.stopwords.02.01.2016.txt', 'r') as f:
    stop_words = f.read().splitlines()
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg
561,2023-02-04 20:57:00,aris,sudahh kak 🙏,sudahh kak
136,2023-01-14 16:41:00,kajal,trimakasii sudah aktif yaa semoga bermanfaaatt...,trimakasii aktif ya semoga bermanfatt lecturenya
1880,2023-04-01 12:18:00,kajal,Pdhl gak ada yg typing wkwkwkwk,typing wkwkwkwk
2092,2023-04-13 21:19:00,adhit,Wkwkw,wkwkw
456,2023-01-28 11:50:00,ely,Apakah aman semuanya?? 👀,aman


Handling emoticon

In [5]:
emot_obj = emot.emot()

chat['message'].apply(lambda x: '' if re.compile(r'http\S+').search(x) else ''.join(emot_obj.emoticons(x)['value'])
                      ).value_counts().rename_axis('emoticon').reset_index(name='count').query("emoticon != ''")[['emoticon']]

Unnamed: 0,emoticon
1,:(
2,>:(
3,:)
4,:D


In [6]:
chat['message'] = chat['message'].replace({':\(': '☹️', '>:\(': '😠',
                                           ':\)': '🙂', ':D': '😃'
                                           }, regex=True)

Feature engineering

In [7]:
# emoji extraction
chat['emoji'] = chat['message'].apply(lambda x: ''.join(c for c in x if c in emoji.EMOJI_DATA))

# shift all dates back by one hour (UTC+7)
chat['date'] = chat['date'] - timedelta(hours=1)

# date extraction
chat['hour'] = chat['date'].dt.hour
chat['day_name'] = chat['date'].dt.day_name()
chat['week'] = (chat['date'] - pd.Timestamp('2023-01-09')).dt.days // 7

# module creation
chat['module_name'] = chat.week.map({0: 'Introduction & Orientation',
                                     1: 'Understanding Business Problems',
                                     2: 'Spreadsheet & Statistics', 3: 'Spreadsheet & Statistics',
                                     4: 'SQL', 5: 'SQL',
                                     6: 'Python', 7: 'Python', 8: 'Python',
                                     9: 'Group Final Project Preparation Week',
                                     10: 'Data Visualization', 11: 'Data Visualization',
                                     12: 'Data Communication',
                                     13: 'Pitching Week'
                                     }).fillna('')

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg,emoji,hour,day_name,week,module_name
174,2023-01-16 21:16:00,erland,kerjaanku mikirn ka jalit,kerjaanku mikirn kak jalit,,21,Monday,1,Understanding Business Problems
1486,2023-03-24 21:31:00,aris,@kajal,kajal,,21,Friday,10,Data Visualization
1181,2023-03-01 18:39:00,aris,Yang ngajar kelasnya kak jal sampe jam 12 @kajal,mengajar kelasnya kak jal jam kajal,,18,Wednesday,7,Python
855,2023-02-21 21:14:00,erland,aku mau minta kajal tlg jelasin one on one,kajal tolong jelasin one on one,,21,Tuesday,6,Python
854,2023-02-21 21:13:00,rika,Hehehe,hehehe,,21,Tuesday,6,Python


# Exploratory data analysis

## Total messages

In [8]:
print(f'{chat.shape[0]} in total number of messages sent',
      f'\nFirst message sent at {chat.date.min()} by {chat.username[chat.date.idxmin()]}',
      f'\nLast message sent at {chat.date.max()} by {chat.username[chat.date.idxmax()]}')

2129 in total number of messages sent 
First message sent at 2023-01-06 14:38:00 by kajal 
Last message sent at 2023-04-17 22:43:00 by valeri


In [9]:
chat.groupby('username').agg({'message': 'count',
                              'emoji': lambda x: ' '.join(set(emoji for emojis in x.dropna() for emoji in emojis))
                              }).sort_values(by='message', ascending=False)

Unnamed: 0_level_0,message,emoji
username,Unnamed: 1_level_1,Unnamed: 2_level_1
kajal,626,🥳 ❤ 😃 🥰 😆 🥹 ☹ 🫶 😈
valeri,459,🔥 🥳 🤪 🦨 👍 👋 😌 😆 ☹ 🤨 🙏 🤡 😂 🥰 👀 😊 🤩 👌 😭 😝 ❤ 😒 😡 😘 🙃
erland,229,
aris,228,🤣 🙏 💪 🥳 😅 😁 🎉 👀 🏻 🤔 👍
ely,165,🤣 🥳 💨 😫 🛵 🙂 ☺ 🫠 👽 😌 🥲 ☹ 🫶 🤭 😂 👀 😅 🫥 🤧 🥹
alit_jak,126,☹
adhit,80,🤣 🙏 😅 😆 🥊 😁 🥲 ☄ 🤲 😄 🤸 💄 👍 🤩 🍌
ichsan,54,🔥 🙏 👍 😁 😌 🥲 😃 🥹 🏻 😅
rika,47,😂 👍 🥰
alex,35,🙏 🥳 😬 🥲 😂 😊


In [10]:
pd.DataFrame(Counter([emoji for message in chat.emoji for emoji in message]).most_common(),
             columns=['emoji', 'count'],
             index=range(1, len(Counter([emoji for message in chat.emoji for emoji in message]).most_common())+1)
             ).head()

Unnamed: 0,emoji,count
1,😂,69
2,👍,37
3,🙏,34
4,🥰,33
5,🥳,31


Message length boxplot

In [11]:
fig = FigureBuilder(chat.assign(message=chat['message'].apply(lambda x: ''.join([' ' for i in range(len(x.split())) if x != '<Media omitted>'])))
                    ).user_msg_length_boxplot(title=None, xlabel=None)
py.plot(fig, filename='Message length boxplot', auto_open=False)
fig

### Word cloud

In [12]:
stylecloud.gen_stylecloud(' '.join(chat['clean_msg']),
                          icon_name= 'fab fa-whatsapp',
                          colors= ['#25D366', '#128C7E', '#075E54'],
                          random_state= 13
                          )

*stylecloud.png*<br>
![stylecloud.png](stylecloud.png)

## Message activity

All users interventions count

In [13]:
fig = FigureBuilder(chat).user_interventions_count_linechart(title=None, xlabel=None, all_users=True)
py.plot(fig, filename='All users interventions count', auto_open=False)
fig

Cumulative interventions count

In [14]:
fig = FigureBuilder(chat).user_interventions_count_linechart(cumulative=True, title=None, xlabel=None)
py.plot(fig, filename='Cumulative interventions count', auto_open=False)
fig

Standardized interventions heatmap by module

In [15]:
pivot = pd.pivot_table(chat[chat.module_name != ''], index='module_name', columns='username', values='message', aggfunc='count').fillna(0).apply(zscore)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(yaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Introduction & Orientation',
                                                                       'Understanding Business Problems',
                                                                       'Spreadsheet & Statistics', 'SQL',
                                                                       'Python', 'Group Final Project Preparation Week',
                                                                       'Data Visualization', 'Data Communication',
                                                                       'Pitching Week'
                                                                       ]})
py.plot(fig, filename='Standardized interventions heatmap by module', auto_open=False)
fig

Interventions heatmap by day & hour

In [16]:
pivot = pd.pivot_table(chat, index='hour', columns='day_name', values='message', aggfunc='count').fillna(0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions at %{y}-hour<extra>%{z}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(xaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Monday', 'Tuesday', 'Wednesday',
                                                                       'Thursday', 'Friday', 'Saturday', 'Sunday']})
py.plot(fig, filename='Interventions heatmap by day & hour', auto_open=False)
fig

Hourly user activity

In [17]:
fig = FigureBuilder(chat).user_interventions_count_linechart(date_mode='hour', title=None, xlabel=None)
py.plot(fig, filename='Hourly user activity', auto_open=False)
fig

Daily user activity

In [18]:
fig = FigureBuilder(chat).user_interventions_count_linechart(date_mode='weekday',
                                                             title=None,
                                                             xlabel=None).update_layout(xaxis={'tickvals': [0, 1, 2, 3, 4, 5, 6],
                                                                                               'ticktext': ['Monday', 'Tuesday', 'Wednesday',
                                                                                                            'Thursday', 'Friday', 'Saturday', 'Sunday']})
py.plot(fig, filename='Daily user activity', auto_open=False)
fig

## User interaction

In [19]:
fig = FigureBuilder(chat).user_message_responses_heatmap(title=None)
py.plot(fig, filename='User interaction heatmap', auto_open=False)
fig

User interaction flow

In [20]:
fig = FigureBuilder(chat).user_message_responses_flow(title=None)
py.plot(fig, filename='User interaction flow', auto_open=False)
fig

# Sentiment analysis

Sentiment labels

In [21]:
# create a function to get the sentiment score of an emoji
def emosent_score(emoji, u=False):
    score, count, em = 0, 0, []
    for e in set(emoji):
        try:
            score += get_emoji_sentiment_rank(e)['sentiment_score']
            count += 1
        except:
            em.append(e)
    if u:
        return em 
    return score/count if count > 0 else score

# initialize the VADER sentiment analyzer with custom lexicon
pos, neg = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()

pos.lexicon.clear()
neg.lexicon.clear()

pos.lexicon.update(pd.read_table('lexicon/InSet/positive.tsv').set_index('word').to_dict()['weight'])
neg.lexicon.update(pd.read_table('lexicon/InSet/negative.tsv').set_index('word').to_dict()['weight'])

chat['sentiment'] = chat.apply(lambda x: (neg.polarity_scores(
    x['clean_msg'])['compound'] + pos.polarity_scores(
    x['clean_msg'])['compound'] + (
    0 if (score:=emosent_score(x['emoji'])) == 0 else score)
    ) / ((2 if score != 0 else 1) if (neg.polarity_scores(
    x['clean_msg'])['compound'] + pos.polarity_scores(
    x['clean_msg'])['compound']) != 0 else 1), axis=1)

# extract the compound score and label the sentiment
chat['sentiment'] = chat['sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg,emoji,hour,day_name,week,module_name,sentiment
401,2023-01-25 22:06:00,valeri,<Media omitted>,,,22,Wednesday,2,Spreadsheet & Statistics,neutral
1818,2023-03-16 19:54:00,aris,Hehe aku belum,hehe,,19,Thursday,9,Group Final Project Preparation Week,neutral
1765,2023-03-04 11:42:00,ichsan,Mau request apa di nilai2nya ? Hahaha,request nilainya hahaha,,11,Saturday,7,Python,neutral
1371,2023-03-08 21:14:00,valeri,"Kak Jal cerita dong, jujur aja waktu jadi stud...",kak jal cerita jujur student perasaannya,,21,Wednesday,8,Python,positive
599,2023-02-09 15:09:00,adhit,Atau poster mario sutoo,poster mario sutoo,,15,Thursday,4,SQL,neutral


Sentiment distribution

In [22]:
fig = go.Figure(data=go.Pie(labels=chat.groupby('sentiment').count()[['message']].reset_index()['sentiment'],
                            values=chat.groupby('sentiment').count()[['message']].reset_index()['message'],
                            hole=.4, marker=dict(colors=['#075E54','#dcf8c6', '#25D366', ])))
py.plot(fig, filename='Sentiment distribution', auto_open=False)
fig

Standardized user message sentiment heatmap

In [23]:
pivot = pd.pivot_table(chat, index='sentiment', columns='username', values='message', aggfunc='count').apply(zscore)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap])
py.plot(fig, filename='User message sentiment heatmap', auto_open=False)
fig

Module sentiment heatmap

In [24]:
pivot = pd.pivot_table(chat[chat.module_name != ''], index='module_name', columns='sentiment', values='message', aggfunc='count')
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions count<extra>%{z}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(yaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Introduction & Orientation',
                                                                       'Understanding Business Problems',
                                                                       'Spreadsheet & Statistics', 'SQL',
                                                                       'Python', 'Group Final Project Preparation Week',
                                                                       'Data Visualization', 'Data Communication',
                                                                       'Pitching Week'
                                                                       ]})
py.plot(fig, filename='Module sentiment heatmap', auto_open=False)
fig

# Topic modeling

Initiate and fit the model

In [25]:
model = BERTopic(umap_model=UMAP(n_neighbors=15,
                                 n_components=5,
                                 min_dist=0.0,
                                 metric='cosine',
                                 random_state=0),
                 language='multilingual',
                 calculate_probabilities=True,
                 nr_topics='auto')

topics, probabilities = model.fit_transform(
    list(chat[chat.clean_msg != ''].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), ''))['message'].values))

11268 | INFO | Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
11268 | INFO | Use pytorch device: cpu


## Extracting topics

In [26]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,714,-1_kak_ya_di_aku
1,0,102,0_wkwkwk_wkwkw_wkwkkw_wkkwkwk
2,1,96,1_brandon_tumben_kokk_gais
3,2,80,2_thank_you_congrats_makasih
4,3,80,3_menti_leave_yg_ngobrolin
5,4,59,4_aku_lg_jg_bs
6,5,58,5_jam_game_as_besok
7,6,52,6_kelas_lecture_pre_student
8,7,52,7_keren_ok_hebat_banget
9,8,50,8_link_ini_google_video


Topic terms bar chart

In [27]:
fig = model.visualize_barchart(top_n_topics=12, title='')
py.plot(fig, filename='Topic terms bar chart', auto_open=False)
fig

Top 10 terms for topic 0

In [28]:
pd.DataFrame(model.get_topic(0)).rename(columns={0: 'keyword', 1:'score'})

Unnamed: 0,keyword,score
0,wkwkwk,0.111344
1,wkwkw,0.073569
2,wkwkkw,0.071875
3,wkkwkwk,0.060446
4,wkwkwkkw,0.057278
5,hiyaaa,0.048409
6,wkwkwkwk,0.047572
7,wkwkwkw,0.043911
8,wkwk,0.038416
9,wkwkwkwkwkw,0.035172


Topics over time

In [29]:
topics_over_time = model.topics_over_time(list(chat[chat.clean_msg != ''].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), ''))['message'].values),
    list(chat[chat.clean_msg != '']['date'].values))
fig = model.visualize_topics_over_time(topics_over_time, top_n_topics=10, title='')
py.plot(fig, filename='Topics over time', auto_open=False)
fig

### Topic similarity

Intertopic distance map

In [30]:
fig = model.visualize_topics(title='')
py.plot(fig, filename='Intertopic distance map', auto_open=False)
fig

Topic probabilities with random message

In [31]:
sample = chat[(chat.message != '<Media omitted>') & (chat.clean_msg != '')].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), '')).reset_index().sample()
print('Sample message:', (sample['message'].iloc[0]))
fig = model.visualize_distribution(probabilities[sample['message'].index[0]], title='')
py.plot(fig, filename='Topic probabilities with random message', auto_open=False)
fig

Sample message: pakai skype


Similarity matrix

In [32]:
fig = model.visualize_heatmap(title='')
py.plot(fig, filename='Similarity matrix', auto_open=False)
fig

Topic hierarchy

In [33]:
fig = model.visualize_hierarchy(title='')
py.plot(fig, filename='Topic hierarchy', auto_open=False)
fig

Term score decline

In [34]:
fig = model.visualize_term_rank(title='')
py.plot(fig, filename='Term score decline', auto_open=False)
fig