In [1]:
import chart_studio, emoji, emot, os, re, stylecloud

import chart_studio.plotly as py
import pandas as pd
import plotly.graph_objs as go

from bertopic import BERTopic
from collections import Counter
from datetime import timedelta
from emosent import get_emoji_sentiment_rank
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from plotly.subplots import make_subplots
from umap import UMAP
from whatstk import WhatsAppChat, FigureBuilder


In [2]:
chart_studio.tools.set_credentials_file(username=os.getenv('CHART_STUDIO_USERNAME'),
                                        api_key=os.getenv('CHART_STUDIO_API_KEY'))

# Data preparation

In [3]:
chat = WhatsAppChat.from_source(filepath='whatsapp_chat.txt', hformat='%d/%m/%Y, %I:%M %p - %name:').df
chat

Unnamed: 0,date,username,message
0,2023-01-06 15:38:00,kajal,"Haii semua, selamat datang ke team 6 section p..."
1,2023-01-06 15:39:00,kajal,Ini linkedin aku yaa feel free to add and tany...
2,2023-01-06 15:39:00,kajal,https://www.linkedin.com/in/jalita-sendiko
3,2023-01-06 15:39:00,valeri,Hai haii!!! Terima kasih kak 🥰🥰🥰 Perkenalin a...
4,2023-01-06 15:40:00,ely,"Hai, Kak Jelita!! ☺️"
...,...,...,...
2124,2023-04-15 17:20:00,valeri,Jangan lupa si double khalid
2125,2023-04-15 17:20:00,kajal,and duo alit!!
2126,2023-04-15 17:20:00,valeri,Thank youuuu
2127,2023-04-16 21:40:00,aris,Congratsss Vale dan duo alitt Mantabbb menang...


Text cleaning

In [4]:
def clean_text(text):
    text = text.replace('<Media omitted>', '').replace('This message was deleted', '').replace('\n', ' ').strip()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+','', text)
    text = re.sub(r'\s+',' ', text)
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'([a-zA-Z])\1\1','\\1', text)
    return text.lower()

chat['clean_msg'] = chat['message'].apply(clean_text)

# replace slang words with their formal equivalents
lexicon_df = pd.read_csv('lexicon/kamus-alay/colloquial-indonesian-lexicon.csv')
lexicon_dict = dict(zip(lexicon_df.slang, lexicon_df.formal))
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([lexicon_dict.get(word, word) for word in x.split()]))

# remove stop words
with open('lexicon/ID-Stopwords/id.stopwords.02.01.2016.txt', 'r') as f:
    stop_words = f.read().splitlines()
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg
737,2023-02-16 22:10:00,kajal,isi dengan baik dan seperti masyarakat teladan...,isi masyarakat teladan kasar
1573,2023-04-11 13:14:00,kajal,disuruh apa jg ngeles mulu wkwkw,disuruh ngeles mulu wkwkw
625,2023-02-13 21:26:00,kajal,@mirza @alit_gor ga masuk kahh?,mirza alitgor masuk kahh
2051,2023-04-11 22:02:00,adhit,Udah boleh leave tidur ka jal?,leave tidur kak jal
1887,2023-04-03 21:15:00,alit_jak,udh gue benerin tuh,gue benerin tuh


Handling emoticon

In [5]:
emot_obj = emot.emot()

chat['message'].apply(lambda x: '' if re.compile(r'http\S+').search(x) else ''.join(emot_obj.emoticons(x)['value'])
                      ).value_counts().rename_axis('emoticon').reset_index(name='count').query("emoticon != ''")[['emoticon']]

Unnamed: 0,emoticon
1,:(
2,>:(
3,:)
4,:D


In [6]:
chat['message'] = chat['message'].replace({':\(': '☹️', '>:\(': '😠',
                                           ':\)': '🙂', ':D': '😃'
                                           }, regex=True)

Feature engineering

In [7]:
# emoji extraction
chat['emoji'] = chat['message'].apply(lambda x: ''.join(c for c in x if c in emoji.EMOJI_DATA))

# shift all dates back by one hour (UTC+7)
chat['date'] = chat['date'] - timedelta(hours=1)

# date extraction
chat['hour'] = chat['date'].dt.hour
chat['day_name'] = chat['date'].dt.day_name()
chat['week'] = (chat['date'] - pd.Timestamp('2023-01-09')).dt.days // 7

# module creation
chat['module_name'] = chat.week.map({0: 'Introduction & Orientation',
                                     1: 'Understanding Business Problems',
                                     2: 'Spreadsheet & Statistics', 3: 'Spreadsheet & Statistics',
                                     4: 'SQL', 5: 'SQL',
                                     6: 'Python', 7: 'Python', 8: 'Python',
                                     9: 'Group Final Project Preparation Week',
                                     10: 'Data Visualization', 11: 'Data Visualization',
                                     12: 'Data Communication',
                                     13: 'Pitching Week'
                                     }).fillna('')

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg,emoji,hour,day_name,week,module_name
1596,2023-04-11 12:17:00,aris,keknya sih itu,kayaknya sih,,12,Tuesday,13,Pitching Week
1278,2023-03-06 21:27:00,aris,Besoookk aja abis kelas,besokk habis kelas,,21,Monday,8,Python
1228,2023-03-03 10:32:00,kajal,Gak ad yg mau join dah,join deh,,10,Friday,7,Python
1039,2023-02-23 16:17:00,kajal,Wkwkkw eek,wkwkkw eek,,16,Thursday,6,Python
23,2023-01-07 09:25:00,ely,<Media omitted>,,,9,Saturday,-1,


# Exploratory data analysis

## Total messages

In [8]:
chat.groupby('username').agg({'message': 'count',
                              'emoji': lambda x: ' '.join(set(emoji for emojis in x.dropna() for emoji in emojis))
                              }).sort_values(by='message', ascending=False)

Unnamed: 0_level_0,message,emoji
username,Unnamed: 1_level_1,Unnamed: 2_level_1
kajal,626,😈 🥰 ☹ 😃 ❤ 🫶 🥹 😆 🥳
valeri,459,😒 😡 😊 👋 🔥 👌 🦨 😆 😝 😌 😂 ❤ 🤩 🙃 😘 🙏 👍 ☹ 👀 🤪 🤡 🥳 😭 🥰 🤨
erland,229,
aris,228,🏻 🙏 👍 👀 💪 🤣 😅 🤔 😁 🎉 🥳
ely,165,🫥 🤣 👽 💨 😂 😌 🫠 🤭 🥲 🤧 🙂 😅 🫶 ☺ ☹ 👀 🥹 🥳 😫 🛵
alit_jak,126,☹
adhit,80,🍌 🙏 👍 😅 🥊 🤩 🤸 💄 🥲 ☄ 🤣 😄 😁 🤲 😆
ichsan,54,🏻 🙏 👍 😌 🔥 😃 🥲 😅 🥹 😁
rika,47,👍 🥰 😂
alex,35,🙏 😊 😂 😬 🥲 🥳


Emoji frequency in chats

In [9]:
fig = go.Figure(data=go.Pie(labels=['Chats without emoji', 'Chats with emoji'],
                            values=chat.assign(is_emoji=chat['emoji'].apply(lambda x: True if x != '' else False)).groupby('is_emoji').count()[['message']].reset_index()['message'],
                            hole=.4, marker=dict(colors=[ '#25D366', '#075E54',])))
fig.update_traces(hoverinfo='label+value')
py.plot(fig, filename='Emoji frequency in chats', auto_open=False)
fig

In [10]:
pd.DataFrame(Counter([emoji for message in chat.emoji for emoji in message]).most_common(),
             columns=['emoji', 'count'],
             index=range(1, len(Counter([emoji for message in chat.emoji for emoji in message]).most_common())+1)
             ).head()

Unnamed: 0,emoji,count
1,😂,69
2,👍,37
3,🙏,34
4,🥰,33
5,🥳,31


Message length boxplot

In [11]:
fig = FigureBuilder(chat.assign(message=chat['message'].apply(lambda x: ''.join([' ' for i in range(len(x.split())) if x != '<Media omitted>'])))
                    ).user_msg_length_boxplot(title=None, xlabel=None)
py.plot(fig, filename='Message length boxplot', auto_open=False)
fig

### Word cloud

In [12]:
stylecloud.gen_stylecloud(' '.join(chat['clean_msg']),
                          icon_name= 'fab fa-whatsapp',
                          colors= ['#25D366', '#128C7E', '#075E54'],
                          random_state= 13
                          )

*stylecloud.png*<br>
![stylecloud.png](stylecloud.png)

## Message activity

All users interventions count

In [13]:
fig = FigureBuilder(chat).user_interventions_count_linechart(title=None, xlabel=None, all_users=True)
py.plot(fig, filename='All users interventions count', auto_open=False)
fig

Cumulative interventions count

In [14]:
fig = FigureBuilder(chat).user_interventions_count_linechart(cumulative=True, title=None, xlabel=None)
py.plot(fig, filename='Cumulative interventions count', auto_open=False)
fig

Interventions heatmap by module

In [15]:
pivot = pd.pivot_table(chat[chat.module_name != ''],
                       index='module_name',
                       columns='username',
                       values='message', aggfunc='count').fillna(0).apply(lambda x: x/x.sum(), axis=0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(yaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Introduction & Orientation',
                                                                       'Understanding Business Problems',
                                                                       'Spreadsheet & Statistics', 'SQL',
                                                                       'Python', 'Group Final Project Preparation Week',
                                                                       'Data Visualization', 'Data Communication',
                                                                       'Pitching Week'
                                                                       ]})
py.plot(fig, filename='Interventions heatmap by module', auto_open=False)
fig

Interventions heatmap by day & hour

In [16]:
pivot = pd.pivot_table(chat, index='hour', columns='day_name', values='message', aggfunc='count').fillna(0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions at %{y}-hour<extra>%{z}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(xaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Monday', 'Tuesday', 'Wednesday',
                                                                       'Thursday', 'Friday', 'Saturday', 'Sunday']})
py.plot(fig, filename='Interventions heatmap by day & hour', auto_open=False)
fig

Hourly user activity

In [17]:
fig = FigureBuilder(chat).user_interventions_count_linechart(date_mode='hour', title=None, xlabel=None)
py.plot(fig, filename='Hourly user activity', auto_open=False)
fig

Daily user activity

In [18]:
fig = FigureBuilder(chat).user_interventions_count_linechart(date_mode='weekday',
                                                             title=None,
                                                             xlabel=None).update_layout(xaxis={'tickvals': [0, 1, 2, 3, 4, 5, 6],
                                                                                               'ticktext': ['Monday', 'Tuesday', 'Wednesday',
                                                                                                            'Thursday', 'Friday', 'Saturday', 'Sunday']})
py.plot(fig, filename='Daily user activity', auto_open=False)
fig

## User interaction

In [19]:
fig = FigureBuilder(chat).user_message_responses_heatmap(title=None)
py.plot(fig, filename='User interaction heatmap', auto_open=False)
fig

User interaction flow

In [20]:
fig = FigureBuilder(chat).user_message_responses_flow(title=None)
py.plot(fig, filename='User interaction flow', auto_open=False)
fig

# Sentiment analysis

Sentiment labels

In [21]:
# create a function to get the sentiment score of an emoji
def emosent_score(emoji):
    score, count = 0, 0
    for e in set(emoji):
        try:
            score += get_emoji_sentiment_rank(e)['sentiment_score']
            count += 1
        except:
            continue
    return score/count if count != 0 else score

# initialize the VADER sentiment analyzer with custom lexicon
pos, neg = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()

pos.lexicon.clear()
neg.lexicon.clear()

pos.lexicon.update(pd.read_table('lexicon/InSet/positive.tsv').set_index('word').to_dict()['weight'])
neg.lexicon.update(pd.read_table('lexicon/InSet/negative.tsv').set_index('word').to_dict()['weight'])

chat['sentiment'] = chat.apply(lambda x: (neg.polarity_scores(
    x['clean_msg'])['compound'] + pos.polarity_scores(
    x['clean_msg'])['compound'] + (
    0 if (score:=emosent_score(x['emoji'])) == 0 else score)
    ) / ((2 if score != 0 else 1) if (neg.polarity_scores(
    x['clean_msg'])['compound'] + pos.polarity_scores(
    x['clean_msg'])['compound']) != 0 else 1), axis=1)

# extract the compound score and label the sentiment
chat['sentiment'] = chat['sentiment'].apply(lambda x: 'positive' if x > 0.05 else ('negative' if x < -0.05 else 'neutral'))

chat.sample(5)

Unnamed: 0,date,username,message,clean_msg,emoji,hour,day_name,week,module_name,sentiment
428,2023-01-26 11:01:00,kajal,Btww ini linkedin aku yaa buat yg blm connect ...,btww linkedin ya connect thank youu,,11,Thursday,2,Spreadsheet & Statistics,positive
1197,2023-03-01 20:27:00,kajal,knp didelet woy,didelet woy,,20,Wednesday,7,Python,neutral
1307,2023-03-07 20:07:00,erland,@alex,alex,,20,Tuesday,8,Python,neutral
781,2023-02-18 15:24:00,valeri,Makasih kak Gor 😌 semoga ntar anaknya jangan k...,terima kasih kak gor semoga entar anaknya kaya...,😌😂😂😂,15,Saturday,5,SQL,positive
1021,2023-02-23 16:08:00,kajal,ntr dah story timenya,entar deh story timenya,,16,Thursday,6,Python,neutral


Sentiment distribution

In [22]:
fig = go.Figure(data=go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                            values=chat.groupby('sentiment').count()[['message']].reset_index()['message'],
                            hole=.4, marker=dict(colors=['#075E54','#dcf8c6', '#25D366', ])))
fig.update_traces(hoverinfo='label+value')
py.plot(fig, filename='Sentiment distribution', auto_open=False)
fig

In [23]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

fig.add_trace(go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                     values=chat[chat.emoji == ''].groupby('sentiment').count()[['message']].reset_index()['message'],
                     marker=dict(colors=['#075E54','#dcf8c6', '#25D366', ]),
                     title=dict(text='<b>without</b>', font=dict(size=16))), 1, 1)
fig.add_trace(go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                     values=chat[chat.emoji != ''].groupby('sentiment').count()[['message']].reset_index()['message'],
                     hole=.4, marker=dict(colors=['#075E54','#dcf8c6', '#25D366', ]),
                     title=dict(text='<b>with</b>', font=dict(size=16))), 1, 2)

fig.update_traces(hole=.4, hoverinfo='label+value')
py.plot(fig, filename='Sentiment of chat with & without emojis', auto_open=False)
fig

User message sentiment heatmap

In [24]:
pivot = pd.pivot_table(chat, index='sentiment',
                       columns='username',
                       values='message',
                       aggfunc='count').apply(lambda x: x/x.sum(), axis=0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap])
py.plot(fig, filename='User message sentiment heatmap', auto_open=False)
fig

Module sentiment heatmap

In [25]:
pivot = pd.pivot_table(chat[chat.module_name != ''],
                       index='module_name',
                       columns='sentiment',
                       values='message',
                       aggfunc='count').apply(lambda x: x/x.sum(), axis=0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap]).update_layout(yaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Introduction & Orientation',
                                                                       'Understanding Business Problems',
                                                                       'Spreadsheet & Statistics', 'SQL',
                                                                       'Python', 'Group Final Project Preparation Week',
                                                                       'Data Visualization', 'Data Communication',
                                                                       'Pitching Week'
                                                                       ]})
py.plot(fig, filename='Module sentiment heatmap', auto_open=False)
fig

# Topic modeling

Initiate and fit the model

In [26]:
model = BERTopic(umap_model=UMAP(n_neighbors=15,
                                 n_components=5,
                                 min_dist=0.0,
                                 metric='cosine',
                                 random_state=13),
                 language='multilingual',
                 calculate_probabilities=True,
                 nr_topics='auto')

topics, probabilities = model.fit_transform(
    list(chat[chat.clean_msg != ''].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), ''))['message'].values))

11428 | INFO | Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
11428 | INFO | Use pytorch device: cpu


## Extracting topics

In [27]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,697,-1_kak_ya_aku_di
1,0,115,0_brandon_tumben_kokk_awkawk
2,1,112,1_wkwkwk_wkwkw_wkwkkw_wkkwkwk
3,2,103,2_menti_yg_iya_aja
4,3,82,3_thank_you_congrats_makasih
5,4,66,4_jam_besok_game_as
6,5,60,5_aku_lg_bs_juga
7,6,56,6_absen_gak_lupa_ga
8,7,53,7_kelas_lecture_pre_student
9,8,49,8_link_google_ini_video


Topic terms bar chart

In [28]:
fig = model.visualize_barchart(top_n_topics=12, title='')
py.plot(fig, filename='Topic terms bar chart', auto_open=False)
fig

Topics over time

In [29]:
topics_over_time = model.topics_over_time(list(chat[chat.clean_msg != ''].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), ''))['message'].values),
    list(chat[chat.clean_msg != '']['date'].values))
fig = model.visualize_topics_over_time(topics_over_time, top_n_topics=10, title='')
py.plot(fig, filename='Topics over time', auto_open=False)
fig

### Topic similarity

Intertopic distance map

In [30]:
fig = model.visualize_topics(title='')
py.plot(fig, filename='Intertopic distance map', auto_open=False)
fig

Topic probabilities with random message

In [31]:
sample = chat[(chat.message != '<Media omitted>') & (chat.clean_msg != '')].assign(
    message=chat.message.str.replace(re.compile(r'http\S+'), '')).reset_index().sample()
print('Sample message:', (sample['message'].iloc[0]))
fig = model.visualize_distribution(probabilities[sample['message'].index[0]], title='')
py.plot(fig, filename='Topic probabilities with random message', auto_open=False)
fig

Sample message: boong bgt


Similarity matrix

In [32]:
fig = model.visualize_heatmap(title='')
py.plot(fig, filename='Similarity matrix', auto_open=False)
fig

Topic hierarchy

In [33]:
fig = model.visualize_hierarchy(title='')
py.plot(fig, filename='Topic hierarchy', auto_open=False)
fig

Term score decline

In [34]:
fig = model.visualize_term_rank(title='')
py.plot(fig, filename='Term score decline', auto_open=False)
fig