In [None]:
import chart_studio, emoji, emot, os, re, stylecloud

import chart_studio.plotly as py
import pandas as pd
import plotly.graph_objs as go

from bertopic import BERTopic
from collections import Counter
from datetime import timedelta
from emosent import get_emoji_sentiment_rank
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from plotly.subplots import make_subplots
from umap import UMAP
from whatstk import WhatsAppChat, FigureBuilder


In [None]:
# Store the Chart Studio credentials; both values come from environment variables
# so that no secret is written into the notebook itself.
chart_studio.tools.set_credentials_file(
    api_key=os.getenv('CHART_STUDIO_API_KEY'),
    username=os.getenv('CHART_STUDIO_USERNAME'),
)

# Data preparation

In [None]:
# Parse the exported WhatsApp log into a DataFrame. `hformat` describes the header that
# precedes every message: day/month/year, 12-hour clock with AM/PM, then the sender name.
whatsapp_export = WhatsAppChat.from_source(
    filepath='whatsapp_chat.txt',
    hformat='%d/%m/%Y, %I:%M %p - %name:',
)
chat = whatsapp_export.df
chat

Text cleaning

In [None]:
def clean_text(text):
    """Normalise a raw chat message for lexicon lookup and sentiment scoring.

    Steps, in order:
      1. drop the WhatsApp placeholders ('<Media omitted>', 'This message was deleted')
         and turn newlines into spaces;
      2. lower-case (done early so the URL and repeated-letter rules are case-insensitive);
      3. remove URLs, digits, punctuation and underscores;
      4. collapse any run of three or more identical ASCII letters to a single letter
         (e.g. 'halooooo' -> 'halo');
      5. collapse whitespace and strip the ends (done last, because the removals above
         can leave double or trailing spaces behind).

    Args:
        text: the raw message string.

    Returns:
        The cleaned, lower-cased string; '' when nothing is left.
    """
    text = text.replace('<Media omitted>', '').replace('This message was deleted', '').replace('\n', ' ')
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]|_', '', text)
    # {2,} makes the rule cover runs of ANY length >= 3, not only exact triples
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1', text)
    return re.sub(r'\s+', ' ', text).strip()

# normalise every raw message (see clean_text)
chat['clean_msg'] = chat['message'].apply(clean_text)

# replace slang words with their formal equivalents
lexicon_df = pd.read_csv('lexicon/kamus-alay/colloquial-indonesian-lexicon.csv')
lexicon_dict = dict(zip(lexicon_df.slang, lexicon_df.formal))
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([lexicon_dict.get(word, word) for word in x.split()]))

# remove stop words
with open('lexicon/ID-Stopwords/id.stopwords.02.01.2016.txt', 'r') as f:
    stop_words = f.read().splitlines()
# membership is tested once per token of every message, so use a set for O(1) lookups;
# `stop_words` itself stays a list for anything else that may rely on it
stop_word_set = set(stop_words)
chat['clean_msg'] = chat['clean_msg'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_word_set]))

chat.sample(5)

Handling emoticons

In [None]:
emot_obj = emot.emot()
url_pattern = re.compile(r'http\S+')


def _emoticons_in(message):
    """Return the emoticons detected in `message` joined into one string.

    Messages containing a URL yield '' so they are excluded from the inventory.
    """
    if url_pattern.search(message):
        return ''
    return ''.join(emot_obj.emoticons(message)['value'])


# List the distinct emoticon strings found in the chat (most frequent first).
(chat['message']
 .apply(_emoticons_in)
 .value_counts()
 .rename_axis('emoticon')
 .reset_index(name='count')
 .query("emoticon != ''")[['emoticon']])

In [None]:
# Convert the text emoticons found above into emoji.
# '>:(' is listed BEFORE ':(' so that the shorter pattern cannot consume part of it
# (otherwise '>:(' would end up as '>☹️' and the angry face would never be produced).
# Raw strings avoid the invalid '\(' / '\)' escape sequences in normal string literals.
chat['message'] = chat['message'].replace({r'>:\(': '😠', r':\(': '☹️',
                                           r':\)': '🙂', r':D': '😃'
                                           }, regex=True)

Feature engineering

In [None]:
# emoji extraction: keep every character of the message that is a key of emoji.EMOJI_DATA
chat['emoji'] = chat['message'].apply(lambda x: ''.join(c for c in x if c in emoji.EMOJI_DATA))

# shift all timestamps back by one hour
# NOTE(review): presumably converts the export's clock to UTC+7 — confirm the source timezone.
# The untouched timestamps are kept in `date_raw` so that re-running this cell
# does not shift the dates a second time.
if 'date_raw' not in chat.columns:
    chat['date_raw'] = chat['date']
chat['date'] = chat['date_raw'] - timedelta(hours=1)

# date extraction
chat['hour'] = chat['date'].dt.hour
chat['day_name'] = chat['date'].dt.day_name()
# number of whole weeks elapsed since 2023-01-09 (negative for earlier messages)
chat['week'] = (chat['date'] - pd.Timestamp('2023-01-09')).dt.days // 7

# module creation: map the week number to a module name; weeks outside 0-13 get ''
chat['module_name'] = chat.week.map({0: 'Introduction & Orientation',
                                     1: 'Understanding Business Problems',
                                     2: 'Spreadsheet & Statistics', 3: 'Spreadsheet & Statistics',
                                     4: 'SQL', 5: 'SQL',
                                     6: 'Python', 7: 'Python', 8: 'Python',
                                     9: 'Group Final Project Preparation Week',
                                     10: 'Data Visualization', 11: 'Data Visualization',
                                     12: 'Data Communication',
                                     13: 'Pitching Week'
                                     }).fillna('')

chat.sample(5)

# Exploratory data analysis

## Total messages

In [None]:
# Messages per user plus the distinct emoji characters each user sent.
# The characters are sorted so the table is identical on every run (plain set order is not),
# and the loop variable no longer reuses the name of the `emoji` module.
chat.groupby('username').agg({'message': 'count',
                              'emoji': lambda x: ' '.join(sorted({char for emojis in x.dropna() for char in emojis}))
                              }).sort_values(by='message', ascending=False)

Emoji frequency in chats

In [None]:
# Count messages without / with at least one emoji.
# reindex() pins the order to [without, with] and inserts 0 for a missing group, so the
# hard-coded labels below can never be attached to the wrong slice.
emoji_usage_counts = (chat.groupby(chat['emoji'] != '')['message']
                      .count()
                      .reindex([False, True], fill_value=0))
fig = go.Figure(data=go.Pie(labels=['Chats without emoji', 'Chats with emoji'],
                            values=emoji_usage_counts,
                            hole=.4, marker=dict(colors=['#25D366', '#075E54'])))
fig.update_traces(hoverinfo='label+value')
py.plot(fig, filename='Emoji frequency in chats', auto_open=False)
fig

In [None]:
# Rank emoji characters by how often they occur across all messages (1-based rank index).
# The counts are computed once and reused for both the rows and the index length.
emoji_counts = Counter(char for message in chat.emoji for char in message).most_common()
pd.DataFrame(emoji_counts,
             columns=['emoji', 'count'],
             index=range(1, len(emoji_counts) + 1)
             ).head()

Message length boxplot

In [None]:
def _word_count_placeholder(message):
    """Return one space per whitespace-separated word of `message` ('' for media placeholders)."""
    if message == '<Media omitted>':
        return ''
    return ' ' * len(message.split())


# Each message is swapped for a string whose length equals its word count
# (presumably so the box plot reports words rather than characters — verify against whatstk).
length_view = chat.assign(message=chat['message'].apply(_word_count_placeholder))
fig = FigureBuilder(length_view).user_msg_length_boxplot(title=None, xlabel=None)
py.plot(fig, filename='Message length boxplot', auto_open=False)
fig

### Word cloud

In [None]:
# Build one corpus string from all cleaned messages and render it as a WhatsApp-shaped cloud.
cloud_corpus = ' '.join(chat['clean_msg'])
stylecloud.gen_stylecloud(
    cloud_corpus,
    random_state=13,
    colors=['#25D366', '#128C7E', '#075E54'],
    icon_name='fab fa-whatsapp',
)

*stylecloud.png*<br>
![stylecloud.png](stylecloud.png)

## Message activity

All users interventions count

In [None]:
# Interventions over time, requested with all_users=True.
fig = FigureBuilder(chat).user_interventions_count_linechart(
    all_users=True,
    title=None,
    xlabel=None,
)
py.plot(fig, auto_open=False, filename='All users interventions count')
fig

Cumulative interventions count

In [None]:
# Interventions over time, requested with cumulative=True.
fig = FigureBuilder(chat).user_interventions_count_linechart(
    title=None,
    xlabel=None,
    cumulative=True,
)
py.plot(fig, auto_open=False, filename='Cumulative interventions count')
fig

Interventions heatmap by module

In [None]:
# Message counts per (module, user); messages outside the mapped weeks are excluded.
module_messages = chat[chat.module_name != '']
module_counts = module_messages.pivot_table(index='module_name',
                                            columns='username',
                                            values='message',
                                            aggfunc='count').fillna(0)
# Normalise each user's column so it sums to 1 (share of that user's messages per module).
pivot = module_counts.apply(lambda col: col / col.sum(), axis=0)

heatmap = go.Heatmap(x=pivot.columns,
                     y=pivot.index,
                     z=pivot.values,
                     colorscale='Greens',
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>')

# Show the modules in course order rather than alphabetically.
module_order = ['Introduction & Orientation',
                'Understanding Business Problems',
                'Spreadsheet & Statistics', 'SQL',
                'Python', 'Group Final Project Preparation Week',
                'Data Visualization', 'Data Communication',
                'Pitching Week']
fig = go.Figure(data=[heatmap])
fig.update_layout(yaxis={'categoryorder': 'array', 'categoryarray': module_order})
py.plot(fig, filename='Interventions heatmap by module', auto_open=False)
fig

Interventions heatmap by day & hour

In [None]:
# Message counts per (hour of day, weekday); combinations with no message become 0.
pivot = chat.pivot_table(index='hour',
                         columns='day_name',
                         values='message',
                         aggfunc='count').fillna(0)

heatmap = go.Heatmap(x=pivot.columns,
                     y=pivot.index,
                     z=pivot.values,
                     colorscale='Greens',
                     hovertemplate='Interventions at %{y}-hour<extra>%{z}</extra>')

# Order the weekday axis Monday..Sunday instead of alphabetically.
weekday_order = ['Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig = go.Figure(data=[heatmap])
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': weekday_order})
py.plot(fig, filename='Interventions heatmap by day & hour', auto_open=False)
fig

Hourly user activity

In [None]:
# Interventions per user, requested with date_mode='hour'.
fig = FigureBuilder(chat).user_interventions_count_linechart(
    title=None,
    xlabel=None,
    date_mode='hour',
)
py.plot(fig, auto_open=False, filename='Hourly user activity')
fig

Daily user activity

In [None]:
# Interventions per user, requested with date_mode='weekday'.
fig = FigureBuilder(chat).user_interventions_count_linechart(date_mode='weekday', title=None, xlabel=None)

# Label the tick positions 0..6 with weekday names.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday',
                  'Thursday', 'Friday', 'Saturday', 'Sunday']
fig.update_layout(xaxis={'tickvals': list(range(7)), 'ticktext': weekday_labels})
py.plot(fig, filename='Daily user activity', auto_open=False)
fig

## User interaction

In [None]:
# Heatmap of message responses between users, as built by whatstk.
responses_builder = FigureBuilder(chat)
fig = responses_builder.user_message_responses_heatmap(title=None)
py.plot(fig, auto_open=False, filename='User interaction heatmap')
fig

User interaction flow

In [None]:
# Flow diagram of message responses between users, as built by whatstk.
flow_builder = FigureBuilder(chat)
fig = flow_builder.user_message_responses_flow(title=None)
py.plot(fig, auto_open=False, filename='User interaction flow')
fig

# Sentiment analysis

Sentiment labels

In [None]:
# create a function to get the sentiment score of an emoji
def emosent_score(emoji):
    """Average the emosent sentiment score over the distinct characters of `emoji`.

    Args:
        emoji: string of emoji characters (may be empty).

    Returns:
        The mean 'sentiment_score' of the characters that have a ranking,
        or 0 when none of them does (including the empty string).
    """
    score, count = 0, 0
    for e in set(emoji):
        try:
            score += get_emoji_sentiment_rank(e)['sentiment_score']
            count += 1
        except KeyError:
            # no ranking for this character: skip it, but let every other
            # error (including KeyboardInterrupt) propagate instead of hiding it
            continue
    return score/count if count != 0 else score

# initialize the VADER sentiment analyzer with custom lexicon
pos, neg = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()

# drop the bundled lexicons so only the InSet word weights below are used
pos.lexicon.clear()
neg.lexicon.clear()

pos.lexicon.update(pd.read_table('lexicon/InSet/positive.tsv').set_index('word').to_dict()['weight'])
neg.lexicon.update(pd.read_table('lexicon/InSet/negative.tsv').set_index('word').to_dict()['weight'])


def _message_sentiment(row):
    """Combine the text and emoji sentiment of one chat row into a single score.

    The text score is the sum of the negative- and positive-lexicon compound scores.
    When both the text score and the emoji score are non-zero they are averaged;
    otherwise the result is simply whichever one is non-zero (or 0).
    Each analyzer is evaluated once per row.
    """
    text_score = (neg.polarity_scores(row['clean_msg'])['compound']
                  + pos.polarity_scores(row['clean_msg'])['compound'])
    emoji_score = emosent_score(row['emoji'])
    if text_score != 0 and emoji_score != 0:
        return (text_score + emoji_score) / 2
    return (text_score + emoji_score) / 1


def _sentiment_label(score):
    """Map a numeric score to 'positive' (> 0.05), 'negative' (< -0.05) or 'neutral'."""
    if score > 0.05:
        return 'positive'
    if score < -0.05:
        return 'negative'
    return 'neutral'


chat['sentiment'] = chat.apply(_message_sentiment, axis=1)

# extract the compound score and label the sentiment
chat['sentiment'] = chat['sentiment'].apply(_sentiment_label)

chat.sample(5)

Sentiment distribution

In [None]:
# Messages per sentiment label.
# reindex() pins the order to negative/neutral/positive and inserts 0 for an absent label,
# so the hard-coded labels and colours below always line up with the right counts.
sentiment_counts = (chat.groupby('sentiment')['message']
                    .count()
                    .reindex(['negative', 'neutral', 'positive'], fill_value=0))
fig = go.Figure(data=go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                            values=sentiment_counts,
                            hole=.4, marker=dict(colors=['#075E54', '#dcf8c6', '#25D366'])))
fig.update_traces(hoverinfo='label+value')
py.plot(fig, filename='Sentiment distribution', auto_open=False)
fig

In [None]:
def _sentiment_counts(frame):
    """Count messages per sentiment label in a fixed negative/neutral/positive order.

    A label that does not occur in `frame` gets 0 instead of being dropped, so the counts
    always line up with the hard-coded pie labels (a subset can easily lack one label).
    """
    return (frame.groupby('sentiment')['message']
            .count()
            .reindex(['negative', 'neutral', 'positive'], fill_value=0))


fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

# left: messages without any emoji
fig.add_trace(go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                     values=_sentiment_counts(chat[chat.emoji == '']),
                     marker=dict(colors=['#075E54', '#dcf8c6', '#25D366']),
                     title=dict(text='<b>without</b>', font=dict(size=16))), 1, 1)
# right: messages containing at least one emoji
fig.add_trace(go.Pie(labels=['Negative', 'Neutral', 'Positive'],
                     values=_sentiment_counts(chat[chat.emoji != '']),
                     hole=.4, marker=dict(colors=['#075E54', '#dcf8c6', '#25D366']),
                     title=dict(text='<b>with</b>', font=dict(size=16))), 1, 2)

# applies the donut hole and hover format to both pies
fig.update_traces(hole=.4, hoverinfo='label+value')
py.plot(fig, filename='Sentiment of chat with & without emojis', auto_open=False)
fig

User message sentiment heatmap

In [None]:
# Share of each user's messages per sentiment label (every user column sums to 1).
# fillna(0) turns "no message with this label" into 0 instead of a gap in the heatmap,
# matching the module heatmap in the message-activity section.
pivot = pd.pivot_table(chat, index='sentiment',
                       columns='username',
                       values='message',
                       aggfunc='count').fillna(0).apply(lambda x: x/x.sum(), axis=0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>',
                     colorscale='Greens')
fig = go.Figure(data=[heatmap])
py.plot(fig, filename='User message sentiment heatmap', auto_open=False)
fig

Module sentiment heatmap

In [None]:
# Message counts per (module, sentiment); fillna(0) turns a missing combination into 0
# instead of a gap in the heatmap.
# Each sentiment COLUMN is then normalised to sum to 1, i.e. a cell is the share of all
# messages with that sentiment that fall in the module.
# NOTE(review): if the intent is the sentiment mix within each module, normalise rows instead — confirm.
pivot = pd.pivot_table(chat[chat.module_name != ''],
                       index='module_name',
                       columns='sentiment',
                       values='message',
                       aggfunc='count').fillna(0).apply(lambda x: x/x.sum(), axis=0)
heatmap = go.Heatmap(z=pivot.values,
                     x=pivot.columns,
                     y=pivot.index,
                     hovertemplate='Interventions<extra>%{z:.2%}</extra>',
                     colorscale='Greens')
# show the modules in course order rather than alphabetically
fig = go.Figure(data=[heatmap]).update_layout(yaxis={'categoryorder': 'array',
                                                     'categoryarray': ['Introduction & Orientation',
                                                                       'Understanding Business Problems',
                                                                       'Spreadsheet & Statistics', 'SQL',
                                                                       'Python', 'Group Final Project Preparation Week',
                                                                       'Data Visualization', 'Data Communication',
                                                                       'Pitching Week'
                                                                       ]})
py.plot(fig, filename='Module sentiment heatmap', auto_open=False)
fig

# Topic modeling

Initiate and fit the model

In [None]:
# Topic model with a seeded UMAP so the dimensionality reduction is reproducible.
model = BERTopic(umap_model=UMAP(n_neighbors=15,
                                 n_components=5,
                                 min_dist=0.0,
                                 metric='cosine',
                                 random_state=13),
                 language='multilingual',
                 calculate_probabilities=True,
                 nr_topics='auto')

# Documents: the raw text (URLs removed) of every message whose cleaned text is non-empty.
# The pattern is passed as a string with regex=True: a pre-compiled pattern is rejected by
# Series.str.replace when `regex` is left at its pandas 2.x default of False.
docs = (chat.loc[chat.clean_msg != '', 'message']
        .str.replace(r'http\S+', '', regex=True)
        .tolist())
topics, probabilities = model.fit_transform(docs)

## Extracting topics

In [None]:
# Display the summary table of the topics found by the fitted model.
model.get_topic_info()

Topic terms bar chart

In [None]:
# Bar charts of the terms of the first 12 topics.
fig = model.visualize_barchart(
    title='',
    top_n_topics=12,
)
py.plot(fig, auto_open=False, filename='Topic terms bar chart')
fig

Term score decline

In [None]:
# Term-rank figure produced by the fitted model (untitled).
fig = model.visualize_term_rank(
    title='',
)
py.plot(fig, auto_open=False, filename='Term score decline')
fig

Topics over time

In [None]:
# Same documents as used for fitting (non-empty cleaned text, URLs removed) together with
# their timestamps, grouped into 20 time bins.
# The pattern is passed as a string with regex=True: a pre-compiled pattern is rejected by
# Series.str.replace when `regex` is left at its pandas 2.x default of False.
timed_messages = chat[chat.clean_msg != '']
topics_over_time = model.topics_over_time(
    timed_messages['message'].str.replace(r'http\S+', '', regex=True).tolist(),
    list(timed_messages['date'].values),
    nr_bins=20)
fig = model.visualize_topics_over_time(topics_over_time, title='')
py.plot(fig, filename='Topics over time', auto_open=False)
fig

### Topic similarity

Intertopic distance map

In [None]:
# Topic overview figure produced by the fitted model (untitled).
fig = model.visualize_topics(
    title='',
)
py.plot(fig, auto_open=False, filename='Intertopic distance map')
fig

Similarity matrix

In [None]:
# Topic heatmap figure produced by the fitted model (untitled).
fig = model.visualize_heatmap(
    title='',
)
py.plot(fig, auto_open=False, filename='Similarity matrix')
fig

Topic hierarchy

In [None]:
# Topic hierarchy figure produced by the fitted model (untitled).
fig = model.visualize_hierarchy(
    title='',
)
py.plot(fig, auto_open=False, filename='Topic hierarchy')
fig

Topic probability with a random message

In [None]:
# Rebuild the frame of fitted documents (messages with non-empty cleaned text) with a fresh
# 0..n-1 index, so a row's index label IS its row position in `probabilities`.
fitted_messages = chat[chat.clean_msg != ''].reset_index(drop=True)
# Draw one random non-media message; the index labels of `fitted_messages` are preserved.
candidates = fitted_messages[fitted_messages.message != '<Media omitted>']
# String pattern + regex=True: a pre-compiled pattern is rejected by Series.str.replace
# when `regex` is left at its pandas 2.x default of False.
sample = candidates.assign(
    message=candidates.message.str.replace(r'http\S+', '', regex=True)).sample()
print('Sample message:', (sample['message'].iloc[0]))
fig = model.visualize_distribution(probabilities[sample['message'].index[0]], title='')
py.plot(fig, filename='Topic probability with a random message', auto_open=False)
fig