<a href="https://colab.research.google.com/github/jerpint/whatsapp/blob/master/whatsapp_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##  Getting started

### Obtain the chat
First, get your chat history from WhatsApp. This should come as a `.txt` file (you can export it directly from WhatsApp). There are different ways to do this; [follow the official instructions](https://faq.whatsapp.com/android/chats/how-to-save-your-chat-history/?lang=en).


### ***A note on privacy***
WhatsApp conversations are encrypted; however, once you obtain a `.txt` file, it is no longer encrypted. Anyone with the file can read its contents.
Using the notebook will upload your file to Google servers.

In [None]:
#@title <=== Click the play button
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from google.colab import files

def extract_line(line):
    """
    Extract the timestamp, sender and message text from one line of a chat export.

    A parsable line looks like ``[2020-01-31, 9:15:32 PM] Alice: hello``:
    a bracketed timestamp, the sender's name, a colon and then the message.

    Parameters
    ----------
    line : str
        One raw line of the exported ``.txt`` file (trailing newline included).

    Returns
    -------
    tuple
        ``(date, sender_name, msg)``. On success ``date`` is the value returned
        by ``pd.to_datetime`` for the matched timestamp, ``sender_name`` is the
        sender and ``msg`` is everything after ``"<sender>: "``. When the line
        cannot be parsed the result is ``(None, [], [])``, so callers can simply
        test ``sender_name`` and ``msg`` for truthiness.
    """
    # The hour is written with one OR two digits (9:15:32 vs 12:15:32); a
    # single-digit-only pattern silently drops every message sent at 10, 11 or 12.
    datetime_pat = r"\d{4}-\d{2}-\d{2}, \d{1,2}:\d{2}:\d{2} [a-zA-Z]{2}"
    # Everything between the first "] " and the next colon is the sender.
    sender_pat = r"(?<=\]) [^:]*"

    date_match = re.search(datetime_pat, line)
    sender_match = re.search(sender_pat, line)

    if not (date_match and sender_match):
        return None, [], []

    # Assumes a first + last name stays under this many characters. Avoids
    # misclassifying a status change that happens to contain a colon as a name.
    if len(sender_match.group(0)) >= 25:
        return None, [], []

    date = pd.to_datetime(date_match.group(0))
    sender_name = sender_match.group(0)[1:]  # drop the space that follows "]"

    # Slice from the end of the regex match (+2 skips ": ") rather than searching
    # for the name again: the name may also occur earlier in the line, e.g. a
    # sender called "01" inside the date.
    msg = line[sender_match.end() + 2:]

    return date, sender_name, msg


def chat_txt_to_df(filename):
    """
    Parse the entire conversation into a ``pandas.DataFrame``.

    This can take a while to run, depending on the length of the conversation.
    Lines for which ``extract_line`` yields no sender or no message are skipped.

    Parameters
    ----------
    filename : str
        Path to the ``.txt`` file of the conversation (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per parsed message with the columns ``date``, ``sender`` and
        ``message``. The same frame is also written to ``processed_chat.csv``
        in the current working directory.
    """
    records = []

    # `with` closes the file even if parsing a line raises.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            date, sender, msg = extract_line(line)
            if sender and msg:
                records.append({'date': date, 'sender': sender, 'message': msg})

    # Build the frame once from the collected rows: DataFrame.append was removed
    # in pandas 2.0, and growing a frame row by row is quadratic anyway.
    chat_df = pd.DataFrame(records, columns=['date', 'sender', 'message'])

    # save processed dataframe for easy usage
    chat_df.to_csv('processed_chat.csv')
    return chat_df

# Ask the user to pick the exported chat file(s) in the browser.
uploaded = files.upload()
for fn in uploaded.keys():
  # Pass the uploaded file's own name: the previous code referenced an undefined
  # `filename` and failed with a NameError. chat_txt_to_df opens this name as a
  # path, so the upload is expected to be available in the working directory.
  # If several files are uploaded, chat_df ends up holding the last one parsed.
  chat_df = chat_txt_to_df(fn)

# Run all the next cells

# Total messages sent :

In [None]:
# Date of the earliest and of the latest parsed message.
first_message_date = min(chat_df.date)
last_message_date = max(chat_df.date)
print(f"We have sent a total of {len(chat_df)} Messages between {first_message_date} and {last_message_date}")

# Total number of messages sent per person

In [None]:
# Messages per participant

counts = chat_df['sender'].value_counts()/len(chat_df)*100
%matplotlib inline
fig = counts.plot(kind="bar", title="% Messages per person")

# Interactions

Interpret this next graph as "This person has answered directly to this person this many times". For example, Alice and Bob answer each other most often. For "equal" dialogue, the matrix should be symmetric along its diagonal. Interactions with oneself are ignored.

In [None]:
# Give every participant a row/column position, in order of first appearance.
names = list(chat_df["sender"].unique())
index = {name: position for position, name in enumerate(names)}

# interactions[i, j] counts how often participant j wrote directly after
# participant i; consecutive messages from the same person are not counted.
interactions = np.zeros((len(index), len(index)))

sender_column = chat_df["sender"]
prev_sender = sender_column.iloc[0]
for current_sender in sender_column.iloc[1:]:
    if prev_sender != current_sender:
        interactions[index[prev_sender], index[current_sender]] += 1
    prev_sender = current_sender


def plot_interactions(
    cm, classes, normalize=False, title="People Interactions", cmap=plt.cm.coolwarm
):
    """
    Draw an interaction matrix as a heat map with a colour bar.

    Parameters
    ----------
    cm : 2-D array
        Matrix to display; it is passed to ``plt.imshow`` as is.
    classes : sequence of str
        Tick labels, used for both the x and the y axis.
    normalize : bool, optional
        Accepted for backward compatibility. It has no effect on the figure
        (it never did: it only selected a number format that was not used).
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colour map used for the cells.
    """
    plt.figure(figsize=(20, 20))
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    plt.ylabel("This person interacts most with")
    plt.xlabel("This person")

    # Fit the layout last, once the axis labels and rotated tick labels exist,
    # so that they are taken into account when the padding is computed.
    plt.tight_layout()


plot_interactions(interactions, names)

# The longest monologue

In [None]:
# Longest run of consecutive messages written by a single sender.
max_spammer = None
max_spam = 0

# groupby() bundles neighbouring rows that share the same sender, so each
# group is exactly one uninterrupted monologue.
for current_sender, run in itertools.groupby(chat_df["sender"]):
    # Count every message of the run, the first one included; counting only
    # the repeats under-reported the streak by one message.
    run_length = sum(1 for _ in run)
    if run_length > max_spam:
        max_spam = run_length
        max_spammer = current_sender

if max_spammer is None:
    # No messages at all: report that instead of failing on an undefined name.
    print("The chat contains no messages, so there is no monologue to report")
else:
    print("The most spam is from %s with %d consecutive messages" % (max_spammer, max_spam))

# Person who has sent the most gifs and stickers

In [None]:
# Start every participant at zero so people who sent none still get a bar.
senders = chat_df["sender"].unique()
gifs_sent = dict.fromkeys(senders, 0)
stickers_sent = dict.fromkeys(senders, 0)

for sender, message in zip(chat_df["sender"], chat_df["message"]):
    # An omitted attachment shows up as a placeholder message. Strip the line
    # ending and a leading invisible left-to-right mark (U+200E) before
    # comparing, so the match no longer depends on the exact line terminator
    # ("\n", "\r\n" or none on the last line of the file).
    placeholder = message.strip().lstrip("\u200e")
    if placeholder == "GIF omitted":
        gifs_sent[sender] += 1
    elif placeholder == "sticker omitted":
        stickers_sent[sender] += 1


# One value per sender, highest count first.
gifs_pd = (
    pd.DataFrame.from_dict(gifs_sent, orient="index")
    .sort_values(by=0, ascending=False)
    .transpose()
    .iloc[0]
)
_ = gifs_pd.plot(kind="bar", legend=False, title="Most gifs sent")
plt.show()

stickers_pd = (
    pd.DataFrame.from_dict(stickers_sent, orient="index")
    .sort_values(by=0, ascending=False)
    .transpose()
    .iloc[0]
)
_ = stickers_pd.plot(kind="bar", legend=False, title="Most stickers sent")
plt.show()

# Most LOL

In [None]:
# Substrings that mark a message as containing laughter.
lulz = ["lol","lmao","lulz","rofl","lolol"]

# Every participant starts at zero so that nobody is missing from the chart.
lulz_sent = dict.fromkeys(chat_df['sender'].unique(), 0)

for sender, message in zip(chat_df['sender'], chat_df["message"]):
    lowered = message.lower()
    if any(x in lowered for x in lulz):
        lulz_sent[sender] += 1

# One count per sender, highest first.
lulz_pd = pd.DataFrame.from_dict(lulz_sent, orient="index").sort_values(by=0, ascending=False)[0]

_ = lulz_pd.plot.bar(legend=False, title="Most LuLz")

# Biggest Slacker (texts during work hours)

In [None]:
# Every participant starts at zero so that nobody is missing from the chart.
work_txt_sent = dict.fromkeys(chat_df["sender"].unique(), 0)

for sender, sent_at in zip(chat_df["sender"], chat_df["date"]):
    # Work hours: hour-of-day 9 through 16, Monday (1) to Friday (5).
    if 8 < sent_at.hour < 17 and sent_at.isoweekday() < 6:
        work_txt_sent[sender] += 1


# One count per sender, highest first.
work_txt_pd = (
    pd.DataFrame.from_dict(work_txt_sent, orient="index")
    .sort_values(by=0, ascending=False)[0]
)

_ = work_txt_pd.plot.bar(legend=False, title="Most texts during work hours")

# The night owls

In [None]:
# Every participant starts at zero so that nobody is missing from the chart.
night_txt_sent = dict.fromkeys(chat_df["sender"].unique(), 0)

for sender, sent_at in zip(chat_df["sender"], chat_df["date"]):
    # Night: hour-of-day 0 through 5.
    if sent_at.hour < 6:
        night_txt_sent[sender] += 1


# One count per sender, highest first.
night_txt_pd = (
    pd.DataFrame.from_dict(night_txt_sent, orient="index")
    .sort_values(by=0, ascending=False)[0]
)

_ = night_txt_pd.plot.bar(
    legend=False, title="Most texts between midnight and 6 am"
)

# WHY ARE WE YELLING

In [None]:
# Every participant starts at zero so that nobody is missing from the chart.
yelling_sent = dict.fromkeys(chat_df["sender"].unique(), 0)

for sender, message in zip(chat_df["sender"], chat_df["message"]):
    # str.isupper() is True only when the text contains at least one cased
    # character and all of them are upper case. Comparing with .upper()
    # also counted messages without any letters (digits, emoji, "?!").
    if message.isupper():
        yelling_sent[sender] += 1


# One count per sender, highest first.
yelling_pd = (
    pd.DataFrame.from_dict(yelling_sent, orient="index")
    .sort_values(by=0, ascending=False)
    .transpose()
    .iloc[0]
)

_ = yelling_pd.plot(kind="bar", legend=False, title="MOST YELLING")

# Word clouds

In [None]:
!pip install wordcloud
import wordcloud
from os import path
from wordcloud import WordCloud

%matplotlib inline

# ignore messages with images, stickers, gifs, etc.
ignore_lines_with = ["media omitted", "sticker omitted", "GIF omitted", "image omitted"]
# Compare in lower case on both sides so a placeholder is filtered whatever its
# capitalisation (e.g. "<Media omitted>" is caught by "media omitted").
ignored_markers = [marker.lower() for marker in ignore_lines_with]

# Collect the lower-cased text of every remaining message and join it once at
# the end instead of growing one string message by message.
kept_messages = []
for message in chat_df["message"]:
    lowered = message.lower()
    if not any(marker in lowered for marker in ignored_markers):
        kept_messages.append(lowered)
all_text = "".join(kept_messages)

# Generate a word cloud image
wordcloud = WordCloud().generate(all_text)

# Display the generated image:
# the matplotlib way:
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(all_text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
"""
Masked wordcloud
================
Using a mask you can generate wordclouds in arbitrary shapes.

"""
from urllib.request import urlopen
from PIL import Image
from wordcloud import WordCloud, STOPWORDS


# read the mask image
url = "https://freestencilgallery.com/wp-content/uploads/2014/09/Doge-Stencil-thumb.jpg"
# Decode the image while the HTTP response is open, then let `with` close the
# connection; np.array() forces the pixel data to be read before that happens.
with urlopen(url) as response:
    wordcloud_mask = np.array(Image.open(response))

stopwords = set(STOPWORDS)
wc = WordCloud(
    background_color="white", max_words=4000, mask=wordcloud_mask, stopwords=stopwords
)
# generate word cloud
wc.generate(all_text)

# show image
plt.figure(figsize=(15, 15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()