# WhatsApp analysis

### Some functions and imports

In [None]:
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

# Compiled once so the patterns are not rebuilt for every line of the chat.
# The hour accepts one or two digits: 10, 11 and 12 o'clock are valid 12-hour times.
_DATETIME_RE = re.compile(r"\d{4}-\d{2}-\d{2}, \d{1,2}:\d{2}:\d{2} [a-zA-Z]{2}")
# Everything between the first "]" and the next ":" (leading space included).
_SENDER_RE = re.compile(r"(?<=\]) [^:]*")


def extract_msg(line, max_sender_len=25):
    """Split one exported chat line into timestamp, sender and message.

    A line is expected to look like
    ``[2020-01-12, 9:05:01 PM] Sender Name: message text``.

    Parameters
    ----------
    line : str
        One raw line of the exported chat file.
    max_sender_len : int, optional
        Upper bound (exclusive) on the length of the matched sender text,
        including its leading space. Longer matches are rejected so that a
        status line which happens to contain a colon is not mistaken for a
        sender name.

    Returns
    -------
    tuple
        ``(date, sender_name, msg)`` where ``date`` is the result of
        ``pd.to_datetime`` on the matched timestamp, and ``sender_name`` and
        ``msg`` are strings. When the line cannot be parsed the result is
        ``(None, [], [])``; the empty lists are falsy, so callers can test
        ``if sender and msg``.
    """
    date_match = _DATETIME_RE.search(line)
    sender_match = _SENDER_RE.search(line)

    if not (date_match and sender_match):
        return None, [], []
    if len(sender_match.group(0)) >= max_sender_len:
        return None, [], []

    date = pd.to_datetime(date_match.group(0))
    # Drop the space that follows the closing bracket.
    sender_name = sender_match.group(0)[1:]
    # The match ends right before the colon; skip ": " to reach the message.
    # Using the match position (rather than searching for the name again)
    # cannot be fooled by the same text appearing earlier in the line.
    msg = line[sender_match.end() + 2:]

    return date, sender_name, msg

# parse the entire convo as a pd.dataframe

# Collect the parsed rows in a plain list and build the DataFrame once at the
# end: growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append was removed in pandas 2.0.
rows = []
with open('WhatsApp_chat_12_01_2020/_chat.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        date, sender, msg = extract_msg(line)
        # extract_msg yields empty values for lines it cannot parse; skip them.
        if sender and msg:
            rows.append({'date': date, 'sender': sender, 'message': msg})

conv0 = pd.DataFrame(rows, columns=['date', 'sender', 'message'])

# This can be long depending on length of convo, save processed dataframe for easy usage
conv0.to_csv('WhatsApp_chat_12_01_2020/processed_chat.csv')

In [None]:
# Write the processed conversation table to CSV.
conv0.to_csv('WhatsApp_chat_12_01_2020/processed_chat.csv')

In [None]:
# Display the parsed conversation table (date, sender, message).
conv0

In [None]:
# Count how often each distinct message text occurs.
conv0.message.value_counts()

# Total messages sent :

In [None]:
# Report the number of parsed messages and the earliest / latest timestamps.
print(f"We have sent a total of {len(conv0)} Messages between {min(conv0.date)} and {max(conv0.date)}")

# Total number of messages sent per person

In [None]:
# Messages per harman

counts = conv0['sender'].value_counts()/len(conv0)*100
%matplotlib inline
fig = counts.plot(kind="bar", title="% Messages per person")

# Interactions

Read each cell of the next graph as "the person on the x-axis has replied directly to the person on the y-axis this many times". For example, Alice and Bob reply to each other most often. For an "equal" dialogue, the matrix should be symmetric about its diagonal. Interactions with oneself are ignored.

In [None]:
# Give every sender a fixed row/column position in the interaction matrix.
names = list(conv0['sender'].unique())
index = {name: position for position, name in enumerate(names)}

# interactions[a, b] counts how often sender b posted right after sender a.
interactions = np.zeros((len(index), len(index)))

prev_sender = conv0['sender'].iloc[0]
for current_sender in conv0['sender'].iloc[1:]:
    # Consecutive messages from the same sender are not an interaction.
    if current_sender != prev_sender:
        interactions[index[prev_sender], index[current_sender]] += 1

    prev_sender = current_sender

def plot_interactions(cm, classes,
                      normalize=False,
                      title='People Interactions',
                      cmap=plt.cm.coolwarm):
    """Draw a square interaction matrix as a heat map.

    Parameters
    ----------
    cm : 2-D array
        Interaction counts; row index is shown on the y-axis, column index
        on the x-axis.
    classes : sequence
        Tick labels, one per row/column, in matrix order.
    normalize : bool, optional
        When True, every row is divided by its own sum before plotting, so
        each row shows fractions instead of raw counts. Rows that sum to
        zero are left as zeros.
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colormap passed to ``imshow``.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # `where` avoids a division by zero for senders nobody replied to.
        cm = np.divide(cm, row_totals,
                       out=np.zeros_like(cm), where=row_totals != 0)

    plt.figure(figsize=(20, 20))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    plt.ylabel('This person interacts most with')
    plt.xlabel('This person')
    # Fit the layout only after the axis labels exist so they are accounted for.
    plt.tight_layout()


plot_interactions(interactions, names)

# The longest monologue

In [None]:
# Find the longest run of consecutive messages written by a single sender.
max_spam = 0
max_spammer = None

# groupby without a key groups adjacent equal values, i.e. one group per
# uninterrupted run of messages from the same sender.
for current_sender, run in itertools.groupby(conv0['sender']):
    # Count every message in the run, the first one included.
    tmp_spam = sum(1 for _ in run)
    # Strict comparison keeps the earliest run when several share the maximum.
    if tmp_spam > max_spam:
        max_spam = tmp_spam
        max_spammer = current_sender

if max_spammer is None:
    print("The conversation contains no messages")
else:
    print("The most spam is from %s with %d consecutive messages" % (max_spammer,max_spam))


# Person who has sent the most gifs and stickers

In [None]:
# Per-sender tallies of GIF and sticker placeholders, every sender starting at zero.
gifs_sent = dict.fromkeys(conv0['sender'].unique(), 0)
stickers_sent = dict.fromkeys(conv0['sender'].unique(), 0)

# NOTE(review): the two literals below appear to start with an invisible
# left-to-right mark copied from the chat export; keep it when editing them.
for sender, message in zip(conv0['sender'], conv0["message"]):
    if message == "‎GIF omitted\n":
        gifs_sent[sender] += 1
    if message == "‎sticker omitted\n":
        stickers_sent[sender] += 1


# Rank senders by GIF count (highest first) and plot.
gifs_pd = pd.DataFrame.from_dict(gifs_sent, orient="index")
gifs_pd = gifs_pd.sort_values(by=0, ascending=False)
gifs_pd = gifs_pd.T.iloc[0]
_ = gifs_pd.plot(kind='bar', legend=False, title="Most gifs sent")
plt.show()

# Same ranking for stickers.
stickers_pd = pd.DataFrame.from_dict(stickers_sent, orient="index")
stickers_pd = stickers_pd.sort_values(by=0, ascending=False)
stickers_pd = stickers_pd.T.iloc[0]
_ = stickers_pd.plot(kind='bar', legend=False, title="Most stickers sent")
plt.show()

# Most LOL

In [None]:
# Substrings that mark a message as "laughing" (matched case-insensitively).
lulz = ["lol","lmao","lulz","rofl","lolol"]

# Every sender starts at zero so that non-laughers still appear in the chart.
lulz_sent = dict.fromkeys(conv0['sender'].unique(), 0)

for sender, message in zip(conv0['sender'], conv0["message"]):
    lowered = message.lower()
    # A message counts once, however many markers it contains.
    if any(marker in lowered for marker in lulz):
        lulz_sent[sender] += 1

# Rank senders by count, highest first, and plot.
lulz_pd = pd.DataFrame.from_dict(lulz_sent, orient="index")
lulz_pd = lulz_pd.sort_values(by=0, ascending=False)
lulz_pd = lulz_pd.T.iloc[0]

_ = lulz_pd.plot(kind='bar', legend=False, title="Most LuLz")


# Biggest Slacker (texts during work hours)

In [None]:
# Per-sender count of messages sent on a weekday with an hour from 9 to 16.
work_txt_sent = dict.fromkeys(conv0['sender'].unique(), 0)

for sender, stamp in zip(conv0['sender'], conv0["date"]):
    # isoweekday(): Monday is 1 ... Sunday is 7, so < 6 means Monday to Friday.
    on_weekday = stamp.isoweekday() < 6
    if 8 < stamp.hour < 17 and on_weekday:
        work_txt_sent[sender] += 1


# Rank senders by count, highest first, and plot.
work_txt_pd = pd.DataFrame.from_dict(work_txt_sent, orient="index")
work_txt_pd = work_txt_pd.sort_values(by=0, ascending=False)
work_txt_pd = work_txt_pd.T.iloc[0]

_ = work_txt_pd.plot(kind='bar', legend=False, title="Most texts during work hours")


# The night owls

In [None]:
# Per-sender count of messages whose timestamp hour is before 6.
night_txt_sent = dict.fromkeys(conv0['sender'].unique(), 0)

for sender, stamp in zip(conv0['sender'], conv0["date"]):
    if stamp.hour < 6:
        night_txt_sent[sender] += 1


# Rank senders by count, highest first, and plot.
night_txt_pd = pd.DataFrame.from_dict(night_txt_sent, orient="index")
night_txt_pd = night_txt_pd.sort_values(by=0, ascending=False)
night_txt_pd = night_txt_pd.T.iloc[0]

_ = night_txt_pd.plot(kind='bar', legend=False, title="Most texts between midnight and 6 am")


# WHY ARE WE YELLING

In [None]:
# Per-sender count of messages written entirely in capital letters.
yelling_sent = dict.fromkeys(conv0['sender'].unique(), 0)

for sender, message in zip(conv0['sender'], conv0["message"]):
    # str.isupper() requires at least one cased character, so messages made
    # only of emoji, digits or punctuation (which equal their own upper()
    # form) are not counted as yelling.
    if message.isupper():
        yelling_sent[sender] += 1


# Rank senders by count, highest first, and plot.
yelling_pd  = pd.DataFrame.from_dict(yelling_sent,orient="index")
yelling_pd.sort_values(by=0,ascending=False, inplace=True)
yelling_pd = yelling_pd.transpose().iloc[0]

_ = yelling_pd.plot(kind='bar', legend = False, title = "MOST YELLING")


# WordClouds

In [None]:
import wordcloud
from os import path
from wordcloud import WordCloud
%matplotlib inline
# d = path.dirname(__file__)
ignore_words = ["<Media omitted>\n",'go',"yeah","Im","I'm",]

# Read the whole text.
text = ''
for jj in range(len(conv0)):
    
    if not any(x in conv0["message"].iloc[jj] for x in ignore_words):

        text+=(conv0['message'].iloc[jj]).lower()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()



In [None]:
#!/usr/bin/env python
"""
Masked wordcloud
================
Using a mask you can generate wordclouds in arbitrary shapes.
"""

from PIL import Image

from wordcloud import STOPWORDS, WordCloud


# Load the mask picture as an array.
mask_image = Image.open("dbutt.jpg")
alice_mask = np.array(mask_image)

stopwords = set(STOPWORDS)

# Build the masked cloud from the conversation text assembled in the previous cell.
wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=alice_mask,
    stopwords=stopwords,
)
wc.generate(text)

# Save the rendered cloud as dbutt.png (relative to the working directory).
output_file = path.join("dbutt.png")
wc.to_file(output_file)

# Show the cloud first, then the mask image in a second figure.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")

plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()