<a href="https://colab.research.google.com/github/jianzhiw/WhatsappChatAnalysis/blob/master/Whatsapp_Chat_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whatsapp Chat Analysis #

[Source](https://towardsdatascience.com/build-your-own-whatsapp-chat-analyzer-9590acca9014)

Step 1: Detect date and time tokens

In [0]:
def startsWithDateTime(s):
  """Return True if `s` begins with a chat-export timestamp prefix.

  The accepted prefix has the shape ``DD/MM/YY, HH:MM -`` or
  ``DD/MM/YYYY, HH:MM -`` (for example ``18/06/17, 22:47 -``).

  Args:
    s: One line of text from the exported chat.

  Returns:
    bool: True when the line starts with the timestamp prefix, else False.
  """
  # Raw string so the regex escapes are passed to `re` untouched instead of
  # being read as (invalid) Python string escapes.
  pattern = r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9][0-9]):([0-9][0-9]) -'
  return re.match(pattern, s) is not None

Step 2: Detect author tokens

In [0]:
def startsWithAuthor(s):
  """Return True if `s` begins with an author token followed by a colon.

  An author token is one to three whitespace-separated words, or a phone
  number in one of the three formats listed below.

  Args:
    s: The part of a chat line that follows the timestamp prefix.

  Returns:
    bool: True when an author token plus ':' is found at the start of `s`.
  """
  patterns = [
    r'([\w]+):',                        # First Name
    r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
    r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
    r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
    r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
    r'([+]\d{2} \d{4} \d{7}):'          # Mobile Number (Europe) - the ':' was missing here
  ]
  # Group the alternatives so the '^' anchor applies to all of them, not
  # only to the first one.
  pattern = '^(?:' + '|'.join(patterns) + ')'
  return re.match(pattern, s) is not None

Step 3: Extracting and combining tokens

In [0]:
def getDataPoint(line):
  """Split one timestamped chat line into its date, time, author and message.

  Example:
    '18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?'
    -> ('18/06/17', '22:47', 'Loki', 'Why do you have 2 numbers, Banner?')

  Args:
    line: A chat line that starts with a 'date, time - ' prefix.

  Returns:
    tuple: (date, time, author, message). `author` is None when the text
    after the timestamp does not start with an author token.

  Raises:
    ValueError: If the part before the first ' - ' does not split into
      exactly two pieces on ', '.
  """
  # Only the FIRST ' - ' separates the timestamp from the rest; any later
  # ' - ' belongs to the message and must be kept verbatim.
  dateTime, _, message = line.partition(' - ') # dateTime = '18/06/17, 22:47'

  date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'

  if startsWithAuthor(message):
    # Likewise, only the FIRST ': ' separates the author from the text, so
    # a message such as 'Note: call me' keeps its own colon.
    author, _, message = message.partition(': ') # author = 'Loki'
  else:
    author = None
  return date, time, author, message

In [0]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
# Relies on the `google.colab` package being importable.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import re

parsedData = [] # One [date, time, author, message] row per chat message, later fed to a Pandas dataframe
conversationPath = './drive/My Drive/GPT-2 Training/WhatsApp Chat with RMM.txt'
with open(conversationPath, encoding="utf-8") as fp:
  fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)

  messageBuffer = [] # Pieces of the message currently being assembled (one entry per physical line)
  date, time, author = None, None, None # Tokens of the message currently being assembled

  for line in fp:
    line = line.strip() # Guarding against erroneous leading and trailing whitespaces
    if startsWithDateTime(line): # A timestamp prefix marks the beginning of a new message
      if messageBuffer: # Save the previous message before starting the next one
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
      messageBuffer.clear()
      date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
      messageBuffer.append(message)
    else:
      # No timestamp prefix: this line continues the current multi-line message
      messageBuffer.append(line)

  # The loop only saves a message when the NEXT one starts, so the final
  # message is still sitting in the buffer here and must be flushed.
  if messageBuffer:
    parsedData.append([date, time, author, ' '.join(messageBuffer)])


In [0]:
import pandas as pd

# Turn the parsed rows into a frame with one column per token.
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
# Call head() (with parentheses) so the first rows are displayed rather than
# the repr of the bound method.
df.head()

In [0]:
# Display pandas' summary statistics for the parsed chat frame.
df.describe()

Who sends the most messages?

In [0]:
# Count how many messages each author sent (value_counts orders by count, highest first).
author_value_counts = df['Author'].value_counts()
# Keep the first ten entries, i.e. the ten most active authors.
top_10_author_value_counts = author_value_counts.iloc[:10]
# Draw them as a horizontal bar chart through the pandas plotting interface.
top_10_author_value_counts.plot(kind='barh')

Deleted message

In [0]:
# Rows whose whole text is the placeholder for a message deleted by someone else.
deleted_messages_df = df[df['Message'].eq('This message was deleted')]
# Rows whose whole text is the placeholder for a message deleted by the exporting user.
self_deleted_messages_df = df[df['Message'].eq('You deleted this message')]
print("Messages that others has deleted = ", deleted_messages_df['Message'].count())
print("Messages that I have deleted = ", self_deleted_messages_df['Message'].count())

Messages that others has deleted =  26
Messages that I have deleted =  5


Media message

In [0]:
# Rows whose whole text is the '<Media omitted>' placeholder.
media_messages_df = df[df['Message'].eq('<Media omitted>')]
print(media_messages_df.head())

# Media messages per author, then the ten authors with the most, as a horizontal bar chart.
author_media_messages_value_counts = media_messages_df['Author'].value_counts()
top_10_author_media_messages_value_counts = author_media_messages_value_counts.iloc[:10]
top_10_author_media_messages_value_counts.plot(kind='barh')

Is there any null author?

In [0]:
# Rows for which no author could be extracted (Author is missing).
null_authors_df = df[df['Author'].isna()]
# Show the first five of them.
null_authors_df.iloc[:5]

Remove media messages, deleted messages, and messages with a null author

In [0]:
print("Before", len(df))
# Remove, in turn, the rows with no author, the media placeholders and both
# kinds of deleted-message placeholders, then renumber the remaining rows from 0.
messages_df = (
    df.drop(null_authors_df.index)
      .drop(media_messages_df.index)
      .drop(deleted_messages_df.index)
      .drop(self_deleted_messages_df.index)
      .reset_index(drop=True)
)

print("After", len(messages_df))

Export to a text file (for OpenAI GPT-2) by appending the conversation to `result.txt`

In [0]:
# Append every retained message to result.txt as "Author: Message" lines.
# - The file is opened once instead of once per row.
# - All rows are written; the earlier range(0, n - 1) skipped the last message.
# - UTF-8 is explicit so it matches how the chat file was read.
# - newline="" writes the "\r\n" terminator exactly as given on every platform.
# NOTE: mode "a" appends, so re-running this cell duplicates the content.
with open("result.txt", "a", encoding="utf-8", newline="") as text_file:
  for author, message in zip(messages_df['Author'], messages_df['Message']):
    text_file.write(author + ": " + message + "\r\n")

Apply letter and word count

In [0]:
# Number of characters in each message.
messages_df['Letter_Count'] = messages_df['Message'].str.len()
# Number of whitespace-separated words in each message. Splitting on any run
# of whitespace (rather than on a single ' ') keeps consecutive spaces from
# being counted as empty words and gives 0 for an empty message.
messages_df['Word_Count'] = messages_df['Message'].str.split().str.len()

In [0]:
# Display the totals over all rows of messages_df as a (letters, words) tuple.
messages_df['Letter_Count'].sum(), messages_df['Word_Count'].sum()

Total words per author

In [0]:
import matplotlib.pyplot as plt
# Sum the per-message word counts for every author (one-column frame indexed by Author).
total_word_count_grouped_by_author = messages_df.groupby('Author')[['Word_Count']].sum()
# Order authors from most to fewest words.
sorted_total_word_count_grouped_by_author = total_word_count_grouped_by_author.sort_values(by='Word_Count', ascending=False)
# Keep the ten authors with the most words and chart them.
top_10_sorted_total_word_count_grouped_by_author = sorted_total_word_count_grouped_by_author.iloc[:10]
top_10_sorted_total_word_count_grouped_by_author.plot(kind='barh')
plt.xlabel('Number of Words')
plt.ylabel('Authors')

Most common number of words in a message

In [0]:
# Wide, short figure so the bars fit in the notebook output cell.
plt.figure(figsize=(15, 2))
# How many messages have each word count, most frequent first.
word_count_value_counts = messages_df['Word_Count'].value_counts()
# Chart the forty most frequent word counts.
top_40_word_count_value_counts = word_count_value_counts.iloc[:40]
top_40_word_count_value_counts.plot(kind='bar')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

Total letters per author

In [0]:
# Sum the per-message letter counts for every author (one-column frame indexed by Author).
total_letter_count_grouped_by_author = messages_df.groupby('Author')[['Letter_Count']].sum()
# Order authors from most to fewest letters.
sorted_total_letter_count_grouped_by_author = total_letter_count_grouped_by_author.sort_values(by='Letter_Count', ascending=False)
# Keep the ten authors with the most letters and chart them.
top_10_sorted_total_letter_count_grouped_by_author = sorted_total_letter_count_grouped_by_author.iloc[:10]
top_10_sorted_total_letter_count_grouped_by_author.plot(kind='barh')
plt.xlabel('Number of Letters')
plt.ylabel('Authors')

Most common number of letters in a message

In [0]:
# Wide, short figure so the bars fit in the notebook output cell.
plt.figure(figsize=(15, 2))
# How many messages have each letter count, most frequent first.
letter_count_value_counts = messages_df['Letter_Count'].value_counts()
# Chart the forty most frequent letter counts.
top_40_letter_count_value_counts = letter_count_value_counts.iloc[:40]
top_40_letter_count_value_counts.plot(kind='bar')
plt.xlabel('Letter count')
plt.ylabel('Frequency')

Number of messages by date

In [0]:
# Messages per date; chart the ten dates with the most messages.
messages_df['Date'].value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of Messages')
plt.ylabel('Date')

Number of messages by time

In [0]:
# Messages per exact time-of-day value; chart the ten most frequent times.
messages_df['Time'].value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of messages')
plt.ylabel('Time')

Number of messages by hour

In [0]:
# The hour is the text before the first ':' of the Time value (e.g. "20" in "20:15").
messages_df['Hour'] = messages_df['Time'].apply(lambda stamp: stamp.partition(':')[0])

# Take the ten busiest hours, then order them by hour label (descending) for the chart.
messages_df['Hour'].value_counts().iloc[:10].sort_index(ascending=False).plot(kind='barh')
plt.xlabel('Number of messages')
plt.ylabel('Hour of Day')