### Import libraries to be used


In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from wordcloud import WordCloud, STOPWORDS
import plotly.express as px
import datetime
import emoji
from collections import Counter

## Preparing the data

### Example of a chat line 

In [None]:
## 11/10/19, 12:01 - Nishant Jio: Monday ko sunil ko dekhe attendance maang lena
## {Date}, {Time} - {Author}: {Message}

###  Detecting {Date} and {Time} tokens

#### In order to detect if a line of text is a new message or belongs to a multi-line message, we will have to check if that line begins with a Date and Time, for which we will need a little bit of regular expression (regex).

In [None]:
# Compiled once: this check runs for every line of the export.
# A raw string keeps the regex escapes (\/ and \d) from being read as
# invalid Python string escapes.
_DATE_TIME_PREFIX = re.compile(
    r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9][0-9]):([0-9][0-9]) -'
)


def startsWithDate(s):
    """Return True if *s* begins with a 'DD/MM/YY, HH:MM -' message prefix.

    The year may have two or four digits. A line that does not start with
    this prefix is treated by the parser as a continuation of the previous
    (multi-line) message.
    """
    return bool(_DATE_TIME_PREFIX.match(s))

### Detecting the {Author} token

#### Once again, this will require some more regular expression matching. The objective is to detect the author of a message. While there could be a variety of patterns depending on how you have saved your friends’ names in your phone's contacts app, the most commonly used patterns I have identified are as follows.

In [None]:
def startsWithAuthor(s):
    """Return True if *s* begins with an '<author>:' prefix.

    *s* is the part of a chat line that follows the 'date, time - ' prefix.
    Recognised author forms are one to three words, a phone number in one of
    three spacing layouts, or a single word followed by emoji characters.
    """
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        # The trailing ':' and the comma after this entry were missing, which
        # fused it with the next pattern so neither form could ever match.
        r'([+]\d{2} \d{4} \d{7}):',         # Mobile Number (Europe)
        r'([\w]+)[\u263a-\U0001f999]+:',    # Name and Emoji
    ]
    # Group the alternatives so the start anchor applies to all of them.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return bool(re.match(pattern, s))

### Extracting and Combining tokens

#### Now that I am able to identify the Date, Time, Author and Message tokens in a single message, it is time to split each line based on the separator tokens like commas (,), hyphens(-), colons(:) and spaces( ), so that the required tokens can be extracted and saved in a dataframe. This time, let me invert things by highlighting the separator tokens instead of the Date, Time, Author and Message tokens:
#### Date{Comma }Time{ Hyphen }Author{Colon }Message
#### 11/10/19{, }12:01{ - }Nishant Jio{: }Monday ko sunil ko dekhe attendance maang lena

In [None]:
def getDataPoint(line):
    """Split one chat line into its (date, time, author, message) tokens.

    Example:
        '11/10/19, 12:01 - Nishant Jio: Monday ko sunil ko dekhe attendance maang lena'
        -> ('11/10/19', '12:01', 'Nishant Jio',
            'Monday ko sunil ko dekhe attendance maang lena')

    *line* must start with the 'date, time -' prefix. ``author`` is None when
    the text after that prefix does not start with a recognised author.

    Raises:
        ValueError: if the part before ' -' is not of the form 'date, time'.
    """
    # Cut only at the FIRST ' -' so a ' - ' inside the message body survives.
    dateTime, _, message = line.partition(' -')  # dateTime = '11/10/19, 12:01'
    if message.startswith(' '):
        message = message[1:]  # drop the single separator space after the hyphen

    date, time = dateTime.split(', ')  # date = '11/10/19'; time = '12:01'

    author = None
    if startsWithAuthor(message):
        # The author patterns contain no ':' themselves, so the first colon is
        # the author/message separator. Cutting only there keeps later ': '
        # sequences in the message and handles messages with no text.
        author, _, message = message.partition(':')
        if message.startswith(' '):
            message = message[1:]
    return date, time, author, message

### Parsing the entire file and handling Multi-Line Messages

In [None]:
parsedData = []  # Rows of [date, time, author, message], later loaded into a pandas DataFrame

conversationPath = 'group.txt'   # The path of txt file

with open(conversationPath, encoding="utf-8") as fp:
    fp.readline()  # Skip the first line of the file (usually the end-to-end encryption notice)

    messageBuffer = []  # Text fragments of the message currently being assembled
    date, time, author = None, None, None  # Tokens of the message currently being assembled

    for line in fp:
        line = line.strip()  # Guard against stray leading and trailing whitespace

        if startsWithDate(line):
            # A date/time prefix starts a new message, so store the previous one first.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])

            date, time, author, message = getDataPoint(line)
            messageBuffer = [message]
        else:
            # No date/time prefix: this line continues a multi-line message.
            messageBuffer.append(line)

    # The loop only stores a message when the NEXT one begins, so the final
    # message of the file must be flushed here or it would be lost.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])


In [None]:
# Build the chat table from the parsed rows: one row per message.
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
df.head()  # Preview the first rows

In [None]:
## Describe data: summary statistics for every column, not only the numeric ones
df.describe(include='all')

In [None]:
# Number of messages whose text is exactly the media placeholder.
media_messages = len(df.loc[df['Message'] == '<Media omitted>'])
media_messages

### Group Wise Stats

#### One thing to notice here is that we are exporting the chat without media. So in place of media messages, we will find the phrase `<Media omitted>`. With the help of this, we can find the total number of media messages shared in the group. For finding the total emojis used, we will be using the emoji library. We will also create a separate column `emoji` that consists of only the emojis for that particular message. For finding the total number of links shared, we will write a regex pattern and use the re library in Python to identify URLs in a given message. We will also create a separate column `urlcount` that consists of the count of URLs in a particular message. All the additional created columns will be used later.

In [None]:
def split_count(text):
    """Return the emoji characters found in *text*, in order of appearance.

    Each character is tested on its own, so an emoji built from several code
    points (for example with a skin-tone modifier) contributes each of its
    emoji code points separately.
    """
    # emoji >= 2.0 exposes the lookup table as EMOJI_DATA; the older
    # UNICODE_EMOJI name was removed there. In emoji 1.x UNICODE_EMOJI is
    # keyed by language code, and in 0.x it maps emoji directly to names.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        legacy_table = emoji.UNICODE_EMOJI
        emoji_table = legacy_table.get('en', legacy_table)

    return [char for char in text if char in emoji_table]

# Headline counts for the whole export.
total_messages = len(df)
media_messages = len(df.loc[df['Message'] == '<Media omitted>'])

# Per-message list of emoji characters, and the emoji total across all messages.
df['emoji'] = df['Message'].apply(split_count)
emojis = sum(df['emoji'].str.len())

# Per-message number of http(s) links, and the link total across all messages.
URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df['Message'].apply(lambda msg: re.findall(URLPATTERN, msg)).str.len()
links = np.sum(df['urlcount'])


In [None]:
# Separate the media placeholder rows from the text messages.
media_messages_df = df.loc[df['Message'] == '<Media omitted>']
messages_df = df.drop(index=media_messages_df.index)


In [None]:
# Characters per message.
messages_df['Letter_Count'] = messages_df['Message'].apply(len)
# Space-separated pieces per message (number of single spaces plus one).
messages_df['Word_Count'] = messages_df['Message'].apply(lambda msg: msg.count(' ') + 1)

In [None]:
# Preview the first rows of the text-only messages table.
messages_df.head()

## Data Exploration

### Highly talkative

In [None]:
author_value_counts = messages_df.Author.value_counts()   # Messages per author, most active first
top_author_value_counts = author_value_counts.iloc[:3]    # The 3 most active authors
top_author_value_counts.plot(kind='barh')                 # Horizontal bar chart via the pandas plotting API

### Mysterious Messages with No Authors!

In [None]:
messages_df = messages_df.dropna()

In [None]:
null_authors_df = messages_df[messages_df['Author'].isnull()]
null_authors_df.head()

### Emojis stats

In [None]:
## Total Emojis (number of distinct emoji characters used in the group)

total_emojis_list = list({symbol for per_message in messages_df['emoji'] for symbol in per_message})
total_emojis = len(total_emojis_list)
print(total_emojis)

In [None]:
## Most used Emoji in Group

# Every emoji occurrence across all messages, duplicates included.
total_emojis_list = [symbol for per_message in messages_df['emoji'] for symbol in per_message]
# (emoji, count) pairs ordered from most to least frequent.
emoji_dict = Counter(total_emojis_list).most_common()

emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df

In [None]:
## Emoji distribution

# Pie chart with one slice per row of emoji_df, sized by that row's 'count'.
fig = px.pie(emoji_df, values='count', names='emoji',
             title='Emoji Distribution')
# Show each slice's percentage and label inside the slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Draw one emoji pie chart per author.
# Iterates the authors directly (no index loop) and uses loop-local names, so
# the group-wide total_emojis_list / emoji_dict from earlier cells are not
# overwritten.
for author_name in messages_df['Author'].unique():
    author_rows = messages_df[messages_df['Author'] == author_name]
    # (emoji, count) pairs for this author, most frequent first.
    author_emoji_counts = Counter(
        symbol for per_message in author_rows['emoji'] for symbol in per_message
    ).most_common()
    print('Emoji Distribution for', author_name)
    author_emoji_df = pd.DataFrame(author_emoji_counts, columns=['emoji', 'count'])
    fig = px.pie(author_emoji_df, values='count', names='emoji')
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()

### Word Cloud

In [None]:
# One long string holding every message, used as the word cloud input.
text = " ".join(review for review in messages_df.Message)
# Report whitespace-separated words; len(text) would be the number of characters.
print("There are {} words in all the messages.".format(len(text.split())))

In [None]:
# Start from wordcloud's built-in stop words and add chat-specific filler words.
stopwords = set(STOPWORDS) | {"ra", "ga", "na", "ani", "em", "ki", "ah", "ha", "la", "eh", "ne", "le"}

# Generate the word cloud image from the combined message text.
wordcloud = WordCloud(background_color="white", stopwords=stopwords).generate(text)

# Display the generated image with matplotlib, without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Number of messages as time moves on.

In [None]:
# Group on real datetimes: the raw 'Date' values are DD/MM/YY strings, which
# sort lexically (by day of month first) and would scramble the timeline.
message_dates = pd.to_datetime(messages_df["Date"], dayfirst=True)
# Sum only the numeric counters; Message_Count adds one per message so the
# chart shows what its title says (messages, not words).
date_df = (
    messages_df.assign(Date=message_dates, Message_Count=1)
    .groupby("Date")[["Message_Count", "Word_Count"]]
    .sum()
    .reset_index()
)
fig = px.line(date_df, x="Date", y="Message_Count", title='Number of Messages as time moves on.')
fig.update_xaxes(nticks=20)
fig.show()


In [None]:
def dayofweek(i):
    """Return the English weekday name for index *i* (0 is Monday, 6 is Sunday)."""
    weekday_names = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    return weekday_names[i]
day_df = pd.DataFrame(df["Message"])
# 'Date' holds the DD/MM/YY strings parsed from the export; convert it to
# datetimes first, because the .dt accessor is not available on strings.
weekday_numbers = pd.to_datetime(df['Date'], dayfirst=True).dt.weekday
# Rows without a date have no weekday; drop them so dayofweek only receives integers.
day_df['day_of_date'] = weekday_numbers.dropna().astype(int).apply(dayofweek)
day_df["messagecount"] = 1
# Sum only the counter column, then order the rows Monday..Sunday so the
# polar outline follows the week instead of alphabetical order.
day = day_df.groupby("day_of_date")["messagecount"].sum()
day = day.reindex([dayofweek(n) for n in range(7)], fill_value=0)
day = day.rename_axis("day_of_date").reset_index()

fig = px.line_polar(day, r='messagecount', theta='day_of_date', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            # Scale the radial axis to the data rather than a fixed 6000 cap.
            range=[0, max(1, int(day['messagecount'].max()))]
        )),
    showlegend=False
)
fig.show()

### Most happening days

In [None]:
# The 10 dates with the most messages, as a horizontal bar chart.
messages_df.Date.value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of Messages')
plt.ylabel('Date')

### When are the group members most active?

In [None]:
# The 10 clock times (HH:MM) with the most messages, as a horizontal bar chart.
messages_df.Time.value_counts().iloc[:10].plot(kind='barh')
plt.xlabel('Number of messages')
plt.ylabel('Time')

In [None]:
## What is the most suitable hour of the day at which to message to increase your chances of getting a response from someone?

# 'Time' is an 'HH:MM' string and no earlier cell creates an 'Hour' column,
# so derive it here from the part before the colon.
messages_df['Hour'] = pd.to_numeric(messages_df['Time'].str.split(':').str[0], errors='coerce')
messages_df['Hour'].value_counts().head(10).sort_index(ascending=False).plot.barh() # Top 10 hours of the day by number of messages sent
plt.xlabel('Number of messages')
plt.ylabel('Hour of Day')

### What is the most common number of letters in a message?

In [None]:
# Wide, short canvas so the 40 bars fit side by side.
plt.figure(figsize=(15, 2))
# How often each message length occurs, most frequent length first.
letter_count_value_counts = messages_df.Letter_Count.value_counts()
top_40_letter_count_value_counts = letter_count_value_counts.iloc[:40]
top_40_letter_count_value_counts.plot(kind='bar')
plt.xlabel('Letter count')
plt.ylabel('Frequency')

### Conclusion


#### The long analysis comes to an end! In this notebook, I tried to analyze our WhatsApp group chats using Python and Plotly. I have analyzed the possible questions one can ask in a WhatsApp chat analysis.