# WhatsApp Chat Analysis

In [None]:
# Import modules and libraries
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import re 
import datetime as dt 
from urlextract import URLExtract # used to extract url's in a string
from collections import Counter
from wordcloud import WordCloud
import string 
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words("english")
wn = nltk.WordNetLemmatizer()
punct = string.punctuation

%matplotlib inline

## Data (text) Preprocessing

### Import Text Data

We will now import the text file which contains the whatsapp group chat in read mode using utf-8 encoding.

In [None]:
# def rawToDf(file, key):
#     split_formats = {
#         '12hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
#         '24hr' : '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
#         'custom' : ''
#     }
#     datetime_formats = {
#         '12hr' : '%m/%d/%y, %I:%M %p - ',
#         '24hr' : '%m/%d/%y, %H:%M - ',
#         'custom': ''
#     }
    
#     with open(file, 'r', encoding="utf8") as raw_data:
#         raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages
#         user_msg = re.split(split_formats[key], raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names
#         date_time = re.findall(split_formats[key], raw_string) # finds all the date-time patterns
        
#         df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg}) # exporting it to a df
        
#     # converting date-time pattern which is of type String to type datetime,
#     # format is to be specified for the whole string where the placeholders are extracted by the method 
#     df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])
    
#     # split user and msg 
#     usernames = []
#     msgs = []
#     for i in df['user_msg']:
#         a = re.split('([\w\W]+?):\s', i) # lazy pattern match to first {user_name}: pattern and spliting it aka each msg from a user
#         if(a[1:]): # user typed messages
#             usernames.append(a[1])
#             msgs.append(a[2])
#         else: # other notifications in the group(eg: someone was added, some left ...)
#             usernames.append("Group Notification")
#             msgs.append(a[0])

#     # creating new columns         
#     df['user'] = usernames
#     df['msg'] = msgs

#     # dropping the old user_msg col.
#     df.drop('user_msg', axis=1, inplace=True)
    
#     return df

In [None]:
# rawToDf("whatsapp_chat_data.txt", "24hr")

In [None]:
# Import text data: read the whole exported chat into memory.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open for the lifetime of the notebook kernel.
with open("whatsapp_chat_data.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [None]:
dummy = data.split("\n")
dummy

### Separate messages and date/time
Every message in the export starts with a date/time stamp in the format shown in the example below. We use that stamp to split the raw text into individual messages:

Example for dry run : `'16/08/18, 20:09 - '`

\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s

- the first part, `\d{1,2}`, matches one or two digits
- then a literal "/"
- then `\d{1,2}` again, which matches one or two digits
- then another "/"
- then `\d{2,4}`, which matches two to four digits (the year)
- then a ","
- then a whitespace character, written as `\s`
- then the hour, `\d{1,2}`, which can be one or two digits
- then the separator ":"

- then the minutes, `\d{2}`, which is exactly two digits
- and finally `\s-\s`: a whitespace character, a "-", and another whitespace character

In [None]:
# Regex for the date/time stamp that prefixes every message, e.g. "8/25/22, 04:08 - ".
# Written as a raw string so "\d" and "\s" reach the regex engine untouched
# instead of being parsed as (invalid) string escape sequences.
pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

# Extract only the messages; element 0 is whatever precedes the first stamp, so drop it
messages = re.split(pattern, data)[1:]
print(len(messages))

# Extract only the date/time stamps, in the same order as the messages
dates = re.findall(pattern, data)
print(len(dates))

**Sample illustration**

In [None]:
# regex pattern to track date and time (raw string, so "\d" and "\s" are passed
# to the regex engine as written rather than parsed as string escapes)
pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

# One exported line: "<date>, <time> - <sender>: <message text>"
text_data = '8/25/22, 04:08 - Segun Lawal: Hello, please this video stopped playing at the 2:45 timestamp, only the audio was running'

In [None]:
# Extract message
re.split(pattern, text_data)[1]

In [None]:
# Extracting only the dates
re.findall(pattern, text_data)[0]

In [None]:
messages[:5]

In [None]:
dates[:5]

Each stamp is a simple date/time string. We will apply the following transformations to extract the date and the time for our analysis.

In [None]:
string = "8/26/22, 20:09 - "
string

In [None]:
string = string.split(',')
string

In [None]:
date,time = string[0],string[1]
date,time

In [None]:
time = time.split('-')
time = time[0].strip() # remove white spaces
time

In [None]:
print(date+ " and "+ time)

### Separate date and time
We will create a function that separates the time from the date

In [None]:
# Turn a raw stamp such as "8/26/22, 20:09 - " into "8/26/22 20:09"
def get_date_and_time(string):
    """Return the date and time held in a chat stamp, joined by one space.

    The text before the first comma is taken as the date. The time is the
    part of the second comma-separated field that precedes the first "-",
    with surrounding whitespace stripped.
    """
    fields = string.split(",")
    date_part = fields[0]
    time_part = fields[1].split("-")[0].strip()
    return f"{date_part} {time_part}"

### Create a dataframe for messages and their corresponding time

Now we have separated the messages and the time, let's create a dataframe with two columns for messages and date/time.

In [None]:
# One row per message: the raw "sender: text" string and its date/time stamp
chat_columns = {"user_messages": messages, "message_date": dates}
df = pd.DataFrame(chat_columns)

# Normalise every stamp to "<date> <time>" with get_date_and_time
df["message_date"] = df["message_date"].map(get_date_and_time)

df.head()

In [None]:
# Let's rename the "message_date" column to "date"
df.rename(columns={"message_date": "date"}, inplace=True)

df.head()

In [None]:
df["user_messages"]

### Separating users name/numbers from users message
If we look at the `user_messages` column, we see that each user's name/number is attached to the front of their message, so we need to split the two apart. We will use a regular expression to separate the user's name/number from the message text.

**Sample illustration**

In [None]:
import random
txt = df["user_messages"][random.randrange(0, len(df["user_messages"]))]
txt

In [None]:
re.split('([\w\W]+?):\s', txt)

In [None]:
# Separate users number/name from users message
users = []
messages = []

# "<name or number>: <text>" -- capture the shortest prefix that is followed by
# ":" and a whitespace character. Raw string so "\w", "\W" and "\s" are not
# parsed as string escapes.
user_pattern = re.compile(r'([\w\W]+?):\s')

# loop through the "user_messages" column
for message in df["user_messages"]:

    # Split only at the FIRST "name: " match. Without maxsplit=1 any later ": "
    # inside the text is matched as well; because the prefix group can match
    # anything, that second match begins right where the first one ended, which
    # leaves entry[2] empty and silently discards the message body.
    entry = user_pattern.split(message, maxsplit=1)

    # a match yields ['', user, text]: this message has a name/number
    if entry[1:]:
        users.append(entry[1])
        messages.append(entry[2])

    # else it is a "Group Notification"
    else:
        users.append("Group Notification")
        messages.append(entry[0])

# add the users and messages to new columns in the dataframe
df["User"] = users
df["message"] = messages

# remove the trailing newline at the end of each message
def get_string(text):
    """Return ``text`` without its trailing newline character(s).

    Only newlines at the very end are removed, so a multi-line message keeps
    all of its lines. Splitting on the newline character and keeping the
    first piece would discard every line after the first.
    """
    return text.rstrip("\n")

df["message"] = df["message"].apply(lambda text: get_string(text))

df.head()

In [None]:
# Discard the raw "user_messages" column, put the remaining columns in
# message / date / User order, and capitalise the first two names
df = (
    df.drop(columns=["user_messages"])
    .loc[:, ["message", "date", "User"]]
    .rename(columns={"message": "Message", "date": "Date"})
)
df.head()

In [None]:
df.head()

In [None]:
# get the shape of the dataframe
df.shape

### Breaking Down the "Date" Column

Our dates are in the format

`8/22/22 19:06`

Which represents:

`MM/DD/YY HH:MM`

However, `pd.to_datetime()` will convert it to this format

`YYYY-MM-DD HH:MM:SS`

In [None]:
# Parse the "Date" strings once and reuse the result, instead of re-parsing
# the whole column for every derived field below.
parsed_dates = pd.to_datetime(df['Date'])

# Extract the date without time
df['Only date'] = parsed_dates.dt.date

# Get only the Year
df['Year'] = parsed_dates.dt.year

# Get month number
df['Month_num'] = parsed_dates.dt.month

# Get name of month
df['Month'] = parsed_dates.dt.month_name()

# Get Day
df['Day'] = parsed_dates.dt.day

# Get name of Day
df['Day_name'] = parsed_dates.dt.day_name()

# Get hour
df['Hour'] = parsed_dates.dt.hour

# Get minutes
df['Minute'] = parsed_dates.dt.minute

# View dataframe head
df.head()

We have successfully completed the `Text Preprocessing`. We will now proceed to `Text Analysis`.


## Text Analysis

## Numerical Analysis

**Total number of messages in the group**

In [None]:
# Get total number of messages in the group
len(df["Message"])

**Total number of words in the group**

In [None]:
# Total number of words in the group chat: split every message on whitespace
# and gather all of the pieces into a single flat list
words = [word for message in df["Message"] for word in message.split()]

print(len(words))

**Total number of media files shared in the group**

In [None]:
# Get number of media files shared in the group
len(df[df["Message"] == "<Media omitted>"])

**Total number of links in the group**

In [None]:
# Get number of links in the group
extract = URLExtract()

links = []
for message in df["Message"]:
    links.extend(extract.find_urls(message))
    
len(links)

**Most active users**

In [None]:
# discard Group notifications
active_users_df = df[df['User'] != 'Group Notification']
active_users_df.head()

In [None]:
# Active users and number of messages posted in the group.
# Name the counts column explicitly: value_counts() labels its result "User" on
# pandas < 2.0 but "count" on pandas >= 2.0, so renaming a column called "User"
# silently does nothing on newer versions.
active_users_df["User"].value_counts().to_frame(name="posts")

In [None]:
# Top 5 active users
top_5_users = active_users_df["User"].value_counts().head()
top_5_users

In [None]:
# Plot the bar chart of top 5 active users
plt.figure(figsize=(10, 7))
plt.bar(top_5_users.index, top_5_users.values)
plt.xticks(rotation = 'vertical', fontsize=14)
plt.yticks(fontsize=14)
plt.title("Top 5 Active Users", fontsize=18);

### Word Cloud

In [None]:
# Get dataframe without "<Media omitted>".
# .copy() makes wc_df an independent frame, so adding a column to it later does
# not trigger SettingWithCopyWarning or depend on whether the filter returned a
# view of df.
wc_df = df[df["Message"] != "<Media omitted>"].copy()
wc_df.head()

In [None]:
# create word cloud object
wc = WordCloud(width=1000, height=500, min_font_size=10, 
              background_color="white")

# generate wordcloud image
wc = wc.generate(wc_df["Message"].str.cat(sep=" "))

# plot word cloud
plt.figure(figsize=(10, 7))
plt.imshow(wc)
plt.axis("off");

In [None]:
# create word cloud object
wc = WordCloud(width=1000, height=500, min_font_size=10, 
              background_color="white")

# generate wordcloud image
wc = wc.generate(wc_df["Message"].str.cat(sep=" "))

# plot word cloud
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wc)
ax.axis("off");

### Most Common Words

In [None]:
# without "<Media omitted>" in messages column
wc_df

In [None]:
# Clean text
def clean_text(text):
    """Normalise one chat message for word-frequency analysis.

    Steps, in order: strip links, drop punctuation and lowercase, keep
    letters only, remove stopwords, lemmatize, and re-join with single spaces.

    Relies on the module-level ``punct``, ``stopwords`` and ``wn`` objects.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" when none survive).
    """
    # Remove links and websites FIRST, while "://", "." and digits are still in
    # place. If punctuation and digits are stripped beforehand, a URL such as
    # "https://t.co/abc123def" collapses to "httpstcoabc def" and the "def"
    # fragment leaks into the word counts. IGNORECASE because lowercasing has
    # not happened yet at this point.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"www\.\S+", "", text, flags=re.IGNORECASE)

    # Remove punctuation and lowercase, character by character
    text = "".join([char.lower() for char in text if char not in punct])

    # Remove any other signs
    text = " ".join(re.split(r'\W+', text))

    # Remove numerals, leaving only alphabetic characters
    text = re.sub('[^A-Za-z]+', ' ', str(text))

    # Remove stopwords
    tokens = [word for word in text.split() if word not in stopwords]

    # lemmatize
    tokens = [wn.lemmatize(word) for word in tokens]

    return " ".join(tokens)

wc_df["clean_message"] = wc_df["Message"].apply(clean_text)
wc_df.head()

In [None]:
# Get top 20 words: flatten the cleaned messages into one token list, count the
# tokens, and order the 20 most frequent by ascending count (column 1)
words = [token for message in wc_df["clean_message"] for token in message.split()]

top_words = Counter(words).most_common(20)
most_common = pd.DataFrame(top_words).sort_values(1)
most_common

In [None]:
# Plot a graph of the most common words
plt.figure(figsize=(10, 7))
plt.barh(most_common[0], most_common[1])
plt.xticks(rotation="vertical")
plt.title("Most Common Words");

### Number of Emojis