<a href="https://colab.research.google.com/github/jogianni/sectionBapp/blob/main/SectionBWhatsapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**IMPORTS**

In [None]:
import pandas as pd
import re
import io
import os
from google.colab import drive
from google.colab import files
import numpy as np
import datetime
!pip install emoji --upgrade
import emoji
from collections import Counter

**Load - Mount GDrive and Read-In**

In [None]:
#mount to gdrive - you will be asked to sign into your Google account and authorize access
drive.mount('/content/gdrive')
#relative path of the project folder inside the mounted drive (not referenced by the other cells shown in this notebook)
root_path = 'gdrive/My Drive/IESE/'

In [None]:
#os.listdir('/content/gdrive/My Drive/IESE')
#make the project folder the working directory so the file names used in later cells can be relative
os.chdir('/content/gdrive/My Drive/IESE')

In [None]:
def read_file(file):
    '''Reads a WhatsApp text export into a list of strings, one per line.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded chat export.

    Returns
    -------
    list of str
        The lines of the file with line terminators removed.
    '''
    # The context manager closes the handle even if reading fails;
    # the previous version opened the file and never closed it.
    with open(file, 'r', encoding='utf-8') as handle:
        # read() + splitlines() (rather than iterating the handle) keeps the
        # original splitting rules, which also break on Unicode line separators.
        return handle.read().splitlines()

#load the exported chat as a list of lines (path is relative to the current working directory)
chat = read_file('sectionB10.24.2021.txt')

**Transform - Add some auxiliary data as well as parse dates and do light analysis**

In [None]:
#Drop lines that don't start with '[' - the class tends to send lots of lists with carriage returns. We don't want to count those as individual messages and it's hard to identify their owner, so we drop them
chat = [line for line in chat if line[:1] == '[']

In [None]:
#reads date and time if your phone is set to military (24-hour) time
#if your phone is am/pm, d{2}.{3} is needed to replace the last d{2} in time
#note: '(?!)' is an empty negative lookahead that can never match, so only the '.+' branch of <content> is ever used
regex = re.compile(r'\[(?P<date>\d{1,2}\/\d{1,2}\/\d{2})\s(?P<time>\d{1,2}:\d{2}:\d{2})]\s(?P<Name>[^:]*):\s(?P<content>.+|\n+(?!)\[\d{2}\/\d{2}\/\d{4})')

#print the before and after chat counts: building chat_list drops lines that don't comply with the regex - we are only interested in timestamped messages anyway
print(len(chat))
chat_matches = list(map(regex.search, chat))
chat_list = [match.groupdict() for match in chat_matches if match is not None]
print(len(chat_list))
#print (chat_list)

In [None]:
#reads date and time if your phone is set to am/pm (12-hour) time - uncomment this cell and use it instead of the 24-hour cell above

#regex = re.compile(r'\[(?P<date>\d{1,2}\/\d{1,2}\/\d{2})\s(?P<time>\d{1,2}:\d{2}:\d{2}.{3})]\s(?P<Name>[^:]*):\s(?P<content>.+|\n+(?!)\[\d{2}\/\d{2}\/\d{4})')

#print (len(chat))
#chat_matches = [regex.search(content) for content in chat]
#chat_list = [m.groupdict() for m in chat_matches if not m is None]
#print (len(chat_list))
#print (chat_list)

In [None]:
#make Dataframe
df = pd.DataFrame(chat_list)

#the export writes dates day-first (dd/mm/yy). Parse BOTH derived columns with explicit formats so they agree:
#without a format, pd.to_datetime may read an ambiguous date such as 5/10/21 month-first (May 10),
#which contradicts the day-first 'date' column (October 5)
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d/%m/%y %H:%M:%S')
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y')

#add some datums
df['msg_len']  = df['content'].str.len()
#letter count = message length with the spaces removed
df['Letter_Count'] = df['content'].str.replace(' ', '', regex=False).str.len()
#split() with no separator splits on runs of whitespace, so double spaces no longer count as empty "words"
df['Word_Count'] = df['content'].str.split().str.len()
#letter and word totals
df['Letter_Count'].sum(), df['Word_Count'].sum()

In [None]:
#Read in auxiliary data - I took in my classmates' team and country of origin to give further analytics
#the merge in the next cell joins on 'Name', so COA.csv needs a 'Name' column that matches the chat display names
coa_df = pd.read_csv('COA.csv') 

In [None]:
#tack on auxiliary data - the left join keeps every chat message; senders with no match in coa_df get NaN in the added columns
merged_df = pd.merge(left=df, right=coa_df, how='left', left_on='Name', right_on='Name')

In [None]:
#output basic data before doing some emoji analysis - the output name reuses the input file name so exports stay distinguishable
merged_df.to_csv("sectionB10.24.2021_converted.csv",index=False)

**Emoji analysis** - Here we use the emoji library to pick out the emojis in the chat. We count them per message and output one wide table with a column for each of the most-used emojis, so we can use it properly in Tableau.


In [23]:
#use emoji package to decode emoji output
def extract_emojis(str):
    """Return the emoji characters found in a string, in order, as one string.

    The check is done character by character, so only single-code-point
    emojis are recognised (same granularity as before).

    Parameters
    ----------
    str : str
        Message text to scan. (The name shadows the builtin; it is kept so
        existing keyword callers keep working.)

    Returns
    -------
    str
        Concatenation of every character of the input that the emoji
        package lists as an emoji; '' if there are none.
    """
    # The emoji package changed its lookup table across releases:
    #   * recent releases expose a flat dict `EMOJI_DATA` keyed by emoji,
    #   * 1.x exposed `UNICODE_EMOJI` as {language: {emoji: name}},
    #   * older releases exposed `UNICODE_EMOJI` as a flat {emoji: name}.
    # Testing `c in emoji.UNICODE_EMOJI` directly matches nothing on 1.x and
    # raises AttributeError once the attribute is gone, so resolve the table first.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        legacy_table = emoji.UNICODE_EMOJI
        # per-language layout -> use the English table; flat layout -> use as is
        emoji_table = legacy_table.get('en', legacy_table)
    return ''.join(c for c in str if c in emoji_table)


In [24]:
#create emoji counts 
def Count_Emojis(df, only_one_per_message = False):
    """Build a per-message count table for the 50 most frequent emojis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'content', 'date', 'Name',
        'Country of Origin', 'Diagnosed Country' and 'Team'.
    only_one_per_message : bool, default False
        If True, each cell is 1 when the emoji occurs in the message and 0
        otherwise; if False, each cell is the number of occurrences.

    Returns
    -------
    pandas.DataFrame
        One row per message: one column per top emoji (most frequent first)
        followed by the six columns listed above, copied from ``df``.
    """
    messages = df['content']

    # Rank emojis over the whole chat and keep the 50 most used.
    every_emoji = ''.join(extract_emojis(text) for text in messages)
    top_emojis = [symbol for symbol, _ in Counter(every_emoji).most_common()[:50]]

    table = {symbol: [] for symbol in top_emojis}

    # One pass per message; a Counter yields 0 for emojis the message lacks.
    for text in messages:
        tally = Counter(extract_emojis(text))
        for symbol in top_emojis:
            hits = tally[symbol]
            if only_one_per_message:
                table[symbol].append(1 if hits else 0)
            else:
                table[symbol].append(hits)

    # Carry the message text and the descriptive columns alongside the counts.
    table['content'] = list(messages)
    for column in ('date', 'Name', 'Country of Origin', 'Diagnosed Country', 'Team'):
        table[column] = list(df[column])

    return pd.DataFrame(table)

In [25]:
#build the per-message emoji count table from the merged frame, so the previously ingested demographic data is carried along
emojis_counts = Count_Emojis(merged_df)

In [26]:
#output the emoji count table to CSV (index column omitted)
emojis_counts.to_csv("sectionB10.24.2021_converted_emoji.csv",index=False)

**Crush Metric** - who is talkative around whom? Within the section chat we look at who talks the most within a 2-minute window after someone else speaks. In Tableau we will go on to measure this against each person's average. So if Suzy usually sends 1 message per day but sends 5 when Johnny talks, maybe she likes to see his name?



In [27]:
#get top 3 repliers function
def get_most_replied_users(replies_dict):
    """Return the three keys with the highest values, joined by '|'.

    Keys are ordered by descending value; keys with equal values keep the
    order in which they appear in ``replies_dict``. Fewer than three keys
    yields a shorter string, and an empty mapping yields ''.
    """
    ranked = list(replies_dict)
    ranked.sort(key=replies_dict.get, reverse=True)
    top_three = ranked[:3]
    return '|'.join(top_three)

In [28]:
#create counter function
def count_user_replies(df, reply_duration_minutes=2, include_self=True):
    """Count, for every sender, who posts shortly after that sender's messages.

    For each message, every distinct name that posts strictly after it and
    strictly before ``reply_duration_minutes`` later is counted once as a
    "reply" to the message's sender.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'datetime' column (timestamps) and a 'Name' column.
    reply_duration_minutes : int or float, default 2
        Length of the reply window in minutes.
    include_self : bool, default True
        When True (the original behaviour) a sender's own follow-up messages
        inside the window count as replies to themselves. Pass False to count
        only other people.

    Returns
    -------
    dict
        ``{sender: Counter({replier: number_of_messages_replied_to})}``.
        Every sender in ``df`` gets an entry, even if nobody replied.
    """
    window = datetime.timedelta(minutes=reply_duration_minutes)  # same for every message, so built once
    timestamps = df['datetime']
    names = df['Name']

    users_replied_to_user = {}
    #iterate through messages
    for sent_at, user in zip(timestamps, names):
        #if user is not in the dictionary add it
        repliers = users_replied_to_user.setdefault(user, Counter())

        #find the messages within a time window after the current message datetime
        in_window = (timestamps > sent_at) & (timestamps < sent_at + window)
        #only non-duplicates; dict.fromkeys keeps first-appearance order so the
        #result does not depend on set hash order (which can vary between runs)
        responders = list(dict.fromkeys(names[in_window]))
        if not include_self:
            responders = [name for name in responders if name != user]

        repliers.update(responders)

    return users_replied_to_user

In [29]:
#build the reply counters: a dict mapping each user to a Counter of who posted shortly after that user's messages
replies =   count_user_replies(df)

In [31]:
#reshape the per-user reply counters into [user, top repliers] rows
replies_reshaped = [[user, get_most_replied_users(counts)] for user, counts in replies.items()]

replies_reshaped_df = pd.DataFrame(replies_reshaped)

#print(replies_reshaped_df)



**Output**

In [32]:
#output to csv
replies_reshaped_df.to_csv("sectionB10.24.2021_replies_df.csv",index=False)