In [1]:
import pandas as pd
import os
import re

In [2]:
# Relative location of the saved Zoom chat transcript to parse.
path = "chat_history/"
zoom_logs = f"{path}meeting_saved_chat 3.txt"

In [3]:
# Read the txt file into a list of lines (line terminators removed).
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open for the lifetime of the kernel, and the explicit
# encoding makes non-ASCII names/messages decode the same way on every platform.
# NOTE(review): assumes the Zoom export is UTF-8 encoded - confirm against a real export.
with open(zoom_logs, encoding="utf-8") as log_file:
    lines = log_file.read().splitlines()

In [4]:
# Split the lines into the header (tag) and the message.
# A line that starts with a tab is a message line belonging to the most recent
# header; multiline messages are combined into a comma separated string.
#
# `lines` is only read, never consumed, so this cell can be re-run without
# re-reading the file, and the pass is linear (no repeated `list.pop(0)`).

tags = []
messages = []

for line in lines:
    if tags and line.startswith('\t'):
        # Continuation of the current message: drop the leading tab(s) and
        # append, separating from any text collected so far with ', '.
        piece = line.lstrip("\t")
        messages[-1] = messages[-1] + ', ' + piece if messages[-1] else piece
    else:
        # Any other line (and always the very first line) opens a new entry.
        tags.append(line)
        messages.append("")

In [5]:
# Parse through the tags to extract the sender, recipient, and timestamp.
# The timestamp is everything before the first space.
# The sender is the value between 'From ' and ' to ' in the tag.
# The recipient follows ' to ', with the trailing ':' and any
# '(Direct Message)' marker removed.


def _parse_tag(tag):
    """Split one chat header line into ``(sender, recipient, timestamp)``.

    Args:
        tag: A header line shaped like ``"<time> From <sender> to <recipient>:"``.

    Returns:
        A 3-tuple of strings ``(sender, recipient, timestamp)`` with surrounding
        whitespace removed from the sender and recipient.

    Raises:
        ValueError: If the line does not contain both ``"From "`` and ``" to "``.
    """
    timestamp = tag.split(" ")[0]

    _, from_marker, remainder = tag.partition("From ")
    # Use the same ' to ' delimiter (with trailing space) for both fields so a
    # sender whose name merely contains ' to...' (e.g. 'john tomlin') is not
    # cut short.
    sender, to_marker, recipient = remainder.partition(" to ")
    if not from_marker or not to_marker:
        raise ValueError(f"Unrecognised chat header line: {tag!r}")

    # Only drop the final character when it really is the header's colon, and
    # strip *after* removing the marker so no stray whitespace is left behind.
    recipient = recipient.rstrip()
    if recipient.endswith(":"):
        recipient = recipient[:-1]
    recipient = recipient.replace("(Direct Message)", "").strip()

    return sender.strip(), recipient, timestamp


parsed_tags = [_parse_tag(tag) for tag in tags]
senders = [sender for sender, _, _ in parsed_tags]
recipients = [recipient for _, recipient, _ in parsed_tags]
timestamps = [timestamp for _, _, timestamp in parsed_tags]

In [6]:
# Assemble the structured logs: one row per parsed header, holding its sender,
# recipient, timestamp and combined message text.

log_columns = {
    'Sender': senders,
    "Recipients": recipients,
    "Time": timestamps,
    "Message": messages,
}
structured_logs = pd.DataFrame(log_columns)

In [7]:
# Quick sanity check of the parse: per-column count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
structured_logs.describe()

Unnamed: 0,Sender,Recipients,Time,Message
count,168,168,168,168
unique,55,9,160,161
top,Vikram Chandna,Everyone,08:58:50,https://chat.whatsapp.com/EJvwZfLezsdKyVzN71tuSu
freq,25,150,2,3


In [9]:
# Extract unique senders
# NOTE: this rebinds `senders` from the per-message list to the unique values,
# so the cells above must be re-run before rebuilding `structured_logs`.
senders = structured_logs['Sender'].unique()

# Extract and group all messages by Sender into a comma separated string
chat = [
    ", ".join(structured_logs.loc[structured_logs['Sender'] == sender, 'Message'])
    for sender in senders
]

# Extract the first email mentioned in each sender's messages using a regex pattern.
# Senders with no email get an empty string.
# TODO: Add verification to determine if email is the Senders email
#
# Pattern notes:
#   * `[._-]` lists the three allowed separators literally. Written as `[.-_]`
#     it is a character *range* that excludes '-' and includes digits,
#     uppercase letters, '@', ':' and more.
#   * `[A-Za-z]` for the TLD - a '|' inside a character class is a literal pipe.
#   * Groups are non-capturing; only the whole match is used.
pattern = r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(?:\.[A-Za-z]{2,})+'
email_matches = (re.search(pattern, line) for line in chat)
emails = [match.group(0) if match else "" for match in email_matches]

# NOTE: this prints personal email addresses - clear the cell output before
# sharing or committing the notebook.
print(emails)

['', 'noahtsehai@gmail.com', '', '', 'raindrops.1021@gmail.com', 'Rachelziering1@icloud.com', 'adwaitgo@gmail.com', 'st3phani3.montgom3ry@gmail.com', 'kajalpatel7@yahoo.com', 'romasel.dadea@yahoo.com', '', 'ssiva.mobile@gmail.com', '', '', '', 'Brittany.thomson15@outlook.com', 'vchandna0612@gmail.com', '', 'abhiraammv@gmail.com', 'emailstephney@gmail.com', 'Hasibe.kahraman@hotmail.com', 'Maritere.sanabria1984@gmail.com', '', 'beshob098@gmail.com', '', 'susan.yeruski@gmail.com', '', 'abhishekguti.145@gmail.com', '', 'vvatkinson@yahoo.com', '', '', 'Bearbarebobo@gmail.com', 'jhrp64@gmail.com', '', 'robin_l_t@icloud.com', 'ping2teja@yahoo.com', 'Pguimond@govst.edu', 'faiconperry449@gmail.com', 'iceland9596@live.com', 'vijithkumarkanchana@gmail.com', '', '', '', '', '', '', 'preetidav@gmail.com', '', 'perezcruzmaria1@gmail.com', 'yili.hengg@gmail.com', '', '', '', '']


In [10]:
# Build the per-sender DataFrame: sender name, extracted email (or ""), and
# that sender's combined chat messages.
per_sender_columns = {'name': senders, 'email': emails, 'chat': chat}
chat_logs_df = pd.DataFrame(per_sender_columns)

In [11]:
# Sort by sender name and renumber the index from 0.
# Chained, non-inplace calls produce the result in one expression rather than
# mutating the frame step by step with `inplace=True`.
chat_logs_df = chat_logs_df.sort_values('name').reset_index(drop=True)

In [12]:
# Export the per-sender table (name, email, chat) to 'zoom_chat_logs.csv'.
# The path is relative, and no `index` argument is passed, so pandas' default
# index handling applies.
# NOTE: the file contains names and email addresses - handle it accordingly.
chat_logs_df.to_csv('zoom_chat_logs.csv')