In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import csv

In [None]:
# Load the exported chat into an object array; each line of the file becomes
# one element (assuming no tab characters occur inside a line).
# The context manager guarantees the file handle is closed after loading.
with open('path/to/your/chat/data.csv', encoding='utf-8') as file:
    # comments=None: loadtxt treats '#' as a comment marker by default, which
    # would silently cut off any message from its first '#' onwards.
    chat = np.loadtxt(file, delimiter="\t", dtype='O', comments=None)

print(chat)

['[13/05/2023, 20:50:36] Milo: \u200eMessages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.'
 '[13/05/2023, 20:50:36] Milo: Heyhey'
 "[13/05/2023, 20:50:47] Milo: I'm in a spoons in Holborn if ya fancy joining"
 ... '[16/11/2023, 01:25:29] Milo: Yeah'
 '[16/11/2023, 09:23:03] Ina: Yeah neither'
 '[16/11/2023, 09:23:14] Ina: But just sucks']


In [3]:
# Number of entries loaded from the export, before any cleaning is applied.
print(len(chat))

2317


In [None]:
# Cleaning rules for the dataset: (compiled pattern, replacement) pairs that
# cleaner() applies in this order. Compiled once here instead of being rebuilt
# on every call. Add further rules to this list if more noise turns up.
CLEANING_RULES = [
    # "[dd/mm/yyyy, hh:mm:ss] Sender:" prefix. [^:]+ stops at the FIRST colon
    # after the timestamp, so colons inside the message body survive and sender
    # names containing digits (e.g. phone numbers) are stripped as well.
    (re.compile(r'\[\d+/\d+/\d+, \d+:\d+:\d+\][^:]+:'), ''),
    # U+200E LEFT-TO-RIGHT MARK, written as an escape so it is visible in source
    (re.compile('\u200e'), ''),
    # WhatsApp's encryption notice (leading space is what is left after the
    # prefix and the U+200E mark have been removed)
    (re.compile(r' Messages and calls are end-to-end encrypted\. No one outside of this chat, not even WhatsApp, can read or listen to them\.'), ''),
    # Placeholders for media, deleted/edited messages and missed calls
    (re.compile(r'image omitted'), ''),
    (re.compile(r'This message was deleted\.'), ''),
    (re.compile(r'Missed voice call'), ''),
    (re.compile(r'<This message was edited>'), ''),
    (re.compile(r'video omitted'), ''),
    (re.compile(r'GIF omitted'), ''),
    (re.compile(r'sticker omitted'), ''),
    # Non-digit text followed by "N.N,-N.N" -- presumably shared-location links
    # ending in "lat,lon" coordinates (TODO confirm). Only matches when the
    # second number is negative, as in the original rule.
    (re.compile(r'\D+\d+\.\d+,-\d+\.\d+'), ''),
]


def cleaner(message: str) -> str:
    """Strip export metadata and placeholder text from one chat line.

    Every rule in CLEANING_RULES is applied in order and each match is
    replaced with its replacement string (currently always the empty string).

    Parameters
    ----------
    message : str
        One raw line of the chat export.

    Returns
    -------
    str
        The line with all matched patterns removed. The space that followed
        the sender's colon is kept, so results usually start with a space;
        lines that were pure metadata come back as '' or ' '.
    """
    for pattern, replacement in CLEANING_RULES:
        message = pattern.sub(replacement, message)

    return message

In [None]:
# Run every raw chat line through cleaner(), preserving the original order.

cleaned_chat = list(map(cleaner, chat))

print(cleaned_chat)

['', ' Heyhey', " I'm in a spoons in Holborn if ya fancy joining", ' Is Milo btw hahaha', ' Yoyoyo', ' We in french house having pints', ' Feel free to join w ur mate if ur bothered 😎', ' Whats ur plans tn we might go to some gaff but i’m not sure', " What's french house", ' Oh hahaha a french pub in soho', ' Tbf I don’t love soho too damn busy', ' Oh interesting', ' Who ya with', ' We could mebbe be down', ' Whats this gaff u going to', ' U know just ‘the irish’', ' Lmao no aoife sadbhb etc', ' Lmk anyways we’ll be here a little while', ' Like some guy we kinda know from college I’m not 100% about going', ' Whats ur guys buzz this eve', ' Fair fair fair', ' Tbf we are p chilled atm not sure', ' Depends how many more pints we get through', ' Else if I don’t end up seeing u tn we going to a thing in ndr next saturday and there’s talks of a thing in rory’s gaff after', ' U should come to that', ' 😎😎', ' Could be doooon', " What's ndr hahaha", ' Next door records its in shé booooo', ' Cou

The step above removes unnecessary metadata and irrelevant parts of the chat. If you find more irrelevant components, go back and add them to the list of regex rules in `cleaner`.

In [None]:
# Replace messages that are empty after cleaning with None so they can be
# dropped afterwards. strip() makes whitespace-only leftovers of any length
# count as empty, consistent with the strip()-then-truthiness check used when
# the messages are paired further down.

for idx, message in enumerate(cleaned_chat):
    # the None guard keeps this cell safe to re-run on an already-marked list
    if message is not None and not message.strip():
        cleaned_chat[idx] = None

print(cleaned_chat)

[None, ' Heyhey', " I'm in a spoons in Holborn if ya fancy joining", ' Is Milo btw hahaha', ' Yoyoyo', ' We in french house having pints', ' Feel free to join w ur mate if ur bothered 😎', ' Whats ur plans tn we might go to some gaff but i’m not sure', " What's french house", ' Oh hahaha a french pub in soho', ' Tbf I don’t love soho too damn busy', ' Oh interesting', ' Who ya with', ' We could mebbe be down', ' Whats this gaff u going to', ' U know just ‘the irish’', ' Lmao no aoife sadbhb etc', ' Lmk anyways we’ll be here a little while', ' Like some guy we kinda know from college I’m not 100% about going', ' Whats ur guys buzz this eve', ' Fair fair fair', ' Tbf we are p chilled atm not sure', ' Depends how many more pints we get through', ' Else if I don’t end up seeing u tn we going to a thing in ndr next saturday and there’s talks of a thing in rory’s gaff after', ' U should come to that', ' 😎😎', ' Could be doooon', " What's ndr hahaha", ' Next door records its in shé booooo', ' C

In [7]:
# Drop the None placeholders, keeping the remaining messages in their
# original order. None is tested by identity (`is not None`), the idiomatic
# form, rather than with `!=`.

clean_chat = [message for message in cleaned_chat if message is not None]

print(clean_chat)

[' Heyhey', " I'm in a spoons in Holborn if ya fancy joining", ' Is Milo btw hahaha', ' Yoyoyo', ' We in french house having pints', ' Feel free to join w ur mate if ur bothered 😎', ' Whats ur plans tn we might go to some gaff but i’m not sure', " What's french house", ' Oh hahaha a french pub in soho', ' Tbf I don’t love soho too damn busy', ' Oh interesting', ' Who ya with', ' We could mebbe be down', ' Whats this gaff u going to', ' U know just ‘the irish’', ' Lmao no aoife sadbhb etc', ' Lmk anyways we’ll be here a little while', ' Like some guy we kinda know from college I’m not 100% about going', ' Whats ur guys buzz this eve', ' Fair fair fair', ' Tbf we are p chilled atm not sure', ' Depends how many more pints we get through', ' Else if I don’t end up seeing u tn we going to a thing in ndr next saturday and there’s talks of a thing in rory’s gaff after', ' U should come to that', ' 😎😎', ' Could be doooon', " What's ndr hahaha", ' Next door records its in shé booooo', ' Could d

In [8]:
# Number of messages remaining after the None placeholders were removed.
len(clean_chat)

2277

In [None]:
# Rough size statistics for the cleaned messages, as guidance for choosing
# model parameters: words per message, their mean, and the longest message.

# a "word" is any whitespace-separated token
word_count = [len(tokens) for tokens in map(str.split, clean_chat)]

total_words = sum(word_count)
average_words = total_words / len(clean_chat)

print(average_words)

print(word_count)

# last expression of the cell: word count of the longest message
max(word_count)

5.0184453227931485


In [None]:
#directory in which the message-pair file is written (it is joined with the
#file name later on); replace the placeholder with a real folder on your machine

data_path = "/replace/with/your/file/path"


In [None]:
# Build [input, response] pairs from each message and the one that follows it.
# NOTE(review): sender names were stripped during cleaning, so a pair may
# consist of two messages from the same person -- confirm this is intended.

message_pairs = []

# zip the list against itself shifted by one to walk adjacent messages
for earlier, later in zip(clean_chat, clean_chat[1:]):
    previous_message = earlier.strip()
    current_message = later.strip()

    # skip the pair when either side is empty once outer whitespace is removed
    if not (previous_message and current_message):
        continue

    message_pairs.append([previous_message, current_message])

# preview the first ten pairs
for pair in message_pairs[:10]:
    print(pair)

['Heyhey', "I'm in a spoons in Holborn if ya fancy joining"]
["I'm in a spoons in Holborn if ya fancy joining", 'Is Milo btw hahaha']
['Is Milo btw hahaha', 'Yoyoyo']
['Yoyoyo', 'We in french house having pints']
['We in french house having pints', 'Feel free to join w ur mate if ur bothered 😎']
['Feel free to join w ur mate if ur bothered 😎', 'Whats ur plans tn we might go to some gaff but i’m not sure']
['Whats ur plans tn we might go to some gaff but i’m not sure', "What's french house"]
["What's french house", 'Oh hahaha a french pub in soho']
['Oh hahaha a french pub in soho', 'Tbf I don’t love soho too damn busy']
['Tbf I don’t love soho too damn busy', 'Oh interesting']


In [None]:
# Write the message pairs to a tab-separated file, one pair per row.
datafile = os.path.join(data_path, "organised_chat.tsv")

delimiter = '\t'

# newline='' hands line-ending control to the csv writer, as the csv module
# requires: without it the '\n' terminator is translated again by the text
# layer on Windows, so the file would not actually end rows with '\n'.
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(message_pairs)

print(datafile)

In [None]:
# NOTE(review): presumably the organised_chat.tsv written earlier in this
# notebook (held in `datafile`) -- confirm this placeholder path matches it.
data_file = '/path/to/your/organised_chat.tsv'