In [18]:
import os
import json
import pandas as pd

def load_synthetic_data(folder_path: str):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                chats = json.load(file)["messages"]
                for chat in chats:
                    data.append({"filename": filename, "message": chat['message'], "user": chat['user']})
    return data


synthetic_data = load_synthetic_data("../synthetic_data/")
synthetic_data[:5]

[{'filename': 'chat_142.json',
  'message': "Hey everyone! We've got a bug in our production environment that's causing some downtime.",
  'user': 'alice'},
 {'filename': 'chat_142.json',
  'message': 'What kind of bug are we talking about here?',
  'user': 'bob'},
 {'filename': 'chat_142.json',
  'message': "It seems to be an issue with the user authentication module. Users can't log in after the latest update.",
  'user': 'charlie'},
 {'filename': 'chat_142.json',
  'message': "Right, and it's affecting a lot of our users. We need to fix this ASAP.",
  'user': 'alice'},
 {'filename': 'chat_142.json',
  'message': "I can look into the logs to see what's going wrong with authentication.",
  'user': 'dave'}]

In [19]:
pd.DataFrame(synthetic_data)['filename'].value_counts().describe()

count    360.000000
mean       7.786111
std        2.657061
min        2.000000
25%        6.000000
50%        7.000000
75%        9.000000
max       21.000000
Name: count, dtype: float64

There is a bit of variety in the number of messages, LLMs as usual don't follow number commands that consistently and I didn't really put any validation loop around it, so let's take that into account while generating the split of the data

next step: add noise to the dataset by adding multiple discussions happening at the same time and then split into train/test set. 

In [10]:
!wget -O ../data/ubuntu_irc_data_raw.txt "https://raw.githubusercontent.com/jkkummerfeld/irc-disentanglement/refs/heads/master/data/train/2005-02-06.train-c.raw.txt" 

--2025-02-26 09:26:41--  https://raw.githubusercontent.com/jkkummerfeld/irc-disentanglement/refs/heads/master/data/train/2005-02-06.train-c.raw.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93553 (91K) [text/plain]
Saving to: ‘../data/ubuntu_irc_data_raw.txt’


2025-02-26 09:26:41 (3.78 MB/s) - ‘../data/ubuntu_irc_data_raw.txt’ saved [93553/93553]



In [None]:
def get_irc_messages(file_path:str) -> list[str]:
    irc_messages = []
    with open (file_path, 'r') as file:
        for line in file.readlines():
            if not line.startswith('=='):
                # remove the timestamps
                irc_messages.append(line.split(']')[-1].strip())
    return irc_messages


irc_messages = get_irc_messages("../data/ubuntu_irc_data_raw.txt")
irc_messages[:10]

["<jdub> usual: yes, because that's safe",
 '<usual> jdub, I see',
 '<usual> jdub, makes sense',
 '<hams> do i make the linux softlink point to linux-headers-2.6.8.1-4 or linux-hedaers-2.6.8.1-amd64-k8?',
 '<knghtbrd> ahhh, I had forgotten what a joy application keys are...',
 '<Adrenal> eheh',
 "<drspin> eyequeue: and a good philosophy it is :) but, taking iTunes for example, what's wrong with being able to do everything with my music (organize, burn, save, share, find, listen, etc...) all in once place?",
 '<usual> jdub, do you or have you used beagle?',
 '<eyequeue> drspin: never seen it, sorry (no apple stuff in years)',
 '<knghtbrd> and the joy that is having your terminal program claim to be a given terminal, but send the wrong keystrokes for that terminfo  ;)']

In [28]:
import re

def chat_message_structure_from_irc_messages(irc_messages):
    parsed = []
    for entry in irc_messages:
        # Match the pattern: <username> followed by the message text
        match = re.match(r'<([^>]+)>\s*(.*)', entry)
        if match:
            username = match.group(1)
            message = match.group(2)
            parsed.append({'user': username, 'message': message})
    return parsed

noise_messages = chat_message_structure_from_irc_messages(irc_messages)
len(noise_messages), noise_messages[:5]

(1306,
 [{'user': 'jdub', 'message': "usual: yes, because that's safe"},
  {'user': 'usual', 'message': 'jdub, I see'},
  {'user': 'usual', 'message': 'jdub, makes sense'},
  {'user': 'hams',
   'message': 'do i make the linux softlink point to linux-headers-2.6.8.1-4 or linux-hedaers-2.6.8.1-amd64-k8?'},
  {'user': 'knghtbrd',
   'message': 'ahhh, I had forgotten what a joy application keys are...'}])

In [40]:
synth_df = pd.DataFrame(synthetic_data)
synth_df['calendar_event'] = True
noisy_df = pd.DataFrame(noise_messages)
noisy_df['calendar_event'] = False

def split_data(synth_df, noisy_df, train_size=0.7, test_size=0.2, validation_size=0.1):

    def custom_sample(synth_df, noisy_df, count):
        sample_df = synth_df.sample(n=count, random_state=42)
        sample_noisy_df = noisy_df.sample(n=count, random_state=42, replace=True)
        synth_df = synth_df.drop(sample_df.index)
        return pd.concat([sample_df, sample_noisy_df], ignore_index=True), synth_df, noisy_df


    train_count = int(len(synth_df) * train_size)
    test_count = int(len(synth_df) * test_size)
    validation_count = int(len(synth_df) * validation_size)

    train, synth_df, noisy_df = custom_sample(synth_df, noisy_df, train_count)
    test, synth_df, noisy_df = custom_sample(synth_df, noisy_df, test_count)
    validation, synth_df, noisy_df = custom_sample(synth_df, noisy_df, validation_count)
    
    return train, test, validation


train, test, eval = split_data(synth_df, noisy_df)

In [57]:
assert set(train.columns) == set(test.columns)
assert set(test.columns) == set(eval.columns)

train.head()

Unnamed: 0,filename,message,user,calendar_event
0,chat_239.json,Maybe next week? We're a bit swamped this week...,carol,True
1,chat_82.json,"Hey alice, maybe there are some conflicts in t...",carol,True
2,chat_32.json,I appreciate the suggestions! I'll try updatin...,hannah123,True
3,chat_57.json,"Hi alice, I think monday could work for me if ...",bob,True
4,chat_32.json,"Thanks sarah_coder! I did try that, but I'm ha...",hannah123,True


In [58]:
train.calendar_event.value_counts(), test.calendar_event.value_counts(), eval.calendar_event.value_counts()

(calendar_event
 True     1962
 False    1962
 Name: count, dtype: int64,
 calendar_event
 True     560
 False    560
 Name: count, dtype: int64,
 calendar_event
 True     280
 False    280
 Name: count, dtype: int64)

In [60]:
train.to_csv('../data/train.csv', index=False)
test.to_csv('../data/test.csv', index=False)
eval.to_csv('../data/eval.csv', index=False)