# mlx-imessage
**Author**: Hansen Han   
**Date**: September 8th, 2024   

**Objective:**  
The goal of this project is to locally fine-tune a large language model (LLM) using iMessage chat history to replicate your personal communication patterns. 

Fine-tuning directly on the device ensures privacy and avoids leaking data to online services. The project focuses on maintaining data security while producing a personalized, conversational AI that matches your own writing style and helps you come up with organic messages.

**Sources / Inspiration:**
- https://github.com/ShawhinT/YouTube-Blog/tree/main/LLMs/qlora-mlx: Local Fine-tuning on Mac (QLoRA with MLX)
- https://github.com/ml-explore/mlx-examples/tree/main/lora: Fine-Tuning with LoRA or QLoRA
- https://github.com/gavi/mlx-whatsapp:  An mlx project to train a base model on your whatsapp chats using (Q)Lora finetuning
- https://github.com/ishan0102/iClone: Clone your friends with iMessage and MLX


### Libraries / Config

In [None]:
import sqlite3
import pandas as pd
import csv
import random
import json
import subprocess
from mlx_lm import load, generate

# Notebook previews: show up to 100 rows and up to 25 columns of a DataFrame
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 25)

# --- user-specific settings: edit these two before running ---
db_path = "/path/to/chat.db"  # location of your iMessage chat.db
your_name = "Your Name"  # the name the prompts will refer to

# Model to fine-tune (other options: https://huggingface.co/mlx-community)
model_path = "mlx-community/Mistral-7B-Instruct-v0.2-4bit"
# File name used for the trained adapter weights
adapter_path = "adapters.npz"

# Generation length limit; the string form is what gets placed on a command line
max_tokens = 500
max_tokens_str = f"{max_tokens}"

### Helper Functions

In [None]:
def run_command_with_live_output(command: list[str]) -> None:
    """
    Run a command and print its output line by line as it executes.

    stderr is merged into stdout so that both streams are printed live, in the
    order the child emits them. Reading two separate pipes one after the other
    (all of stdout first, then stderr) can deadlock: a child that writes more
    to stderr than the OS pipe buffer holds blocks forever while this side is
    still waiting for stdout to reach end-of-file.

    Args:
        command (List[str]): The command and its arguments to be executed.

    Returns:
        None
    """
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # one pipe for both streams, see docstring
        text=True,
    ) as process:
        # Iterating the pipe yields each line as soon as it is available and
        # stops at end-of-file, i.e. once the child has closed its output.
        for line in process.stdout:
            print(line.strip())
    # Leaving the ``with`` block closes the pipe and waits for the child to exit.

def construct_shell_command(command: list[str]) -> str:
    """
    Render an argument list as a single line that can be pasted into a shell.

    Each argument is shell-quoted when it needs to be, so values containing
    spaces, quotes, commas or brackets survive unchanged; arguments made only
    of shell-safe characters are emitted as-is. Non-string arguments (e.g. an
    int) are converted with ``str`` first.

    Args:
        command (List[str]): The command and its arguments.

    Returns:
        str: The arguments joined by single spaces, quoted where necessary.
    """
    import shlex  # local import: not among the file's top-level imports

    return shlex.join(str(argument) for argument in command)

### Load and Process iMessage Chat Data

In [None]:
# Connect to the database and load the tables that you need
conn = sqlite3.connect(db_path)
try:
    chat_table = pd.read_sql_query("SELECT * FROM chat;", conn)
    messages_table = pd.read_sql_query("SELECT * FROM message;", conn)
    chat_message_join = pd.read_sql_query("SELECT * FROM chat_message_join;", conn)
finally:
    # Close the connection even when one of the queries fails
    conn.close()

# merge the tables together so that you get data you need (the text message, who is it from, did I send it?)
# step 1: attach each message to the chat it belongs to via the join table
messages_with_chat = messages_table[['ROWID', 'text', 'is_from_me']].merge(
    chat_message_join, left_on="ROWID", right_on="message_id"
)
# keep only the columns needed downstream (this also drops the message ROWID,
# so the chat table's ROWID can be merged in without a name clash)
messages_with_chat = messages_with_chat[['text', 'is_from_me', 'chat_id', 'message_id', 'message_date']]

# step 2: add the chat's identifier and guid
all_data = messages_with_chat.merge(
    chat_table[['ROWID', 'chat_identifier', 'guid']], left_on='chat_id', right_on="ROWID"
)
all_data

In [None]:
# Drop group chats (rows whose chat guid contains ';chat') since multi-person
# threads could be very confusing, and keep only the columns used from here on.
filtered_data = all_data.loc[
    ~all_data['guid'].str.contains(';chat'),
    ['text', 'is_from_me', 'message_date', 'chat_identifier'],
]
filtered_data

In [None]:
# Count the messages I sent in each chat; more than 100 sent messages is used
# as a rough sign of an actual relationship.
# This was something I experimented with, feel free to ignore it or pick another threshold!
sum_is_from_me = (
    filtered_data
    .groupby('chat_identifier')['is_from_me']
    .sum()
    .reset_index()
)
sum_is_from_me

# the numbers that I probably have an actual relationship with
sum_is_from_me.loc[sum_is_from_me['is_from_me'] > 100]

In [None]:
# collect the identifiers of the chats that passed the 100-sent-messages threshold
chat_identifiers_to_use = sum_is_from_me.loc[
    sum_is_from_me['is_from_me'] > 100, 'chat_identifier'
].tolist()
chat_identifiers_to_use

In [13]:
# requires manual annotation: give each number a name, and define a relationship for each name
# with imessage, you may have duplicates (several identifiers for one person), so this is helpful

# chat identifier -> person's name
number_to_name_map = {
    "+134289190384918319489134": "NAME1",
}

# person's name -> relationship label used in the prompts
name_to_relationship_map = {
    "NAME1": "Friend",
    "NAME2": "Parent",
    "NAME3": "Boss",
}

# map and add the name and relationship data to the message data
# .copy() makes prep_df an independent frame, so adding columns below does not
# operate on a slice of filtered_data (which triggers SettingWithCopyWarning)
prep_df = filtered_data[filtered_data.chat_identifier.isin(chat_identifiers_to_use)].copy()
# identifiers missing from the maps end up as NaN in the new columns
prep_df['name'] = prep_df['chat_identifier'].map(number_to_name_map)
prep_df['relationship'] = prep_df['name'].map(name_to_relationship_map)
prep_df

In [None]:
def _pair_messages_and_replies(turns):
    """
    Group consecutive texts by sender and pair each incoming message with my reply.

    Args:
        turns: iterable of (is_from_me, text) tuples, in conversation order.

    Returns:
        (messages, replies): two lists of equal length where replies[i] is what
        I sent right after receiving messages[i]. Consecutive texts from the
        same sender are joined with a newline.
    """
    # collapse the conversation into alternating runs of [sent_by_me, joined_text]
    runs = []
    for is_from_me, text in turns:
        sent_by_me = bool(is_from_me)
        if runs and runs[-1][0] == sent_by_me:
            runs[-1][1] = runs[-1][1] + "\n" + text
        else:
            runs.append([sent_by_me, text])

    # a thread that opens with my own text has nothing it replies to; keeping it
    # would shift every reply one position ahead of the message it answers
    if runs and runs[0][0]:
        runs = runs[1:]

    messages = [text for sent_by_me, text in runs if not sent_by_me]
    replies = [text for sent_by_me, text in runs if sent_by_me]

    # a trailing incoming message that I never answered has no reply to pair with
    return messages[:len(replies)], replies


name_result_dict = {}

# rows that actually carry text (the same filter applies to every person)
has_text = ~prep_df.text.isin(["None", None, ""])

for name in prep_df[prep_df.relationship == "Friend"]['name'].unique(): # only look at friends to start with
    # both masks are built on prep_df, so they align row for row
    subset_df = prep_df[(prep_df.name == name) & has_text]

    # we need to split these into questions and answers.
    messages, replies = _pair_messages_and_replies(
        zip(subset_df['is_from_me'], subset_df['text'])
    )

    # save results
    name_result_dict[name] = {"messages": messages, "replies": replies}

In [18]:
# NOTE: the variable keeps its original (misspelled) name because a later cell reads it.
intstructions_string = f"""
You are a chatbot trained on {your_name}'s message history. Your goal is to communicate in a tone and style that closely matches {your_name}'s natural way of speaking.
Please reply to the following message from your
"""

prompts = []


def prompt_template(comment, relationship, response):
    """
    Build one training example in the Mistral instruct format.

    Args:
        comment: the incoming message to reply to.
        relationship: relationship label of the sender (e.g. "Friend").
        response: the reply that was actually sent.

    Returns:
        str: the instruction block followed by the reply and the end-of-sequence tag.
    """
    return f'''<s>[INST] {intstructions_string} {relationship}: \n{comment} \n[/INST]\n''' + response + "</s>"


for name, conversation in name_result_dict.items():
    relationship = name_to_relationship_map[name]

    # zip stops at the shorter list, so a message without a matching reply is
    # skipped explicitly instead of relying on a swallowed exception
    for message, reply in zip(conversation['messages'], conversation['replies']):
        prompts.append({"text": prompt_template(message, relationship, reply)})


In [19]:
import os

# create test and val data
num_test = 10
num_val = 10
num_held_out = num_test + num_val

if len(prompts) < num_held_out:
    raise ValueError(
        f"Need at least {num_held_out} prompts to build the test and validation sets, found {len(prompts)}."
    )

# sample from every index: range(len(prompts)) makes the last prompt eligible too
test_val_index_list = random.sample(range(len(prompts)), num_held_out)

test_list = [prompts[index] for index in test_val_index_list[:num_test]]
val_list = [prompts[index] for index in test_val_index_list[num_test:]]

# whatever was not held out is the training data; filtering by index avoids a
# linear search per removed example
held_out_indices = set(test_val_index_list)
prompts = [prompt for index, prompt in enumerate(prompts) if index not in held_out_indices]


def _write_jsonl(path, records):
    """Write each record as one JSON object per line to the file at path."""
    with open(path, 'w') as output_file:
        for record in records:
            json.dump(record, output_file)
            output_file.write('\n')


### save data to files
os.makedirs('./data', exist_ok=True)  # the output folder may not exist yet
_write_jsonl('./data/train.jsonl', prompts)
_write_jsonl('./data/test.jsonl', test_list)
_write_jsonl('./data/valid.jsonl', val_list)

### Fine-Tuning Model w/ mlx 

In [None]:
# load the model from HF
# `load` comes from mlx_lm and is unpacked into the model and its tokenizer;
# model_path is the repo id chosen in the config cell.
model, tokenizer = load(model_path)

In [None]:
# try running inference without fine tuning
def prompt_builder(comment, relationship):
    """
    Build an inference prompt in the same format as the training examples.

    Args:
        comment: the incoming message to reply to.
        relationship: relationship label of the sender (e.g. "Friend").

    Returns:
        str: the instruction block, ready for the model to complete.
    """
    return f'''<s>[INST] {intstructions_string} {relationship}: \n{comment} \n[/INST]\n'''


# the builder needs the sender's relationship as well as the message itself
prompt = prompt_builder("Hey man, whats up?", "Friend")
response = generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens, verbose=True)

In [None]:
# create command for fine-tuning with QLoRA
# every value is kept as a string because the list is a command line
num_iters = "100"
steps_per_eval = "10"
val_batches = "-1"
learning_rate = "1e-5"
num_layers = "16"

command = [
    'python', 'scripts/lora.py',
    '--model', model_path,
    '--train',
    '--iters', num_iters,
    '--steps-per-eval', steps_per_eval,
    '--val-batches', val_batches,
    '--learning-rate', learning_rate,
    '--lora-layers', num_layers,
    # save the adapter under the same path the inference command loads it from
    '--adapter-file', adapter_path,
    '--test',
]
print(construct_shell_command(command)) # run this command in the terminal to generate the adapter file

In [None]:
# Build the prompt for the fine-tuned model, which should answer in your tone and style.
relationship = "friend"
# After experimenting, I found that adding a line of context improved coherence.
context = "This is your friend, Adam."

instructions_string = f"""
You are a chatbot trained on {your_name}'s message history. Your goal is to communicate in a tone and style that closely matches {your_name}'s natural way of speaking.

{context}

Please reply to the following message from your {relationship}:
"""


def prompt_builder(comment):
    """Wrap an incoming message in the instruction block defined above."""
    return f'''<s>[INST] {instructions_string} \n{comment} \n[/INST]\n'''


comment = "Hey buddy, do you want to hang out soon?"
prompt = prompt_builder(comment)

In [None]:
# assemble the inference call: same script, now pointed at the trained adapter
command = [
    'python', 'scripts/lora.py',
    '--model', model_path,
    '--adapter-file', adapter_path,
    '--max-tokens', max_tokens_str,
    '--prompt', prompt,
]

# execute it and print the output as it is produced
run_command_with_live_output(command)