<a href="https://colab.research.google.com/github/isaac-mackey/mind-uploading/blob/main/Template_SMS_XML_to_TXT_CSV_JSON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**How To Use**

1) Export your messages to an XML file with the "SMS Backup & Restore" Android app

2) Put the XML file in the top-level folder of your Google Drive

3) Run this notebook, setting `input_file_name` to the name of that XML file

4) Export to TXT, CSV, or JSON file

## XML helper library

In [None]:
!pip install lxml

## XML helper functions

In [None]:
from lxml import etree

def raw_extract_sms_from_xml(file_name):
    """Stream-parse an SMS backup XML file and collect its <sms> elements.

    Every <sms> element is reduced to a dict holding its 'address', 'date',
    'type', 'body' and 'contact_name' attributes, and the messages are also
    grouped by contact name.

    Args:
        file_name: XML source handed straight to etree.iterparse.

    Returns:
        A tuple (sms_messages, sent_messages, contacts):
          * sms_messages: list of message dicts in document order.
          * sent_messages: always an empty list; nothing here fills it.
          * contacts: dict keyed by contact name. Each value holds the
            'address' of the first message seen for that name, the list of
            that contact's 'messages', and the 'contact_name' itself.
    """
    attribute_names = ('address', 'date', 'type', 'body', 'contact_name')

    sms_messages = []
    sent_messages = []
    contacts = {}

    # Only the closing tag of each <sms> element is reported.
    parser_events = etree.iterparse(file_name, events=("end",), tag="sms")

    for _event, node in parser_events:
        record = {attr: node.get(attr) for attr in attribute_names}
        sms_messages.append(record)

        # File the message under its contact, creating the entry on first sight.
        name = record['contact_name']
        entry = contacts.setdefault(name, {
            'address': record['address'],
            'messages': [],
            'contact_name': name,
        })
        entry['messages'].append(record)

        # The data has been copied out, so drop the element's content ...
        node.clear()
        # ... and remove already-processed siblings from the parent so the
        # tree does not grow while streaming.
        while node.getprevious() is not None:
            del node.getparent()[0]

    # Release the parser once the whole document has been consumed.
    del parser_events

    return sms_messages, sent_messages, contacts

In [None]:
from datetime import datetime

def UNIX_timestamp_to_formatted_datetime(date):
    """Format a millisecond UNIX timestamp as a 'YYYY-MM-DD HH:MM:SS' UTC string.

    Args:
        date: milliseconds since the epoch, as an int or a numeric string
            (anything int() accepts).

    Returns:
        The timestamp rendered in UTC; the sub-second part is not shown.
    """
    # Imported here so the function carries the timezone name it needs even
    # when the surrounding cell only imported datetime.
    from datetime import datetime, timezone

    unix_seconds = int(date) / 1000  # milliseconds -> seconds
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; converting
    # with an explicit UTC tzinfo yields the same calendar fields.
    moment = datetime.fromtimestamp(unix_seconds, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def write_output_to_text_file(output_file_path, sms_messages, contacts, other_name="Isaac Mackey"):
    """Write a summary and per-contact, date-ordered chat logs to a text file.

    The file starts with the total message count, the contact count and the
    earliest/latest message times, followed by one section per contact.
    Contacts named '(Unknown)' (or without a name) are skipped.

    Args:
        output_file_path: destination path; the file is opened in 'w' mode,
            so existing content is replaced.
        sms_messages: list of message dicts with at least a 'date' key
            (millisecond UNIX timestamp as int or numeric string).
        contacts: dict of contact records, each with 'contact_name',
            'address' and 'messages' (a list of message dicts with 'date',
            'type' and 'body').
        other_name: label written in front of messages of type '2'; messages
            of type '1' are labelled with the contact's name.

    Returns:
        None. The result is the written file plus two progress prints.
    """
    def _millis(message):
        # Dates arrive as strings; compare them numerically so timestamps
        # with a different number of digits still order correctly.
        return int(message['date'])

    # SMS bodies routinely contain emoji and other non-ASCII text, so the
    # encoding is fixed instead of depending on the platform default.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        print("output_file_path" + output_file_path)
        file.write('Total SMS messages: ' + str(len(sms_messages)) + '\n')
        file.write('Contacts found: ' + str(len(contacts)) + '\n')

        # Work from the sms_messages argument (not a notebook global) and
        # skip the overall range when there are no messages at all.
        if sms_messages:
            earliest = min(sms_messages, key=_millis)['date']
            latest = max(sms_messages, key=_millis)['date']
            file.write('Earliest message: ' + UNIX_timestamp_to_formatted_datetime(earliest) + '\n')
            file.write('Latest message: ' + UNIX_timestamp_to_formatted_datetime(latest) + '\n')

        file.write('\n')

        for contact in contacts.values():
            contact_name = contact['contact_name']
            if contact_name is None or contact_name == '(Unknown)':
                continue

            # A missing address is written as an empty string.
            address = contact['address'] or ''
            file.write('Conversation with ' + contact_name + ' at ' + address + '\n')

            # Pad both speaker labels to the same width so bodies line up.
            max_length = max(len(contact_name), len(other_name))
            padded_contact_name = contact_name.ljust(max_length)
            padded_other_name = other_name.ljust(max_length)

            file.write(str(len(contact['messages'])) + " messages" + '\n')
            date_sorted_messages = sorted(contact['messages'], key=_millis)

            if date_sorted_messages:
                first_date = date_sorted_messages[0]['date']
                last_date = date_sorted_messages[-1]['date']
                file.write('Earliest message: ' + UNIX_timestamp_to_formatted_datetime(first_date) + '\n')
                file.write('Latest message: ' + UNIX_timestamp_to_formatted_datetime(last_date) + '\n')

            for message in date_sorted_messages:
                if message['type'] == '1':
                    speaker = padded_contact_name
                elif message['type'] == '2':
                    speaker = padded_other_name
                else:
                    # Any other message type is left out of the log.
                    continue

                formatted_date = UNIX_timestamp_to_formatted_datetime(message['date'])
                body = message['body'] or ''  # a missing body becomes an empty line
                file.write(formatted_date + ": " + speaker + ": " + body + '\n')
            file.write('\n')

    print('File closed')

## write_prompt_completion_pairs_to_csv

In [None]:
from datetime import datetime
import csv

# Function to check if a message is a question
def is_question(message):
    """Heuristically decide whether a message reads like a question.

    Ignoring surrounding whitespace, a message counts as a question when it
    ends with '?' or starts (case-insensitively) with one of the prefixes
    below. The prefix test is a plain string-prefix match, so a word such as
    "Island" also matches the "is" prefix.

    Args:
        message: the message text. None (a missing body) is treated as not a
            question; any other non-string value is converted with str().

    Returns:
        True if the message looks like a question, otherwise False.
    """
    question_indicators = (
        "who", "what", "where", "when", "why", "how", "is", "are", "was",
        "were", "do", "does", "did", "can", "could", "should", "would",
        "will", "whose", "which",
    )

    if message is None:
        return False

    # Coerce before calling string methods, and strip so leading/trailing
    # spaces or newlines do not hide the prefix or the final '?'.
    text = str(message).strip()
    return text.endswith('?') or text.lower().startswith(question_indicators)

def write_prompt_completion_pairs_to_csv(output_file_path, sms_messages, contacts):
    """Write (prompt, completion) pairs mined from each conversation to a CSV file.

    Within every contact's date-ordered messages, a pair is written when a
    message of type '1' that is_question() accepts is immediately followed by
    a message of type '2'. Contacts named '(Unknown)' are skipped.

    Args:
        output_file_path: destination path; must contain the text 'csv',
            otherwise a notice is printed and nothing is written.
        sms_messages: not used by this function; kept for a call signature
            parallel to the other writers.
        contacts: dict of contact records, each with 'contact_name' and
            'messages' (a list of message dicts with 'date', 'type', 'body').

    Returns:
        None. The result is the written file plus progress prints.
    """
    if 'csv' not in output_file_path:
        print("csv not found in",output_file_path)
        return

    # newline='' is what the csv module expects; utf-8 keeps emoji and other
    # non-ASCII message text intact regardless of the platform default.
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        print("writing to output_file_path:" + output_file_path)
        writer = csv.writer(csvfile)

        # Header row
        writer.writerow(['prompt','completion'])

        for contact in contacts.values():
            if contact['contact_name'] == '(Unknown)':
                continue

            # Order numerically: the dates are strings, and string order is
            # wrong when timestamps differ in length.
            date_sorted_messages = sorted(contact['messages'], key=lambda m: int(m['date']))

            # Walk consecutive (message, following message) pairs.
            for current_message, next_message in zip(date_sorted_messages, date_sorted_messages[1:]):
                if current_message['type'] != '1' or next_message['type'] != '2':
                    continue
                prompt = current_message['body']
                if prompt is None:
                    # A message without a body cannot be a prompt.
                    continue
                if is_question(prompt):
                    writer.writerow([prompt, next_message['body']])

    print('File closed')

## write_messages_to_role_system_user_format_json

In [None]:
from datetime import datetime
import json

def write_messages_to_role_system_user_format_json(output_file_path, sms_messages, contacts, system_message=None):
    """Write one chat-format JSON record per conversation, one record per line.

    For every contact (except '(Unknown)') the date-ordered messages become a
    list that starts with a "system" entry, followed by "user" entries for
    messages of type '1' and "assistant" entries for messages of type '2'.
    A conversation is written, as {"messages": [...]} on its own line, only
    when it contains at least one "assistant" entry.

    Args:
        output_file_path: destination path; must contain the text 'json',
            otherwise a notice is printed and nothing is written.
        sms_messages: not used by this function; kept for a call signature
            parallel to the other writers.
        contacts: dict of contact records, each with 'contact_name' and
            'messages' (a list of message dicts with 'date', 'type', 'body').
        system_message: content of the leading "system" entry; when None the
            notebook's built-in prompt is used.

    Returns:
        None. The result is the written file plus progress prints.
    """
    if 'json' not in output_file_path:
        print("json not in",output_file_path)
        return

    if system_message is None:
        system_message = ("You are a computer science PhD graduate in the Marine Corps"
                          " who wants to achieve immortality through mind-uploading."
                          " Be polite and formal. Do not apologize. Use correct grammar and avoid logic fallacies.")

    # Message type -> chat role; any other type is left out.
    role_by_type = {'1': "user", '2': "assistant"}

    with open(output_file_path, 'w', encoding='utf-8') as file:
        print("writing to output_file_path:", output_file_path)

        for contact in contacts.values():
            if contact['contact_name'] == '(Unknown)':
                continue

            # Order numerically: the dates are strings, and string order is
            # wrong when timestamps differ in length.
            date_sorted_messages = sorted(contact['messages'], key=lambda m: int(m['date']))

            conversation = [{"role": "system", "content": system_message}]
            assistant_present = False

            for message in date_sorted_messages:
                role = role_by_type.get(message['type'])
                if role is None:
                    continue
                conversation.append({"role": role, "content": message['body']})
                if role == "assistant":
                    assistant_present = True

            # A conversation without any "assistant" turn is not written.
            if assistant_present:
                file.write(json.dumps({'messages': conversation}) + '\n')

    print('File closed')

## Mount Google Drive

In [None]:
from google.colab import drive
import os

# TODO: find a way to automate this step (selenium?)
drive.mount('/content/drive')

# Show the top-level Drive entries whose name contains 'sms-'.
sms_backup_names = [entry for entry in os.listdir('/content/drive/MyDrive') if 'sms-' in entry]
for backup_name in sms_backup_names:
    print(backup_name)

## Load SMS XML files

In [None]:
from datetime import datetime
import os

# Print when this cell ran, in a human-readable format
load_cell_started_at = datetime.now()
print(load_cell_started_at.strftime("%Y-%m-%d %H:%M:%S"))

# Replace with the latest filename
input_file_name = 'sms-20210904143609.xml'
input_file_path = '/content/drive/My Drive/' + input_file_name

print('input_file_name:', input_file_name)
print('input_file_path:', input_file_path)

# Warn (without stopping) when the file is not among the top-level Drive entries
top_level_drive_entries = os.listdir('/content/drive/MyDrive')
if input_file_name not in top_level_drive_entries:
    print("Input file doesn't exist")

## Parse SMS XML into Python dictionary

In [None]:
from datetime import datetime

# Print when this cell ran, in a human-readable format
parse_cell_started_at = datetime.now()
print(parse_cell_started_at.strftime("%Y-%m-%d %H:%M:%S"))

# Parse the input file into the message list and the per-contact grouping
extraction_result = raw_extract_sms_from_xml(input_file_path)
sms_messages, sent_messages, contacts = extraction_result

print('input_file_name:', input_file_name)
print('input_file_path:', input_file_path)

# Report how much was extracted
total_message_count = len(sms_messages)
contact_count = len(contacts)
print('Total SMS messages:', total_message_count)
print('Contacts found:', contact_count)

In [None]:
# Only one export is parsed in this notebook, so the "combined" names that the
# text-output cell reads are plain aliases of the parsed results.
combined_sms_messages = sms_messages
# combined_contacts is read by the text-output cell but was never assigned,
# which raised NameError on a fresh Run All.
combined_contacts = contacts

In [None]:
# Display the number of messages held in combined_sms_messages
# (the notebook shows the value of a cell's last expression).
len(combined_sms_messages)

31016

In [None]:
# For every entry, print its dictionary key and then the contact_name stored in the record.
for contact_key, contact_record in contacts.items():
    print(contact_key)
    print(contact_record['contact_name'])

In [None]:
# Print the contact keys in sorted order, one per line.
sorted_contact_keys = sorted(contacts)
for contact_key in sorted_contact_keys:
    print(contact_key)

In [None]:
 # Examine contacts extracted from input file
pairs = sorted(
    ((record['contact_name'], record['address']) for record in contacts.values()),
    key=lambda name_and_address: name_and_address[0],
)
# The loop is kept as a hook for inspection; printing is currently disabled.
for pair in pairs:
    # print(pair)
    pass

## OUTPUT CONTACT-SORTED CHATLOGS TO TEXT FILE

In [None]:
from datetime import datetime

# Print when this cell ran, in a human-readable format
text_export_started_at = datetime.now()
print(text_export_started_at.strftime("%Y-%m-%d %H:%M:%S"))

print('input_file_name:', input_file_name)
print('input_file_path:', input_file_path)

# Report how much data is about to be exported
print('Total SMS messages:', len(combined_sms_messages))
print('Contacts found:', len(combined_contacts))

# The export is named after this fixed base name; note that this rebinds
# input_file_name for the cells that follow.
input_file_name = 'sms-combined.xml'
output_file_name = input_file_name + '-dated-messages.txt'
output_file_path = '/content/drive/My Drive/' + output_file_name

print('output_file_name:', output_file_name)
print('output_file_path:', output_file_path)

# Write only when no top-level Drive entry already has the output name
existing_drive_entries = os.listdir('/content/drive/MyDrive')
if output_file_name not in existing_drive_entries:
    write_output_to_text_file(output_file_path, combined_sms_messages, combined_contacts)
else:
    print("Output file already exists")

## OUTPUT PROMPT/COMPLETIONS TO CSV FILE

In [None]:
from datetime import datetime

# Print when this cell ran, in a human-readable format
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Path for the output CSV file
output_file_name = input_file_name+'-prompt-completion-1.csv'
output_file_path = '/content/drive/My Drive/'+output_file_name

print('output_file_name:',output_file_name)
print('output_file_path:',output_file_path)

if output_file_name in os.listdir('/content/drive/MyDrive'):
    print("Output file already exists")
else:
    # Write only when the name is free, as the text-output cell does, so an
    # existing export is not overwritten right after the warning.
    write_prompt_completion_pairs_to_csv(output_file_path, sms_messages, contacts)

## OUTPUT ROLE/SYSTEM/USER TO JSON FILE

In [None]:
from datetime import datetime

# Print when this cell ran, in a human-readable format
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Path for the output JSON file
input_file_name = 'sms-combined.xml'
output_file_name = input_file_name+'-role-system-user-7.json'
output_file_path = '/content/drive/My Drive/'+output_file_name

print('output_file_name:',output_file_name)
print('output_file_path:',output_file_path)

if output_file_name in os.listdir('/content/drive/MyDrive'):
    print("Output file already exists")
else:
    # Write only when the name is free, as the text-output cell does, so an
    # existing export is not overwritten right after the warning.
    write_messages_to_role_system_user_format_json(output_file_path, sms_messages, contacts)