<a href="https://colab.research.google.com/github/isaac-mackey/mind-uploading/blob/main/SMS_Labeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and configuration

In [None]:
%%capture
# %%capture (must stay the first line of the cell) suppresses the output produced by this cell.
from google.colab import drive
# Mount Google Drive under /content/drive; later cells read and write files below that path.
drive.mount('/content/drive')
# drive.mount('/content/drive', force_remount=True)
# # Save authentication token for future use.
# !cp /content/drive/MyDrive/Colab\ Notebooks/token.json ~/.config/Google/DriveFS/credentials.json

# Install the OpenAI SDK into the runtime before importing it.
# NOTE(review): the version is not pinned, and no `client = OpenAI(...)` is created here
# even though later cells call `client.beta...` — confirm where the client is meant to be built.
!pip install openai
from openai import OpenAI
import json


def show_json(obj):
    """Render `obj` as parsed JSON through the notebook's rich `display`.

    `obj` must provide a `model_dump_json()` method returning a JSON string;
    that string is parsed back into plain Python containers before display.
    """
    serialized = obj.model_dump_json()
    parsed = json.loads(serialized)
    display(parsed)

## Load SMS from raw XML

In [None]:
!pip install lxml
from lxml import etree

def raw_extract_sms_from_xml(file_name):
    """Stream-parse an SMS backup XML file and collect its `<sms>` elements.

    Args:
        file_name: Path of the XML file handed to `etree.iterparse`.

    Returns:
        A 3-tuple `(sms_messages, sent_messages, contacts)`:
          * sms_messages: one dict per `<sms>` element, in file order, holding the
            attributes 'address', 'date', 'type', 'body' and 'contact_name'
            (None for an attribute the element lacks).
          * sent_messages: always an empty list; it is never filled by this function.
          * contacts: dict keyed by contact name. Each value holds the 'address' of
            the first message seen for that name, the list of that name's
            'messages', and the 'contact_name' itself.
    """
    fields = ('address', 'date', 'type', 'body', 'contact_name')
    sms_messages, sent_messages, contacts = [], [], {}

    # Only "end" events for <sms> tags are requested, so each yielded node is complete.
    for _event, node in etree.iterparse(file_name, events=("end",), tag="sms"):
        record = {field: node.get(field) for field in fields}
        sms_messages.append(record)

        # Group the message under its contact, creating the entry on first sight.
        name = record['contact_name']
        entry = contacts.setdefault(
            name,
            {'address': record['address'], 'messages': [], 'contact_name': name},
        )
        entry['messages'].append(record)

        # The attributes have been copied out, so release the node's memory ...
        node.clear()
        # ... and drop already-processed siblings still referenced by the parent.
        while node.getprevious() is not None:
            del node.getparent()[0]

    return sms_messages, sent_messages, contacts



In [None]:
from datetime import datetime
# Print when this cell ran, in a human-readable format.
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# --- First backup -----------------------------------------------------------
# Replace with the latest filename
input_file_name = 'sms-20210904143609.xml'
print('input_file_name:',input_file_name)

input_file_path = '/content/drive/My Drive/'+input_file_name
print('input_file_path:',input_file_path)

# Extract contents of input file
sms_messages, sent_messages, contacts = raw_extract_sms_from_xml(input_file_path)

print('Total SMS messages:', len(sms_messages))
print('Contacts found:', len(contacts))

# --- Second backup ----------------------------------------------------------
input_file_name_2 = 'sms-20231224020008.xml'
print('input_file_name:',input_file_name_2)

input_file_path_2 = '/content/drive/My Drive/'+input_file_name_2
print('input_file_path:',input_file_path_2)

# Extract contents of input file
sms_messages_2, sent_messages_2, contacts_2 = raw_extract_sms_from_xml(input_file_path_2)

print('Total SMS messages:', len(sms_messages_2))
print('Contacts found:', len(contacts_2))

# --- Combined view ----------------------------------------------------------
print()
print("combined")
# Plain concatenation: a message present in both backups appears twice.
combined_sms_messages = sms_messages + sms_messages_2

print('Total SMS messages:', len(combined_sms_messages))

# Distinct contact names across both backups. dict key views cannot be joined with
# `+` in Python 3 (TypeError), so union the key sets instead. The result gets its own
# name so the `contacts` dict, which the merge cell still needs, is not overwritten.
combined_contact_names = set(contacts) | set(contacts_2)
print('Contacts found:', len(combined_contact_names))

In [None]:
# Number of distinct contact names across the two backups.
len(set(contacts) | set(contacts_2))

197

In [None]:
# Merge the per-contact dictionaries of both backups into `combined_contacts`,
# keyed by contact name. First-backup contacts come first (in their original
# order), followed by contacts that only exist in the second backup.
combined_contacts = {}

for key, first_entry in contacts.items():
    contact_name = first_entry['contact_name']
    if key not in contacts_2:
        # Only in the first backup: reuse its entry unchanged.
        combined_contacts[contact_name] = first_entry
        continue
    # In both backups: keep the first backup's address and concatenate the
    # message lists (first backup's messages, then the second's).
    combined_contacts[contact_name] = {
        'address': first_entry['address'],
        'messages': first_entry['messages'] + contacts_2[key]['messages'],
        'contact_name': contact_name
    }

for key, second_entry in contacts_2.items():
    if key in contacts:
        continue  # already merged above
    contact_name = second_entry['contact_name']
    combined_contacts[contact_name] = second_entry

In [None]:
# Sanity check: the combined messages are held in a plain list.
type(combined_sms_messages)

list

In [None]:
# Spot-check one parsed message record.
# NOTE(review): the rendered output shows a real name and phone number — clear it before sharing.
combined_sms_messages[107]

{'address': '9158677121',
 'date': '1597246984712',
 'type': '2',
 'body': "I'm outside 302, could you let me in?",
 'contact_name': 'Andy Vargas'}

In [None]:
# Sanity check: the merged contacts are held in a dict keyed by contact name.
type(combined_contacts)

dict

In [None]:
# Show which fields a merged contact entry carries (raises KeyError if this name is absent).
print(combined_contacts['Nick Grubb'].keys())

dict_keys(['address', 'messages', 'contact_name'])


## Save contacts json to Drive

In [None]:
import json

# Destination of the merged contacts inside the mounted Drive.
combined_contacts_path = '/content/drive/MyDrive/combined_contacts-2024-04-07.json'

# Serialise the merged contacts to a JSON string first ...
json_string = json.dumps(combined_contacts)

# ... then write that *string* as JSON again. The file therefore holds a
# double-encoded document (a JSON string whose content is JSON); the loading
# cell relies on this shape, so it is kept as is.
with open(combined_contacts_path, 'w') as outfile:
    outfile.write(json.dumps(json_string))

## Open saved contacts json into dictionary

In [None]:
# Read the saved contacts back from Drive. The save cell writes a double-encoded
# document (a JSON string that itself contains JSON), so the first decode usually
# yields a str rather than a dict.
with open('/content/drive/MyDrive/combined_contacts-2024-04-07.json', 'r') as infile:
    loaded_contacts = json.load(infile)

# Decode the inner document with json.loads instead of eval(): eval executes
# whatever code the file contains and raises NameError on JSON literals such as
# null/true/false (e.g. a message without a body). A file holding a plain JSON
# object (single-encoded) is accepted as well.
if isinstance(loaded_contacts, str):
    contacts = json.loads(loaded_contacts)
else:
    contacts = loaded_contacts

# Keep `combined_contacts` bound to the usable dict rather than to the raw string.
combined_contacts = contacts

In [None]:
# Confirm that the reloaded contacts decoded to a dict.
print(type(contacts))

<class 'dict'>


In [None]:
# Preview the first five (name, entry) pairs of the reloaded contacts.
for i, x in enumerate(contacts.items(), start=1):
    print(x)
    if i == 5:
        break

In [None]:
from datetime import datetime

def UNIX_timestamp_to_formatted_datetime(date):
    """Format a millisecond UNIX timestamp as a UTC 'YYYY-MM-DD HH:MM:SS' string.

    Args:
        date: Milliseconds since the UNIX epoch, as an int or anything `int()`
            accepts (the SMS records carry it as a decimal string).

    Returns:
        The corresponding UTC date and time formatted with '%Y-%m-%d %H:%M:%S'.

    Raises:
        ValueError / TypeError: if `date` cannot be converted with `int()`.
    """
    # Imported locally so the helper keeps working even when the global name
    # `datetime` has been rebound elsewhere in the notebook (e.g. by `import datetime`).
    from datetime import datetime, timezone

    unix_seconds = int(date) / 1000  # milliseconds -> seconds
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
    moment = datetime.fromtimestamp(unix_seconds, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# Display name of the phone's owner (kept for cells that label sent messages).
my_name = "Isaac Mackey"

# contact name -> {'YYYY-MM-DD' (UTC day): [messages of that day, oldest first]}.
# Every contact starts with an empty mapping; skipped contacts keep it.
contact_date_dict = {c:{} for c in contacts}

for contact in contacts.values():
    contact_name = contact['contact_name']
    # Messages whose contact name is the '(Unknown)' placeholder are not grouped.
    if contact_name == '(Unknown)':
        continue
    print(contact_name)

    # Sort on the numeric timestamp; comparing the raw strings is only correct
    # while every timestamp has the same number of digits.
    date_sorted_messages = sorted(contact['messages'], key=lambda m: int(m['date']))

    this_contact_date_dict = {}
    for message in date_sorted_messages:
        # Reuse the shared formatter and keep only its 'YYYY-MM-DD' prefix.
        formatted_date = UNIX_timestamp_to_formatted_datetime(message['date'])[:10]
        this_contact_date_dict.setdefault(formatted_date, []).append(message)

    contact_date_dict[contact_name] = this_contact_date_dict

In [None]:
# Spot-check the messages grouped under one contact and day (raises KeyError if absent).
contact_date_dict['Jonathan Harrison']['2020-08-05']

In [None]:
# Flatten contact -> day -> messages into one "conversation" record per
# (contact, day). 'content' is the day's message bodies joined with '. ' in the
# order they are stored. A message without a body (None) contributes an empty
# string instead of making str.join raise TypeError.
conversations = [
    {
        'contact_name': contact_name,
        'date': date,
        'content': '. '.join((message['body'] or '') for message in day_messages),
    }
    for contact_name, messages_by_date in contact_date_dict.items()
    for date, day_messages in messages_by_date.items()
]

In [None]:
# Report how many (contact, day) conversations were built and show the first one.
print(f"len(conversations) {len(conversations)}")
print(conversations[0])

In [None]:
# Every distinct day on which any contact has messages, in first-seen order.
# dict.fromkeys de-duplicates in linear time; the previous `date not in list`
# check rescanned the whole list for every date.
all_dates = list(dict.fromkeys(
    date
    for dates_of_contact in contact_date_dict.values()
    for date in dates_of_contact
))

# (contact name, number of distinct days with messages) for every contact.
contact_unique_days = [
    (contact_name, len(dates_of_contact))
    for contact_name, dates_of_contact in contact_date_dict.items()
]
# Ascending stable sort followed by a reversal: most active contacts first.
# (Reversing afterwards, rather than passing reverse=True, keeps the original
# ordering among contacts with equal counts.)
sorted_contact_unique_days = sorted(contact_unique_days, key=lambda pair: pair[1])
sorted_contact_unique_days.reverse()
print("Contact Name and Number of Days I've Texted Them From August 2020 to December 2023")
for x in sorted_contact_unique_days:
    print(x)

## SMS labeling (without function tools)

In [None]:
pure_system_message = "You are an assistant that identifies and extracts keywords of relevance from text message conversations."

# No cell in this notebook creates the API client, so on a fresh kernel `client`
# would be undefined here. Build it only when it does not exist yet, so that a
# client configured by hand earlier in the session is left untouched.
# OpenAI() with no arguments takes the key from the OPENAI_API_KEY environment
# variable and raises if it is not set; never hard-code the key in the notebook.
if 'client' not in globals():
    client = OpenAI()

# Assistant used for keyword labeling: system instructions only, no tools and no files.
my_assistant = client.beta.assistants.create(
    instructions=pure_system_message,
    name="Diary Entry Labeling Assistant",
    tools=[],
    model='gpt-3.5-turbo-1106',
    #model="gpt-4-turbo-preview",
    file_ids= [] #[files_to_retrieve]
)

In [None]:
import time
import random

def wait_on_run(run, thread, poll_interval=0.5):
    """Poll a run until it leaves the "queued" / "in_progress" states.

    Args:
        run: Run object exposing `.status` and `.id`.
        thread: Thread object exposing `.id`; the run is looked up on this thread.
        poll_interval: Seconds to sleep before each status refresh (default 0.5,
            the previously hard-coded value).

    Returns:
        The most recently retrieved run object, whose status is no longer
        "queued" or "in_progress". A run that is already in another state is
        returned unchanged without any request.
    """
    print("wait on run")
    while run.status in ("queued", "in_progress"):
        # Sleep *before* refreshing so the function returns as soon as a
        # finished status is seen, instead of sleeping once more afterwards.
        time.sleep(poll_interval)
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
    return run

# contact name -> {date -> raw assistant answer text}
created_conversation_labels = {}

i = 0

for conversation in conversations:
    print('i:',str(i),'out of',str(len(conversations)))
    i += 1
    contact_name = conversation['contact_name']
    date = conversation['date']

    # A fresh thread is created for every conversation.
    thread = client.beta.threads.create()

    content = "Return a Python list of a few keywords that could identify or classify the following conversation: "+conversation['content']

    message = client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=content,
        file_ids=[] #files_to_retrieve
    )

    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=my_assistant.id
    )

    run = wait_on_run(run, thread)

    if run.status == 'completed' or run.status == 'requires_action':
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )

        # Walk the listed messages in reverse order and concatenate the text of
        # those written by the assistant, each prefixed with a space.
        answer = ""
        for reply in reversed(list(messages)):
            if not reply.assistant_id:
                continue
            # Only the first content block is used. Guard against an assistant
            # message with no content, or whose first block carries no text,
            # instead of failing the whole labeling run on it.
            first_block = reply.content[0] if reply.content else None
            text_part = getattr(first_block, 'text', None)
            if text_part is None:
                print("assistant message without text content; skipped")
                continue
            answer += ' '+text_part.value

        created_conversation_labels.setdefault(contact_name, {})[date] = answer
    else:
        # Any other final status: report it and leave this conversation unlabeled.
        print("run.status")
        print(run.status)

2024-04-08: 2790 out of 5191

## Save labels

In [None]:
dict_name = "contact-dates-labels-json-gpt3-sms-1"

# Import the module under an alias. A bare `import datetime` rebinds the global
# name `datetime`, which other cells bind to the class via
# `from datetime import datetime`; re-running those cells afterwards would fail.
import datetime as dt

# Current time at a fixed UTC-4 offset (the offset does not follow daylight saving).
# Ask for an aware "now" directly: utcnow() returns a naive value that astimezone()
# interprets as *local* time, which is only right when the machine runs on UTC.
eastern_now_str = dt.datetime.now(dt.timezone(dt.timedelta(hours=-4))).strftime("%Y_%m_%d_%H:%M:%S")
conversation_date = eastern_now_str

# Create the file name.
file_name = f"{dict_name}_{conversation_date}"

print('json file_name',file_name)

import json

json_string = json.dumps(created_conversation_labels)

# Save the JSON string to a file in Google Drive.
with open(f"/content/drive/My Drive/SMS Chatbot/{file_name}.json", "w") as f:
    f.write(json_string)

## Load label dictionary from saved json file

In [None]:
# Reload a previously saved label dictionary (contact -> date -> raw answer)
# from Drive. Point `file_name` at the run that should be restored.
import json

file_name = "contact-dates-labels-json-gpt3-sms-1_2024_04_08_10:38:22.json"
labels_path = f"/content/drive/My Drive/SMS Chatbot/{file_name}"

with open(labels_path, "r") as f:
    created_conversation_labels = json.load(f)

In [None]:
import re

def extract_keywords_numbered(text):
    """Pull the items out of a numbered list such as "1. Foo".

    Every occurrence of digits, a dot and whitespace starts an item; the item
    is the rest of that line with surrounding whitespace removed.

    Returns:
        The items in order of appearance (empty list when nothing matches).
    """
    numbered_item = re.compile(r'\d+\.\s+(.*)')
    return [match.group(1).strip() for match in numbered_item.finditer(text)]

def extract_keywords_dashed(text):
    """Pull the items out of a dash-bulleted list ("- Foo").

    Only a dash that starts a line (optionally after spaces or tabs) counts as
    a bullet. Hyphens inside words or in the middle of a line are ignored, so
    text such as "['follow-up', 'meeting']" is no longer mistaken for a list
    and cut at the hyphen.

    Returns:
        The non-empty items in order of appearance, stripped of surrounding
        whitespace (empty list when the text has no dash bullets).
    """
    bullet_item = re.compile(r'^[ \t]*-[ \t]*(.*)', re.MULTILINE)
    items = (match.group(1).strip() for match in bullet_item.finditer(text))
    # An empty bullet ("-" alone on a line) carries no keyword.
    return [item for item in items if item]

def extract_keywords_quotes(text):
    """Pull quoted items out of a list-like answer such as ['Zoom', "Santa Barbara"].

    An item is text enclosed in a matching pair of single or double quotes.
    Unlike a plain word match, multi-word items and double-quoted items are
    kept. A quote directly attached to a word character (the apostrophe in
    "I'm" or "don't") does not open or close an item, and an item may not
    contain quotes, commas, square brackets or line breaks.

    Returns:
        The non-empty items in order of appearance, stripped of surrounding
        whitespace (empty list when nothing matches).
    """
    quoted_item = re.compile(r"""(?<!\w)(['"])([^'"\[\],\n]+?)\1(?!\w)""")
    items = (match.group(2).strip() for match in quoted_item.finditer(text))
    return [item for item in items if item]

def extract_keywords_brackets(text):
    """Split text into word tokens and selected punctuation tokens.

    A token is either a run of word characters and apostrophes, or a single
    one of the characters . , ! ? ; — all other characters act as separators.

    Returns:
        The tokens in order of appearance (empty list when nothing matches).
    """
    token = re.compile(r"[\w']+|[.,!?;]")
    return [match.group(0).strip() for match in token.finditer(text)]

def extract_keywords(text):
    """Extract keywords from an answer by trying several list formats in turn.

    The extractors are tried in this order: numbered list, dashed list, quoted
    items, word tokens. The first extractor that yields a non-empty result
    containing none of the forbidden entries wins. An extractor that raises is
    skipped (best effort).

    Returns:
        The winning extractor's keyword list, or [] when none qualifies.
    """
    # Entries that indicate an extractor picked up prose or separators
    # rather than actual keywords.
    forbidden_keywords = ["I'm", "conversation", "keywords", "Keywords", ","]
    extractors = (
        extract_keywords_numbered,
        extract_keywords_dashed,
        extract_keywords_quotes,
        extract_keywords_brackets,
    )

    for extractor in extractors:
        try:
            candidates = extractor(text)
        except Exception:
            # Deliberately ignored: fall through to the next format.
            continue
        if not candidates:
            continue
        if any(bad in candidates for bad in forbidden_keywords):
            continue
        return candidates

    return []

# Example usage: a sample answer containing a numbered keyword list, used to
# check that the numbered-list extractor returns the twelve list items.
entry = """2011/07/01 (Friday)
Based on the provided diary entry, the following keywords could be used to identify or classify the entry:

1. Cycling
2. Adventure
3. Railroad tracks
4. Mile-long straight aways
5. Bonding
6. Gas station
7. Parenting
8. Rules
9. Mistakes
10. Disappearance
11. Responsibility
12. Death"""

# The header and introduction lines contain no "<digits>. " pattern, so only the list items match.
keywords = extract_keywords_numbered(entry)
print(keywords)

In [None]:
# Upper bound on the number of keywords kept per conversation.
MAX_LABELS_PER_CONVERSATION = 6

# contact name -> {date -> list of at most MAX_LABELS_PER_CONVERSATION keywords},
# parsed from the raw assistant answers. The former debug counter started at 4
# and was compared with 3, so its early `break`s could never fire; it is removed
# and every contact and date is processed, exactly as before.
list_label_dict = {
    contact_name: {
        date: extract_keywords(raw_answer)[:MAX_LABELS_PER_CONVERSATION]
        for date, raw_answer in answers_by_date.items()
    }
    for contact_name, answers_by_date in created_conversation_labels.items()
}

In [None]:
# Quality check on list_label_dict: how many (contact, date) entries ended up
# with no keywords at all, and how many are not lists.
all_label_entries = [
    labels
    for dates in list_label_dict.values()
    for labels in dates.values()
]

empty_lists = sum(1 for labels in all_label_entries if type(labels) is list and len(labels) == 0)
non_lists = sum(1 for labels in all_label_entries if not isinstance(labels, list))

print("Number of empty lists:", empty_lists)
print("Number of non-list entries:", non_lists)

In [None]:
# List the first 10 (contact, date) pairs whose keyword list is empty, in
# dictionary iteration order.
empty_list_dates = [
    (contact_name, date)
    for contact_name, dates in list_label_dict.items()
    for date, labels in dates.items()
    if type(labels) is list and len(labels) == 0
][:10]

print("First 10 dates with empty lists:")
for contact_name,date in empty_list_dates:
    print(contact_name,date)

In [None]:
# Spot-check the extracted keywords for a few hard-coded dates of one contact.
# Any contact or date missing from list_label_dict raises KeyError.
print(list_label_dict['Jonathan Harrison']['2022-05-30'])

y = list_label_dict['Jonathan Harrison']['2021-05-28']

print(y)

# Three further dates of the same contact.
for x in ['2020-09-30', '2020-10-01', '2020-10-04']:
    print(list_label_dict['Jonathan Harrison'][x])

In [None]:
# prompt: determine the most common labels in the double dictionary list_label_dict

from collections import Counter

# Tally how often each label occurs across every contact and date.
counter = Counter(
    label
    for dates in list_label_dict.values()
    for labels in dates.values()
    for label in labels
)

# The ten most frequent labels with their counts.
most_common_labels = counter.most_common(10)

for label, count in most_common_labels:
    print(f"{label}: {count}")

Seattle: 43
Tomorrow: 41
Santa Barbara: 34
Call: 32
Zoom: 32
Time: 29
Thanks: 28
Chat: 28
Email: 27
Weekend: 27


In [None]:
# prompt: map each label to the date and contact that has that label

# Inverted index: label -> list of (contact_name, date) pairs carrying that label.
label_to_date_contact = {}
for contact_name, dates in list_label_dict.items():
    for date, labels in dates.items():
        for label in labels:
            label_to_date_contact.setdefault(label, []).append((contact_name, date))

# Preview: the first 5 labels, each with at most 10 of its (contact, date) pairs.
for label, date_contacts in list(label_to_date_contact.items())[:5]:
    print(f"{label}:")
    for contact_name, date in date_contacts[:10]:
        print(f"  - {contact_name}: {date}")
    print()

In [None]:
# Look up the (contact, date) pairs recorded for one label (raises KeyError if absent).
label_to_date_contact['Watermaker']

[('Jonathan Harrison', '2020-08-04')]

## Save label_to_date_contact to Google Drive

In [None]:
import json

# Destination of the label index inside the mounted Drive.
label_index_path = '/content/drive/MyDrive/SMS Chatbot/label_to_date_contact-2024-04-08.json'

# Serialise the index; JSON has no tuple type, so the (contact, date) tuples
# are written as two-element arrays.
json_string = json.dumps(label_to_date_contact)

with open(label_index_path, 'w') as outfile:
    outfile.write(json_string)

## Load label_to_date_contact dict

In [None]:
# Read the label index back from Drive. After the JSON round trip each
# (contact, date) pair is a two-element list rather than a tuple.
saved_label_index_path = '/content/drive/MyDrive/SMS Chatbot/label_to_date_contact-2024-04-08.json'
with open(saved_label_index_path, 'r') as f:
    loaded_label_to_date_contact = json.load(f)

In [None]:
# Look up one label in the reloaded index (raises KeyError if the label is absent).
loaded_label_to_date_contact['Marines']

## Print and search labels

In [None]:
# Rank the labels of the reloaded index by how many (contact, date) pairs each has.
label_counts = {
    label: len(contact_date_pairs)
    for label, contact_date_pairs in loaded_label_to_date_contact.items()
}

# Labels ordered from most to fewest pairs.
sorted_labels = sorted(label_counts, key=lambda label: label_counts[label], reverse=True)

# Print the k best-populated labels.
k = 20  # Change this value to adjust the number of labels to print.
for label in sorted_labels[:k]:
    print(f"Label: {label}, Dates: {label_counts[label]}")

Label: Seattle, Dates: 43
Label: Tomorrow, Dates: 41
Label: Santa Barbara, Dates: 34
Label: Call, Dates: 32
Label: Zoom, Dates: 32
Label: Time, Dates: 29
Label: Thanks, Dates: 28
Label: Chat, Dates: 28
Label: Email, Dates: 27
Label: Weekend, Dates: 27
Label: Thursday, Dates: 25
Label: Isaac, Dates: 25
Label: Address, Dates: 25
Label: Wednesday, Dates: 24
Label: Sunday, Dates: 24
Label: Rowing, Dates: 22
Label: Dinner, Dates: 22
Label: tomorrow, Dates: 21
Label: California, Dates: 21
Label: YouTube, Dates: 21


In [None]:
def remove_duplicates(list_of_pairs):
  """Return the distinct pairs of `list_of_pairs` as tuples, in first-seen order.

  Args:
    list_of_pairs: Iterable of pairs (tuples or lists) with hashable elements.

  Returns:
    A list of tuples without repeats. Unlike a conversion through `set`, the
    order of first appearance is kept, so the result is deterministic.
  """
  # dict preserves insertion order, which makes fromkeys an ordered de-duplication.
  return list(dict.fromkeys(tuple(pair) for pair in list_of_pairs))

import random

keyword_list = []

top_k = 10
random_k = 10

# Suggest some labels to search for: a random pick among the most frequent
# labels plus a random pick among all labels. The sample sizes are capped so
# that a small label set does not make random.sample raise ValueError.
popular_labels = sorted_labels[:top_k*3]
suggested_labels = (
    random.sample(popular_labels, min(top_k, len(popular_labels)))
    + random.sample(sorted_labels, min(random_k, len(sorted_labels)))
)
for label in suggested_labels:
    print(f"Label: {label}, Dates: {label_counts[label]}")

# Collect keywords until a blank line is entered. The blank line only ends the
# loop; it is not stored as a keyword.
while True:
    print("\nEnter a keyword to get the related journal entries: (enter blank to stop)")
    keyword = input()
    if keyword == "":
        break
    entry_count = len(loaded_label_to_date_contact.get(keyword, []))
    print("You entered: ", keyword,',',str(entry_count),'entries')
    keyword_list.append(keyword)

# Gather the (contact, date) pairs of every entered keyword, then drop repeats.
contacts_dates_selected = []
for keyword in keyword_list:
    contacts_dates_selected.extend(loaded_label_to_date_contact.get(keyword, []))

contacts_dates_selected  = remove_duplicates(contacts_dates_selected)

# Width of the separator line printed after each conversation.
window_size = 80

if not contacts_dates_selected:
    print("No entries found for "+' '.join(keyword_list))
else:
    for contact_name, date in contacts_dates_selected:
        print("Contact:", contact_name)
        print("Date:", date)
        print()
        for sms in contact_date_dict[contact_name][date]:
            # 'type' is stored as a string. '2' marks a message sent from this
            # phone, so it is attributed to the owner; anything else goes to the
            # contact. (Comparing against the integer 1 never matched.)
            speaker = "Isaac Mackey" if sms['type'] == '2' else contact_name
            print(speaker+":",sms['body'])
        print('#'*window_size)
        print()