In [18]:
import sys
import re
from collections import Counter

In [19]:

def parse_chat_log(file_path):
    messages = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith("User:"):
                message = line[len("User:"):].strip()
                messages.append({"speaker": "User", "message": message})
            elif line.startswith("AI:"):
                message = line[len("AI:"):].strip()
                messages.append({"speaker": "AI", "message": message})
    return messages


In [20]:
# Directly specify the file path
file_path = r"C:\Users\User\Desktop\thesis\qtech_project\ai-chat-log-summarizer\chat.txt"

# Call the function and print the result
messages = parse_chat_log(file_path)
print(messages)

[{'speaker': 'User', 'message': 'Hi, what is Python?'}, {'speaker': 'AI', 'message': 'Python is a programming language.'}, {'speaker': 'User', 'message': 'What can I do with it?'}, {'speaker': 'AI', 'message': 'You can build websites, analyze data, and more.'}]


In [23]:
# Define common stop words to exclude from keyword analysis
stop_words = set([
    "the", "is", "and", "to", "a", "in", "it", "of", "for", "on", "that", "with", "as", "at", "by", "from", "about",
    "how", "what", "when", "where", "who", "which", "why", "can", "you", "i", "me", "my", "mine", "your", "yours",
    "he", "she", "we", "they", "them", "this", "those", "am", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "do", "does", "did", "will", "would", "shall", "should", "may", "might", "must", "could"
])



# Calculate message statistics
number_of_exchanges = len([m for m in messages if m["speaker"] == "User"])

# Keyword extraction
all_text = ' '.join([m["message"] for m in messages])
words = [word.lower() for word in re.split(r'\W+', all_text) if word and word.lower() not in stop_words]
word_counts = Counter(words)
top_keywords = [word for word, count in word_counts.most_common(5)]

# Print results
print(f"Number of exchanges: {number_of_exchanges}")
print(f"Top keywords: {top_keywords}")

Number of exchanges: 2
Top keywords: ['python', 'hi', 'programming', 'language', 'build']


In [24]:
# Generate summary
summary = (
    "Summary:\n"
    f"- The conversation had {number_of_exchanges} exchanges.\n"
    f"- The conversation was about {', '.join(top_keywords)}.\n"
    f"- Most common keywords: {', '.join(top_keywords)}"
)
print(summary)

Summary:
- The conversation had 2 exchanges.
- The conversation was about python, hi, programming, language, build.
- Most common keywords: python, hi, programming, language, build


TF-IDF Solution

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

messages = parse_chat_log(file_path)
all_text = [' '.join([m["message"] for m in messages])]
vectorizer = TfidfVectorizer(stop_words=list(stop_words), max_features=5)
tfidf_matrix = vectorizer.fit_transform(all_text)
top_keywords = vectorizer.get_feature_names_out()
print(f"Top keywords: {top_keywords}")


Top keywords: ['analyze' 'build' 'data' 'hi' 'python']


In [28]:
total_messages = len(messages)
user_messages = len([m for m in messages if m["speaker"] == "User"])
ai_messages = len([m for m in messages if m["speaker"] == "AI"])
print(f"Total messages: {total_messages}")
print(f"User messages: {user_messages}")
print(f"AI messages: {ai_messages}")

Total messages: 4
User messages: 2
AI messages: 2
