In [None]:
# pip install nltk --upgrade

In [None]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

In [None]:
# Download these NLTK resources once if they are not already installed:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [2]:
# English stop words held in a set so the membership test used while
# filtering tokens is a hash lookup rather than a list scan.
STOP_WORDS = set(stopwords.words('english'))
# Single WordNet lemmatizer instance shared by the text-cleaning step.
lemmatizer = WordNetLemmatizer()

In [3]:
def parse_chat_log(file_path):
    """Split a chat transcript into user messages and AI messages.

    Every line that starts with ``User:`` or ``AI:`` contributes one message:
    the text after the prefix with surrounding whitespace removed. All other
    lines (blank lines, continuation lines, other prefixes) are ignored.

    Args:
        file_path: Path to a UTF-8 encoded text file (with or without a BOM).

    Returns:
        A ``(user_msgs, ai_msgs)`` tuple of two lists of strings, each in the
        order the messages appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    user_prefix, ai_prefix = "User:", "AI:"
    user_msgs = []
    ai_msgs = []

    # "utf-8-sig" decodes plain UTF-8 identically but also discards a leading
    # byte-order mark. With plain "utf-8" the BOM stays attached to the first
    # line, so an opening "User:" line fails the prefix test and is lost.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        # Iterate the handle directly; no need to hold every line in memory.
        for line in file:
            if line.startswith(user_prefix):
                user_msgs.append(line[len(user_prefix):].strip())
            elif line.startswith(ai_prefix):
                ai_msgs.append(line[len(ai_prefix):].strip())

    return user_msgs, ai_msgs

In [4]:
def clean_and_lemmatize(text_list):
    """Normalise messages into space-joined, lemmatized content words.

    For each input string: lower-case it, drop stop words, delete ASCII
    punctuation, tokenize with ``nltk.word_tokenize``, drop any remaining
    stop words, and lemmatize what is left.

    Args:
        text_list: Iterable of message strings.

    Returns:
        A list with one cleaned string per input message (possibly empty
        strings when nothing survives the filtering).
    """
    # The deletion table does not depend on the message; build it once.
    strip_punct = str.maketrans('', '', string.punctuation)

    cleaned = []
    for text in text_list:
        lowered = text.lower()
        # Stop words are matched first, while apostrophes are still present.
        # Deleting punctuation beforehand turns "don't" into "dont", which is
        # not in STOP_WORDS and would otherwise leak through as a keyword.
        # Only punctuation around each word is trimmed for this comparison.
        content_words = [
            word for word in lowered.split()
            if word.strip(string.punctuation) not in STOP_WORDS
        ]
        depunctuated = ' '.join(content_words).translate(strip_punct)
        tokens = nltk.word_tokenize(depunctuated)
        # Second pass catches stop words that only appear after punctuation
        # removal or tokenization.
        kept = [lemmatizer.lemmatize(token) for token in tokens if token not in STOP_WORDS]
        cleaned.append(' '.join(kept))
    return cleaned

In [5]:
def extract_keywords_tfidf(text_list, top_n=5):
    """Return the terms with the highest total TF-IDF weight.

    A ``TfidfVectorizer`` with default settings is fitted on ``text_list``;
    each term's weights are summed over all documents and the terms are
    ranked by that sum, highest first.

    Args:
        text_list: Iterable of document strings (one per message).
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` terms, best first. An empty list is
        returned when the input contains no usable terms.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other
            than an empty vocabulary.
    """
    documents = list(text_list)
    # An empty or all-blank corpus has no keywords. Answer directly instead
    # of letting the vectorizer raise its "empty vocabulary" ValueError.
    if not any(doc.strip() for doc in documents):
        return []

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # Non-blank text can still yield no terms (e.g. only one-character
        # tokens, which the default token pattern skips). Any other
        # ValueError is a real problem and is re-raised.
        if "empty vocabulary" not in str(exc):
            raise
        return []
    feature_names = vectorizer.get_feature_names_out()

    # Column sums give one aggregate weight per term across all documents.
    scores = tfidf_matrix.sum(axis=0).A1

    # sorted() is stable, so equally weighted terms keep the vectorizer's
    # feature order.
    ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]

In [6]:
def generate_topic_summary(keywords):
    """Build a one-sentence description of the chat from its keywords.

    Args:
        keywords: Sequence of keyword strings.

    Returns:
        A sentence listing the keywords: a single keyword on its own, two
        joined by "and", three or more comma-separated with a final
        ", and". Returns ``None`` when ``keywords`` is empty.
    """
    if not keywords:
        return None

    count = len(keywords)
    if count == 1:
        only = keywords[0]
        return f"The conversation was mainly about {only}."
    if count == 2:
        first, second = keywords[0], keywords[1]
        return f"The conversation was mainly about {first} and {second}."

    # Three or more: comma-separate all but the last, then ", and <last>".
    leading = ", ".join(keywords[:-1])
    final = keywords[-1]
    return f"The conversation was mainly about {leading}, and {final}."

In [9]:
def summarize_chat(file_path):
    """Print message counts, a topic sentence and top keywords for a chat log.

    The log is parsed into user and AI messages, the combined messages are
    cleaned and lemmatized, keywords are ranked with TF-IDF, and a short
    report is printed.

    Args:
        file_path: Path to the chat log text file.

    Returns:
        None. The report (or an error line when ``file_path`` is not a
        file) is written to standard output.
    """
    # isfile() rather than exists(): a directory path "exists" but cannot be
    # opened as a chat log, so report it like any other unusable path.
    if not os.path.isfile(file_path):
        print(f"Error: File not found at {file_path}")
        return

    user_msgs, ai_msgs = parse_chat_log(file_path)
    all_msgs = user_msgs + ai_msgs
    # Counts individual messages (user + AI), not user/AI pairs.
    total_exchanges = len(all_msgs)

    cleaned_msgs = clean_and_lemmatize(all_msgs)
    # Skip keyword extraction when no text survived cleaning (empty log or
    # only stop words); fitting TF-IDF on such input raises ValueError.
    if any(msg.strip() for msg in cleaned_msgs):
        top_keywords = extract_keywords_tfidf(cleaned_msgs)
    else:
        top_keywords = []

    # generate_topic_summary returns None for an empty keyword list; show a
    # readable placeholder instead of printing the literal "None".
    topic_summary = generate_topic_summary(top_keywords) or "No dominant topic found."
    keyword_text = ', '.join(top_keywords) if top_keywords else "(none)"

    print(f"Total exchanges        : {total_exchanges}")
    print(f"User messages          : {len(user_msgs)}")
    print(f"AI messages            : {len(ai_msgs)}")
    print(f"Nature of conversation : {topic_summary}")
    print(f"Most common keywords   : {keyword_text}")

In [10]:
# Entry point: summarise the sample transcript when this cell/script is run
# directly rather than imported.
if __name__ == "__main__":
    # Hard-coded absolute path under /kaggle/input; change it to run the
    # summary against a chat log stored elsewhere.
    chat_log_path = "/kaggle/input/simple-chatlog/Chat_log.txt"
    summarize_chat(chat_log_path)

Total exchanges        : 34
User messages          : 17
AI messages            : 17
Nature of conversation : The conversation was mainly about learning, machine, deep, unsupervised, and supervised.
Most common keywords   : learning, machine, deep, unsupervised, supervised
