In [None]:
import logging
import os
from datetime import datetime

current_file_name = "9_Transcripts_Analysis"

# One timestamped log file per run, grouped under logs/<notebook name>/.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and raises FileNotFoundError when
# the parent directory is missing, so make sure the directory exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

import nltk
from collections import Counter
import string

from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap import UMAP

In [None]:
# Models needed by nltk.word_tokenize and nltk.pos_tag.
# Recent NLTK releases look these models up under the new ids "punkt_tab" and
# "averaged_perceptron_tagger_eng", older releases under the original ids.
# Requesting both keeps the notebook runnable on either; an id unknown to the
# installed NLTK version is expected to be reported and skipped (TODO confirm
# against the NLTK version pinned for this project).
for nltk_resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Let wide frames render with up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

In [None]:
def get_dict_of_paths(root_path):
    """Map each folder below ``root_path`` to the CSV files it directly contains.

    ``root_path`` is walked recursively. Every directory that holds at least
    one file (of any type) gets an entry; its value is the sorted list of paths
    to the ``.csv`` files in that directory, which is empty when the directory
    only holds non-CSV files.

    Args:
        root_path: Directory to scan.

    Returns:
        dict mapping a directory's own name (its last path component) to the
        list of CSV paths inside it. Directories with the same name in
        different subtrees share one key; the one visited last wins.
    """
    dict_of_paths = {}
    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        # Sorted so the result does not depend on the file-system listing order.
        csv_paths = sorted(
            os.path.join(root, file_name)
            for file_name in files
            if file_name.endswith(".csv")
        )
        # basename/normpath instead of splitting on "\\": works with any path
        # separator and with a trailing separator on root_path.
        folder_name = os.path.basename(os.path.normpath(root))
        dict_of_paths[folder_name] = csv_paths
    return dict_of_paths

In [None]:
# Built with os.path.join so the notebook is not tied to Windows path
# separators; on Windows the resulting strings are the same as before.
extracted_transcripts_fg_path = os.path.join("data", "8_Transcripts_Processing_GPT", "FG")
extracted_transcripts_h_path = os.path.join("data", "8_Transcripts_Processing_GPT", "H")

In [None]:
# Index the extracted transcript CSVs of each variant by their containing folder.
fg_paths, h_paths = map(
    get_dict_of_paths,
    (extracted_transcripts_fg_path, extracted_transcripts_h_path),
)

In [None]:
def merge_transcripts(dict_of_paths):
    """Read every CSV listed in ``dict_of_paths`` and stack them into one frame.

    Args:
        dict_of_paths: Mapping whose values are lists of CSV file paths (the
            keys are not used). Files are read with ``~`` as field separator.

    Returns:
        pandas.DataFrame with the rows of all files, in mapping order, indexed
        0..n-1.

    Raises:
        ValueError: if the mapping lists no files at all.
    """
    frames = [
        pd.read_csv(file, sep="~")
        for files in dict_of_paths.values()
        for file in files
    ]
    if not frames:
        raise ValueError("No transcript CSV files to merge: dict_of_paths lists no files.")
    # Every file starts counting rows at 0; ignore_index gives the merged frame
    # unique row labels so later label-based alignment cannot mix rows up.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load all transcripts of each variant.
fg_transcripts = merge_transcripts(fg_paths)
h_transcripts = merge_transcripts(h_paths)

# Stack both variants into a single table. ignore_index gives every row a
# unique label; without it the labels of the two inputs repeat, which makes
# later label-based alignment (e.g. column-wise concat) fragile.
data = pd.concat([fg_transcripts, h_transcripts], ignore_index=True)

In [None]:
# Show which columns the merged transcript table provides.
data.columns

In [None]:
# Abbreviation	Meaning
# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective (large)
# JJR	adjective, comparative (larger)
# JJS	adjective, superlative (largest)
# LS	list marker
# MD	modal (could, will)
# NN	noun, singular (cat, tree)
# NNS	noun plural (desks)
# NNP	proper noun, singular (sarah)
# NNPS	proper noun, plural (indians or americans)
# PDT	predeterminer (all, both, half)
# POS	possessive ending (parent's)
# PRP	personal pronoun (hers, herself, him, himself)
# PRP$	possessive pronoun (her, his, mine, my, our )
# RB	adverb (occasionally, swiftly)
# RBR	adverb, comparative (better)
# RBS	adverb, superlative (best)
# RP	particle (about)
# TO	infinitive marker (to)
# UH	interjection (goodbye)
# VB	verb (ask)
# VBG	verb gerund (judging)
# VBD	verb past tense (pleaded)
# VBN	verb past participle (reunified)
# VBP	verb, present tense not 3rd person singular(wrap)
# VBZ	verb, present tense with 3rd person singular (bases)
# WDT	wh-determiner (that, what)
# WP	wh- pronoun (who)
# WRB	wh- adverb (how)

In [None]:
def calculate_nltk_metrics(text):
    """Compute simple lexical statistics for one transcript.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; counts are derived from the tag prefixes.

    Args:
        text: Transcript text. Anything that is not a ``str`` (e.g. the NaN
            pandas uses for a missing value) is treated as an empty text.

    Returns:
        dict with the keys
        ``noun_count`` (tags starting with NN, PRP or WP, so pronouns are
        included here as well as in ``pronoun_count``),
        ``verb_count`` (VB*), ``adj_count`` (JJ*), ``adv_count`` (RB*),
        ``pronoun_count`` (PRP*),
        ``punctuation_count`` (tokens made up only of punctuation characters),
        ``total_words`` (number of tokens, punctuation tokens included),
        ``unique_words`` (distinct tokens, case-sensitive),
        ``average_word_length`` and ``lexical_diversity`` (both 0.0 for an
        empty text), and ``most_frequent_words:`` (up to ten
        ``(token, count)`` pairs; the trailing colon in the key is kept because
        other cells read the column under that exact name).
    """
    # Missing transcripts are not strings; analyse them as empty text instead
    # of letting the tokenizer fail on them.
    if not isinstance(text, str):
        text = ""

    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Part-of-speech tags, one per token (nothing to tag for an empty text)
    pos_tags = nltk.pos_tag(words) if words else []
    tags = [tag for _word, tag in pos_tags]

    noun_count = sum(tag.startswith(('NN', 'PRP', 'WP')) for tag in tags)
    verb_count = sum(tag.startswith('VB') for tag in tags)
    adj_count = sum(tag.startswith('JJ') for tag in tags)
    adv_count = sum(tag.startswith('RB') for tag in tags)
    pronoun_count = sum(tag.startswith('PRP') for tag in tags)

    # A token counts as punctuation when every character is a punctuation
    # character. A plain `word in string.punctuation` is a substring test and
    # misses multi-character tokens such as "...", "``" or "''".
    punctuation_count = sum(
        1 for word in words
        if word and all(char in string.punctuation for char in word)
    )

    total_words = len(words)
    unique_words = len(set(words))

    # Guard the ratios: an empty transcript has no tokens to divide by.
    if total_words:
        average_word_length = sum(len(word) for word in words) / total_words
        lexical_diversity = unique_words / total_words
    else:
        average_word_length = 0.0
        lexical_diversity = 0.0

    word_freq = Counter(words).most_common(10)  # Top 10 most frequent tokens

    return {
        "noun_count": noun_count,
        "verb_count": verb_count,
        "adj_count": adj_count,
        "adv_count": adv_count,
        "pronoun_count": pronoun_count,
        "punctuation_count": punctuation_count,
        "total_words": total_words,
        "unique_words": unique_words,
        "average_word_length": average_word_length,
        "lexical_diversity": lexical_diversity,
        "most_frequent_words:": word_freq
    }

In [None]:
# One row of lexical metrics per transcript, aligned with `data` by index.
applied_data = data.apply(lambda row: calculate_nltk_metrics(row["transcript"]), axis='columns', result_type='expand')

# Drop metric columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise re-running the cell would add every
# metric column a second time and `data["noun_count"]` would stop being a Series.
data = pd.concat(
    [data.drop(columns=applied_data.columns, errors='ignore'), applied_data],
    axis='columns',
)

In [None]:
def sanitaze_response(data, column, possible_answers):
    """Collapse free-text GPT answers in ``column`` onto the allowed labels.

    GPT sometimes answers with a whole sentence and sometimes with a single
    word. Full stops are removed, then every value that contains one of
    ``possible_answers`` (case-insensitive, plain substring match) is replaced
    by that answer. Answers are applied in list order and a replaced value is
    re-checked against the following answers, so for a response mentioning
    several labels the outcome depends on the order of ``possible_answers``.
    Because matching is by substring, a short label such as "No" also matches
    inside longer words (e.g. "not", "unknown").

    Args:
        data: DataFrame holding the responses; it is modified in place.
        column: Name of the text column to sanitize.
        possible_answers: Allowed labels, in the order they are applied.

    Returns:
        The same ``data`` object, with ``column`` sanitized. Missing values
        and values matching no answer are left untouched (apart from the
        removed full stops).
    """
    # regex=False: the full stop must be removed literally. Under regex
    # semantics (the default before pandas 2.0) "." matches every character
    # and the whole column would be blanked.
    data[column] = data[column].str.replace(".", "", regex=False)

    for answer in possible_answers:
        # na=False keeps missing responses out of the mask (a mask containing
        # NaN makes .loc raise); regex=False treats the label as plain text.
        matches = data[column].str.contains(answer, case=False, regex=False, na=False)
        data.loc[matches, column] = answer

    return data

In [None]:
# Allowed labels per GPT-assessed column; the columns are sanitized in this order.
allowed_answers_by_column = {
    "relevant": ["Yes", "No"],
    "quality": ["Good", "Average", "Poor"],
    "honesty": ["Yes", "No"],
    "tone": ["Positive", "Neutral", "Negative"],
    "language_complexity": ["Simple", "Average", "Complex"],
    "linguistic_cues": ["Yes", "No"],
    "defensiveness": ["Yes", "No"],
    "contradictions": ["Yes", "No"],
    "consistency": ["Yes", "No"],
    "intent": ["Informative", "Evasive", "Defensive"],
}

for column_name, allowed_answers in allowed_answers_by_column.items():
    data = sanitaze_response(data, column_name, allowed_answers)

In [None]:
# Persist the merged, sanitized table ("~" separated, like the input files).
merged_transcripts_path = os.path.join("data", "9_Transcripts_Analysis", "merged_transcripts.csv")
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(merged_transcripts_path), exist_ok=True)
data.to_csv(merged_transcripts_path, index=False, sep="~")

In [None]:
# Plot pie charts (pandas/matplotlib) of the 'relevant', 'quality', 'honesty', 'tone', 'language_complexity', 'linguistic_cues', 'defensiveness', 'contradictions', 'consistency', 'intent' columns from dataframe data, with variants FG and H side by side in one figure.
# TODO: share one legend between the two subplots (not implemented yet).

def plot_pie_chart(data, column_name, title):
    """Show the value shares of ``column_name`` as two pies: FG left, H right.

    Args:
        data: DataFrame with a ``variant`` column and the column to plot.
        column_name: Categorical column whose value counts are drawn.
        title: Title placed above both pies.
    """
    fig, (fg_ax, h_ax) = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle(title)

    for ax, variant in ((fg_ax, "FG"), (h_ax, "H")):
        shares = data.loc[data["variant"] == variant, column_name].value_counts()
        shares.plot.pie(ax=ax, autopct="%.1f%%")
        ax.set_title(variant)

    plt.show()

# (column, figure title) pairs, drawn in this order.
pie_chart_specs = [
    ("relevant", "Relevant"),
    ("quality", "Quality"),
    ("honesty", "Honesty"),
    ("tone", "Tone"),
    ("language_complexity", "Language Complexity"),
    ("linguistic_cues", "Linguistic Cues"),
    ("defensiveness", "Defensiveness"),
    ("contradictions", "Contradictions"),
    ("consistency", "Consistency"),
    ("intent", "Intent"),
]
for pie_column, pie_title in pie_chart_specs:
    plot_pie_chart(data, pie_column, pie_title)

In [None]:
# Plot bar chart of the 'relevant', 'quality', 'honesty', 'tone', 'language_complexity', 'linguistic_cues', 'defensiveness', 'contradictions', 'consistency', 'intent' columns from dataframe data using the 'seaborn' library, for variant FG and H separately side by side.

def plot_bar_chart(data, column_name, title):
    """Show the value counts of ``column_name`` as two bar charts: FG left, H right.

    Both panels list the categories in the same order (most frequent first
    over all rows of ``data``), so a bar at a given position means the same
    category in both panels; a category absent from one variant shows as an
    empty slot there.

    Args:
        data: DataFrame with a ``variant`` column and the column to plot.
        column_name: Categorical column whose counts are drawn.
        title: Title placed above both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle(title)

    # Without a shared order each panel sorts by first appearance in its own
    # subset, which makes the two panels hard to compare.
    category_order = data[column_name].value_counts().index.tolist()

    for ax, variant in zip(axes, ("FG", "H")):
        sns.countplot(
            data=data[data["variant"] == variant],
            x=column_name,
            order=category_order,
            ax=ax,
        )
        ax.set_title(variant)

    plt.show()

# (column, figure title) pairs, drawn in this order.
bar_chart_specs = [
    ("relevant", "Relevant"),
    ("quality", "Quality"),
    ("honesty", "Honesty"),
    ("tone", "Tone"),
    ("language_complexity", "Language Complexity"),
    ("linguistic_cues", "Linguistic Cues"),
    ("defensiveness", "Defensiveness"),
    ("contradictions", "Contradictions"),
    ("consistency", "Consistency"),
    ("intent", "Intent"),
]
for bar_column, bar_title in bar_chart_specs:
    plot_bar_chart(data, bar_column, bar_title)

In [None]:
def plot_histogram(data, column_name, title):
    """Show the distribution of ``column_name`` as two histograms: FG left, H right.

    Args:
        data: DataFrame with a ``variant`` column and the column to plot.
        column_name: Numeric column to histogram.
        title: Title placed above both panels.
    """
    fig, (fg_ax, h_ax) = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle(title)

    for ax, variant in ((fg_ax, "FG"), (h_ax, "H")):
        variant_values = data.loc[data["variant"] == variant, column_name]
        sns.histplot(variant_values, ax=ax)
        ax.set_title(variant)

    plt.show()

# (metric column, figure title) pairs, drawn in this order.
histogram_specs = [
    ("noun_count", "Noun Count"),
    ("verb_count", "Verb Count"),
    ("adj_count", "Adjective Count"),
    ("adv_count", "Adverb Count"),
    ("pronoun_count", "Pronoun Count"),
    ("punctuation_count", "Punctuation Count"),
    ("total_words", "Total Words"),
    ("unique_words", "Unique Words"),
    ("average_word_length", "Average Word Length"),
    ("lexical_diversity", "Lexical Diversity"),
]
for histogram_column, histogram_title in histogram_specs:
    plot_histogram(data, histogram_column, histogram_title)

In [None]:
# Kernel density estimates of one metric, FG and H overlaid on the same axes.
def plot_kde(data, column_name, title):
    """Overlay the density curves of ``column_name`` for variants FG and H.

    Args:
        data: DataFrame with a ``variant`` column and the column to plot.
        column_name: Numeric column whose density is estimated.
        title: Title placed above the plot.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    fig.suptitle(title)

    for variant in ("FG", "H"):
        variant_values = data.loc[data["variant"] == variant, column_name]
        sns.kdeplot(variant_values, label=variant, ax=ax)

    ax.legend()
    plt.show()

# (metric column, figure title) pairs, drawn in this order.
kde_specs = [
    ("noun_count", "Noun Count"),
    ("verb_count", "Verb Count"),
    ("adj_count", "Adjective Count"),
    ("adv_count", "Adverb Count"),
    ("pronoun_count", "Pronoun Count"),
    ("punctuation_count", "Punctuation Count"),
    ("total_words", "Total Words"),
    ("unique_words", "Unique Words"),
    ("average_word_length", "Average Word Length"),
    ("lexical_diversity", "Lexical Diversity"),
]
for kde_column, kde_title in kde_specs:
    plot_kde(data, kde_column, kde_title)

In [None]:
# Pairwise scatter/distribution grid of the numeric text metrics, with the
# variants (FG / H) distinguished by colour within a single grid.
pairplot_metric_columns = [
    'noun_count',
    'verb_count',
    'adj_count',
    'adv_count',
    'pronoun_count',
    'punctuation_count',
    'total_words',
    'unique_words',
    'average_word_length',
    'lexical_diversity',
]
sns.pairplot(data, hue="variant", vars=pairplot_metric_columns)

In [None]:
# Top 10 most frequent words in the transcripts of one variant (one figure per call).
def plot_most_frequent_words(data, variant):
    """Bar chart of the ten words with the highest summed counts for ``variant``.

    The counts are the sum of the per-row ``(word, count)`` lists stored in the
    ``most_frequent_words:`` column, so a word only contributes from rows where
    it appears in that row's list.

    Args:
        data: DataFrame with ``variant`` and ``most_frequent_words:`` columns;
            each cell of the latter is an iterable of ``(word, count)`` pairs.
        variant: Variant label to plot, e.g. "FG" or "H".
    """
    # Sum the per-row counts directly; expanding every row into a pd.Series
    # first would build a rows x vocabulary frame only to add it up again.
    totals = Counter()
    for word_counts in data.loc[data["variant"] == variant, "most_frequent_words:"]:
        totals.update(dict(word_counts))
    top_words = pd.Series(dict(totals.most_common(10)))

    # A single full-width axes: the chart has only one panel, so reserving a
    # 1x2 grid would leave the right half of the figure blank.
    fig, ax = plt.subplots(figsize=(10, 5))
    fig.suptitle(f"Top 10 most frequent words in {variant}")
    top_words.plot.bar(ax=ax)
    ax.set_title(variant)

    plt.show()

# One chart per variant.
for variant_label in ("FG", "H"):
    plot_most_frequent_words(data, variant_label)

In [None]:
def string_to_list(row):
    """Parse a bracketed, comma-separated string of numbers into a list of floats.

    All ``[`` and ``]`` characters are removed, the rest is split on commas and
    each non-blank piece is converted with ``float``. Blank pieces are skipped,
    so an empty list (``"[]"``) yields ``[]`` and a trailing comma is tolerated.

    Args:
        row: String such as ``"[0.1, -2, 3.5]"``.

    Returns:
        list of float.

    Raises:
        ValueError: if a non-blank piece is not a valid float literal.
    """
    without_brackets = row.replace("[", "").replace("]", "")
    # Skip blank pieces: float("") raises, which made "[]" crash.
    return [float(piece) for piece in without_brackets.split(",") if piece.strip()]

In [None]:
# Turn each serialized embedding string into a list of floats, then preview the table.
embedding_strings = data["embedding"]
data["embedding_list"] = embedding_strings.map(string_to_list)
data.head()

In [None]:
# Cluster the transcript embeddings into three groups.
# random_state makes the cluster assignment repeatable between runs; n_init is
# set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(data["embedding_list"].tolist())

In [None]:
# Project the embeddings to 2-D for plotting.
# UMAP is stochastic; a fixed random_state keeps the layout the same between
# runs (at the cost of UMAP's multi-threaded speed-ups).
reducer = UMAP(random_state=42)
embeddings_2d = reducer.fit_transform(data["embedding_list"].tolist())

In [None]:
# Columns shown in the hover tooltip of every point.
hover_columns = [
    "variant",
    "respondent",
    "elaboration_name",
    "relevant",
    "quality",
    "honesty",
    "tone",
    "language_complexity",
    "linguistic_cues",
    "defensiveness",
    "contradictions",
    "consistency",
    "intent",
]

fig = px.scatter(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    # Cluster ids are categories, not quantities: as strings Plotly draws them
    # with discrete colours and a legend instead of a continuous colour bar.
    color=kmeans.labels_.astype(str),
    hover_data={column: data[column] for column in hover_columns},
    labels={"x": "UMAP 1", "y": "UMAP 2", "color": "cluster"},
)
fig.show()