In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; used as the sub-folder that collects its log files.
current_file_name = "10_Zero_Words_Analysis"

# One log file per run, named after the moment the cell was executed.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file immediately and does not create missing parent
# folders, so make sure the directory exists before configuring logging.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Apply seaborn's default styling to every figure drawn below.
sns.set()
# Render notebook figures at 100 dpi so they show up reasonably large.
plt.rcParams.update({"figure.dpi": 100})

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Let pandas print up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

In [None]:
def get_words_dataset(response):
    """Build a per-word DataFrame from a transcription response.

    Parameters
    ----------
    response : dict
        Parsed response containing a ``"words"`` entry: a list of records that
        each provide at least ``"start"`` and ``"end"`` values.

    Returns
    -------
    pandas.DataFrame
        One row per word with an added ``"articulation_duration"`` column
        (``end - start``). When ``"words"`` is empty, an empty frame with the
        columns ``start``, ``end`` and ``articulation_duration`` is returned.

    Raises
    ------
    KeyError
        If ``response`` has no ``"words"`` entry, or the word records lack
        ``"start"`` / ``"end"``.
    """
    words_df = pd.DataFrame(response["words"])

    if words_df.empty:
        # An empty word list yields a frame without any columns, so the
        # subtraction below would fail with KeyError; hand back an empty frame
        # that still exposes the columns callers rely on.
        return pd.DataFrame(columns=["start", "end", "articulation_duration"])

    words_df["articulation_duration"] = words_df["end"] - words_df["start"]

    return words_df

In [None]:
def get_dict_of_paths(root_path, file_extension=".json"):
    """Collect files below ``root_path`` grouped by the folder that holds them.

    Parameters
    ----------
    root_path : str
        Directory that is walked recursively.
    file_extension : str, optional
        Only file names ending with this suffix are kept (default ``".json"``).

    Returns
    -------
    dict
        Maps a folder's own name (last path component) to the sorted list of
        matching file paths inside it. A folder that contains files but none
        with the requested suffix maps to an empty list. Folders that share
        the same name overwrite each other; the one visited last wins.
    """
    dict_of_paths = {}
    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue

        # os.walk yields names in arbitrary order; sorting keeps the result
        # reproducible and keeps parallel folder listings in the same order.
        matching = sorted(
            os.path.join(root, name) for name in files if name.endswith(file_extension)
        )

        # os.path handles the platform's separators (and a trailing one),
        # unlike splitting on a literal backslash.
        folder_name = os.path.basename(os.path.normpath(root))
        dict_of_paths[folder_name] = matching
    return dict_of_paths

In [None]:
# Folders with the combined-chunk transcripts (FG and H sets).
# Built with os.path.join so the separator matches the platform instead of
# being hard-coded as a backslash.
extracted_transcripts_fg_path = os.path.join("data", "7_3_Combine_Chunks", "FG")
extracted_transcripts_h_path = os.path.join("data", "7_3_Combine_Chunks", "H")

# Folders with the Google transcripts of the same two sets.
extracted_transcripts_fg_path_google = os.path.join("data", "7_Elaborations_Transcripts", "FG_Google")
extracted_transcripts_h_path_google = os.path.join("data", "7_Elaborations_Transcripts", "H_Google")

In [None]:
# Collect the JSON files of both sets, grouped by containing folder.
fg_paths, h_paths = map(
    get_dict_of_paths,
    (extracted_transcripts_fg_path, extracted_transcripts_h_path),
)

In [None]:
def get_zero_stats(dict_of_paths):
    """Summarise zero-duration words for every transcript file.

    Parameters
    ----------
    dict_of_paths : dict
        Maps a key to a list of JSON file paths (the shape returned by
        ``get_dict_of_paths``). Only the path lists are used.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``respondent`` (name of the file's
        parent folder), ``elaboration`` (file name without its last 14
        characters), ``zero_stats`` (number of words whose articulation
        duration is exactly 0), ``length`` (number of words) and
        ``zero_precentage`` (``zero_stats / length * 100``; NaN when the
        transcript has no words).
    """
    # Number of trailing file-name characters dropped to get the elaboration id.
    # NOTE(review): presumably the length of "_response.json" - confirm.
    suffix_length = 14
    columns = ["respondent", "elaboration", "zero_stats", "length", "zero_precentage"]

    records = []
    for paths in dict_of_paths.values():
        for file in paths:
            # Context manager so the handle is closed as soon as parsing is done.
            # NOTE(review): the platform default encoding is used - confirm it
            # matches how the files were written.
            with open(file) as handle:
                response = json.load(handle)
            words_df = get_words_dataset(response)

            # os.path copes with the platform's separators, unlike a literal
            # backslash split.
            respondent = os.path.basename(os.path.dirname(file))
            elaboration = os.path.basename(file)[:-suffix_length]

            total_words = words_df.shape[0]
            zero_words = int((words_df["articulation_duration"] == 0).sum())

            records.append({
                "respondent": respondent,
                "elaboration": elaboration,
                "zero_stats": zero_words,
                "length": total_words,
                # A transcript without words has no defined percentage.
                "zero_precentage": zero_words / total_words * 100 if total_words else float("nan"),
            })

    # Passing the columns keeps them present even when no file was processed.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Compute the zero-duration statistics for both sets.
fg_stats, h_stats = map(get_zero_stats, (fg_paths, h_paths))

In [None]:
# Reshape to one row per respondent, with one column per elaboration for each
# of the two reported values.
fg_stats_pivot = pd.pivot(
    fg_stats,
    index="respondent",
    columns="elaboration",
    values=["zero_precentage", "length"],
)
h_stats_pivot = pd.pivot(
    h_stats,
    index="respondent",
    columns="elaboration",
    values=["zero_precentage", "length"],
)

In [None]:
# Show the per-respondent pivot (zero_precentage and length per elaboration) for the FG set.
fg_stats_pivot

In [None]:
# Show the per-respondent pivot (zero_precentage and length per elaboration) for the H set.
h_stats_pivot

In [None]:
# Mask the FG pivot with the condition "zero_precentage > 20" (threshold in percent).
# NOTE(review): the condition is a DataFrame taken from the top column level, so
# this looks like cell-wise masking rather than row filtering - confirm the
# displayed result is the intended one.
fg_stats_pivot[fg_stats_pivot["zero_precentage"] > 20]

In [None]:
# Mask the H pivot with the condition "zero_precentage > 20" (threshold in percent).
# NOTE(review): the condition is a DataFrame taken from the top column level, so
# this looks like cell-wise masking rather than row filtering - confirm the
# displayed result is the intended one.
h_stats_pivot[h_stats_pivot["zero_precentage"] > 20]

In [None]:
# Collect the .txt transcripts from the combined-chunk folders and from the
# Google folders, for both sets.
(
    fg_paths_openai_txt,
    h_paths_openai_txt,
    fg_paths_google_txt,
    h_paths_google_txt,
) = (
    get_dict_of_paths(folder, file_extension=".txt")
    for folder in (
        extracted_transcripts_fg_path,
        extracted_transcripts_h_path,
        extracted_transcripts_fg_path_google,
        extracted_transcripts_h_path_google,
    )
)

In [None]:
def merge_dicts(openai_dict, google_dict):
    """Pair the OpenAI and Google path lists of every key in ``openai_dict``.

    Paths ending with ``_response.txt`` are dropped from both lists.

    Parameters
    ----------
    openai_dict : dict
        Maps a key to a list of paths; its keys drive the result.
    google_dict : dict
        Maps the same keys to lists of paths.

    Returns
    -------
    dict
        ``{key: {"openai": [...], "google": [...]}}`` with freshly built lists.

    Raises
    ------
    KeyError
        If a key of ``openai_dict`` is missing from ``google_dict``.
    """
    def without_responses(paths):
        # Keep everything except the "_response.txt" files.
        return [path for path in paths if not path.endswith("_response.txt")]

    return {
        key: {
            "openai": without_responses(openai_paths),
            "google": without_responses(google_dict[key]),
        }
        for key, openai_paths in openai_dict.items()
    }

In [None]:
# Pair the OpenAI and Google transcript lists of both sets.
fg_paths_txt, h_paths_txt = map(
    merge_dicts,
    (fg_paths_openai_txt, h_paths_openai_txt),
    (fg_paths_google_txt, h_paths_google_txt),
)

In [None]:
def get_simmilarity(text1, text2):
    """Return the cosine similarity of the two texts' token-count vectors.

    Both texts are vectorised together with a default ``CountVectorizer`` and
    the off-diagonal entry of the resulting 2x2 similarity matrix is returned.
    """
    token_counts = CountVectorizer().fit_transform([text1, text2])
    similarity_matrix = cosine_similarity(token_counts.toarray())
    # Entry [0][1] compares the first text with the second one.
    return similarity_matrix[0][1]


In [None]:
def compare_openai_google(dict_of_paths):
    """Compare the OpenAI and Google transcript of every elaboration.

    Parameters
    ----------
    dict_of_paths : dict
        Maps a respondent to ``{"openai": [paths], "google": [paths]}`` (the
        shape returned by ``merge_dicts``).

    Returns
    -------
    pandas.DataFrame
        One row per compared pair with the columns ``respondent``,
        ``elaboration`` (OpenAI file name without its last 4 characters),
        ``openai`` and ``google`` (the file contents) and ``simmilarity``
        (result of ``get_simmilarity``).
    """
    columns = ["respondent", "elaboration", "openai", "google", "simmilarity"]

    records = []
    for respondent, sources in dict_of_paths.items():
        openai_files = sources["openai"]
        google_files = sources["google"]

        # Files are paired by position. zip() stops at the shorter list, so a
        # length mismatch would otherwise drop transcripts without any trace.
        # NOTE(review): positional pairing assumes both lists are in the same
        # order - confirm the two folders use matching file names.
        if len(openai_files) != len(google_files):
            logging.warning(
                "Respondent %s: %d OpenAI vs %d Google transcripts; unpaired files are skipped",
                respondent, len(openai_files), len(google_files),
            )

        for openai_file, google_file in zip(openai_files, google_files):
            # Context managers close the handles right after reading.
            with open(openai_file, "r") as handle:
                openai_text = handle.read()
            with open(google_file, "r") as handle:
                google_text = handle.read()

            # os.path copes with the platform's separators; the slice removes
            # the last 4 characters (the ".txt" suffix for .txt files).
            elaboration = os.path.basename(openai_file)[:-4]

            records.append({
                "respondent": respondent,
                "elaboration": elaboration,
                "openai": openai_text,
                "google": google_text,
                "simmilarity": get_simmilarity(openai_text, google_text),
            })

    # Passing the columns keeps them present even when nothing was compared.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Run the OpenAI/Google comparison for both sets.
fg_compared, h_compared = map(compare_openai_google, (fg_paths_txt, h_paths_txt))

In [None]:
# Show up to 200 characters of each text cell when pandas renders a DataFrame.
pd.options.display.max_colwidth = 200

In [None]:
# The ten FG pairs with the lowest similarity.
fg_compared.sort_values("simmilarity").iloc[:10]

In [None]:
# Build a list of [respondent, elaboration] pairs for the FG files flagged as
# damaged, i.e. whose similarity is below 0.7.
damaged_files_fg = (
    fg_compared
    .loc[fg_compared["simmilarity"] < 0.7, ["respondent", "elaboration"]]
    .to_numpy()
    .tolist()
)
print(len(damaged_files_fg))
damaged_files_fg

In [None]:
# The ten H pairs with the lowest similarity.
h_compared.sort_values("simmilarity").iloc[:10]

In [None]:
# Build a list of [respondent, elaboration] pairs for the H files flagged as
# damaged, i.e. whose similarity is below 0.7.
damaged_files_h = (
    h_compared
    .loc[h_compared["simmilarity"] < 0.7, ["respondent", "elaboration"]]
    .to_numpy()
    .tolist()
)
print(len(damaged_files_h))
damaged_files_h