In [1]:
# This script compares embedded radio transcripts and fact-checks,
# returning a CSV file of potential misinformation

# ensure libretranslate is running on port 5000 before running this script

import pandas as pd
import os
import numpy as np
from pathlib import Path
from openai import OpenAI

# The OpenAI API key is read from the OPENAI_API_KEY environment variable only.
# SECURITY: a literal key used to be embedded on this line as a fallback. Treat
# that key as compromised and revoke it; never commit credentials to a notebook.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

print('Importing transcript json...')

# The transcript JSON is resolved relative to the parent of the current
# working directory, then loaded into a DataFrame.
transcript_json = Path.cwd().parent / 'data/output_json/transcript.json'
transcript_df = pd.read_json(transcript_json)


Importing transcript json...


In [2]:
print('Organizing data...')

# Explode the 'transcription' column so each entry gets its own row, renumber
# the rows from 0, and keep only the file name and the segment text.
exploded_transcript_df = (
    transcript_df
    .explode('transcription')
    .reset_index(drop=True)
    [['file_name', 'transcription']]
)

Organizing data...


In [3]:
# Text of the preceding row's segment; the first row has no predecessor and
# gets a single space. The shift is not grouped by file, so a neighbour can
# belong to a different file_name.
exploded_transcript_df = exploded_transcript_df.assign(
    transcription_before=exploded_transcript_df['transcription'].shift(periods=1, fill_value=' ')
)

In [4]:
# Text of the following row's segment; the last row has no successor and
# gets a single space. The shift is not grouped by file, so a neighbour can
# belong to a different file_name.
exploded_transcript_df = exploded_transcript_df.assign(
    transcription_after=exploded_transcript_df['transcription'].shift(periods=-1, fill_value=' ')
)

In [5]:
# Build "<previous> <current> <next>" for every row, joined by single spaces.
exploded_transcript_df['transcription_with_context'] = (
    exploded_transcript_df['transcription_before']
    .add(" ")
    .add(exploded_transcript_df['transcription'])
    .add(" ")
    .add(exploded_transcript_df['transcription_after'])
)

In [6]:
# Trim the padding left by the ' ' fill values on the first and last rows.
# The vectorized .str.strip() leaves missing values as NaN instead of raising
# AttributeError the way a per-element x.strip() call does.
exploded_transcript_df['transcription_with_context'] = exploded_transcript_df['transcription_with_context'].str.strip()

In [7]:

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the given OpenAI model.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    The text is submitted as a one-element input list through the module-level
    ``client``, and the ``embedding`` of the first returned item is returned.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


In [8]:
# Embed every transcript segment on its own, one get_embedding call per row.
# A failed call is stored as the integer 0 so the list stays aligned with the
# DataFrame rows (sentinel kept from the original code; downstream code may
# check for it -- TODO confirm).
transcription_embeddings_list = []
total_segments = len(exploded_transcript_df['transcription'])
for counter, segment in enumerate(exploded_transcript_df['transcription']):
    print_text = f"Embedding segment {counter} of {total_segments} segments -- {int((counter)/total_segments*100)}% complete         "
    print("\r", print_text, end="")
    try:
        embedding = get_embedding(segment)
    except Exception as error:
        # Catch Exception rather than everything, so an interrupt still stops
        # the loop, and report why the call failed.
        embedding = 0
        print(f"Failed to generate embedding for segment {counter} of {total_segments}: {error!r}")
    transcription_embeddings_list.append(embedding)

exploded_transcript_df['transcription_embedding'] = transcription_embeddings_list

 Embedding segment 3756 of 23621 segments -- 15% complete         Failed to generate embedding for segment 3756 of 23621
 Embedding segment 10196 of 23621 segments -- 43% complete         Failed to generate embedding for segment 10196 of 23621
 Embedding segment 15985 of 23621 segments -- 67% complete         Failed to generate embedding for segment 15985 of 23621
 Embedding segment 23620 of 23621 segments -- 99% complete         

In [9]:
# Embed every segment together with its neighbouring text, one get_embedding
# call per row. A failed call is stored as the integer 0 so the list stays
# aligned with the DataFrame rows (sentinel kept from the original code;
# downstream code may check for it -- TODO confirm).
transcription_with_context_embeddings_list = []
total_context_segments = len(exploded_transcript_df['transcription_with_context'])
for counter, segment in enumerate(exploded_transcript_df['transcription_with_context']):
    print_text = f"Embedding context segment {counter} of {total_context_segments} segments -- {int((counter)/total_context_segments*100)}% complete         "
    print("\r", print_text, end="")
    try:
        embedding = get_embedding(segment)
    except Exception as error:
        # Catch Exception rather than everything, so an interrupt still stops
        # the loop, and report why the call failed.
        embedding = 0
        print(f"Failed to generate embedding for context segment {counter} of {total_context_segments}: {error!r}")
    transcription_with_context_embeddings_list.append(embedding)

exploded_transcript_df['transcription_with_context_embedding'] = transcription_with_context_embeddings_list

 Embedding context segment 23620 of 23621 segments -- 99% complete         

In [10]:
# Show the frame as this cell's output to inspect the context and embedding columns.
exploded_transcript_df

Unnamed: 0,file_name,transcription,transcription_before,transcription_after,transcription_with_context,transcription_embedding,transcription_with_context_embedding
0,KCKO_107_9_AM_20240601_130000_000.mp3,"Bonito, todo me parece bonito Que bonito que ...",,Cuando te va bonito que te va Cuando te va bon...,"Bonito, todo me parece bonito Que bonito que t...","[0.007744003087282181, -0.036393869668245316, ...","[0.010290478356182575, -0.037478603422641754, ..."
1,KCKO_107_9_AM_20240601_130000_000.mp3,Cuando te va bonito que te va Cuando te va bon...,"Bonito, todo me parece bonito Que bonito que ...",Cuando te va bonito que te va Cuando te va bon...,"Bonito, todo me parece bonito Que bonito que t...","[0.009081456810235977, -0.03175891190767288, -...","[0.004798032809048891, -0.018404632806777954, ..."
2,KCKO_107_9_AM_20240601_130000_000.mp3,Cuando te va bonito que te va Cuando te va bon...,Cuando te va bonito que te va Cuando te va bon...,¡Más de lo que te gusta! ¡Acelera! ¡Acelera! ¡...,Cuando te va bonito que te va Cuando te va bon...,"[0.005347225349396467, -0.01722787693142891, -...","[0.00919861625880003, -0.015014556236565113, -..."
3,KCKO_107_9_AM_20240601_130000_000.mp3,¡Más de lo que te gusta! ¡Acelera! ¡Acelera! ¡...,Cuando te va bonito que te va Cuando te va bon...,m$ km Pongas entre tu yas dudas que poros Pue...,Cuando te va bonito que te va Cuando te va bon...,"[0.049135785549879074, -0.009870842099189758, ...","[0.03353051468729973, -0.017319098114967346, -..."
4,KCKO_107_9_AM_20240601_130000_001.mp3,m$ km Pongas entre tu yas dudas que poros Pue...,¡Más de lo que te gusta! ¡Acelera! ¡Acelera! ¡...,Están una morca simbocife Y mis siete como el...,¡Más de lo que te gusta! ¡Acelera! ¡Acelera! ¡...,"[0.032236598432064056, -0.007982238195836544, ...","[0.0459740050137043, -0.013996646739542484, -0..."
...,...,...,...,...,...,...,...
23616,WWFE_670_AM_20240601_100000_017.mp3,También es un increíble padre,"Y con pasión, ayuda hasta personas que no conoce","El siempre, siempre tiene tiempo para su familia","Y con pasión, ayuda hasta personas que no cono...","[0.06576722860336304, 0.018034888431429863, -0...","[0.0626211166381836, 0.010116624645888805, -0...."
23617,WWFE_670_AM_20240601_100000_017.mp3,"El siempre, siempre tiene tiempo para su familia",También es un increíble padre,"Yo, Elisa de Mari","También es un increíble padre El siempre, siem...","[0.03570161759853363, 0.03936721384525299, -0....","[0.04543386772274971, 0.014276322908699512, -0..."
23618,WWFE_670_AM_20240601_100000_017.mp3,"Yo, Elisa de Mari","El siempre, siempre tiene tiempo para su familia","Amos, Amipopá, Jorge de Asfias","El siempre, siempre tiene tiempo para su famil...","[0.022891750559210777, -0.010619109496474266, ...","[0.028432738035917282, 0.004396406468003988, -..."
23619,WWFE_670_AM_20240601_100000_017.mp3,"Amos, Amipopá, Jorge de Asfias","Yo, Elisa de Mari",Wellmax y Pasteo Armarco Centres celebrando a ...,"Yo, Elisa de Mari Amos, Amipopá, Jorge de Asfi...","[0.020974498242139816, 0.011386328376829624, -...","[0.044912174344062805, -0.01616838201880455, -..."


In [11]:
# Destination CSV, resolved relative to the parent of the working directory.
# NOTE(review): the second name, embedded_false_statements_path, looks like a
# copy-paste leftover; it is still bound here in case other code reads it.
embedded_transcripts_path = embedded_false_statements_path = Path().cwd().parent.joinpath('data/embedded_transcripts/embedded_transcripts.csv')

# Create the output directory first so the write cannot fail on a missing
# folder after all the embedding calls have already been made.
embedded_transcripts_path.parent.mkdir(parents=True, exist_ok=True)

exploded_transcript_df.to_csv(embedded_transcripts_path)