In [1]:
import pandas as pd
import os
import numpy as np
from pathlib import Path
from ast import literal_eval
from openai import OpenAI
from libretranslatepy import LibreTranslateAPI


In [40]:

# ensure libretranslate is running on port 5000

number_return_values = 1

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-proj-HSrHGyOHtEDtr9loLzRjT3BlbkFJTUFaqCLYDEs3B7qpBz7z"))

transcript_json= Path().cwd().parent.joinpath('data/output_json/transcript.json')

print('Importing transcript json...')

transcript_df = pd.read_json(transcript_json)



Importing transcript json...


In [48]:


print('Organizing data...')

exploded_transcript_df = transcript_df.explode('transcription').reset_index(drop=True)

exploded_transcript_df = exploded_transcript_df[['file_name', 'transcription']]

embedded_false_statements_path = Path().cwd().parent.joinpath('data/factchecked_statements/embedded_false_statements.csv')


Organizing data...


In [49]:
exploded_transcript_df

Unnamed: 0,file_name,transcription
0,Actualidad_Radio_0523_000.mp3,y y y y y y y y y y y y y y y y y cambiado de...
1,Actualidad_Radio_0523_000.mp3,"De dés breakfast, de Estado, mepolisaba una de..."
2,Actualidad_Radio_0523_000.mp3,"unbank a Charlotte, un ayupo,ambreodies plaño,..."
3,Actualidad_Radio_0523_000.mp3,El último presidente que le fue bien en Nueva ...
4,Actualidad_Radio_0523_000.mp3,Vamos a hablarles también de lo que está pasan...
...,...,...
6896,América_Noticias_0524_057.mp3,¡El siguiente programa es una emisión previame...
6897,América_Noticias_0524_058.mp3,de la gente
6898,América_Noticias_0524_058.mp3,En el caso de la gente que se ha dado informat...
6899,América_Noticias_0524_058.mp3,Las informaciones se de Cuba a esta hora por s...


In [46]:
exploded_transcript_df[3]

KeyError: 3

In [50]:

print("Importing false statements with embeddings...")

false_text_df = pd.read_csv(embedded_false_statements_path)

false_text_dict = false_text_df['statement'].to_dict()

false_text_df['statement_embedding'] = false_text_df.statement_embedding.apply(eval).apply(np.array)

false_text_embedding_dict = false_text_df['statement_embedding'].to_dict()

false_text_df['statement'] = false_text_df['statement'].replace('\'', '').replace('/"', '').replace('\"', '').replace('.','').replace('?','').replace('!','')


Importing false statements with embeddings...


In [60]:

def get_embedding(text, model="text-embedding-3-small"):
   return client.embeddings.create(input = [text], model=model).data[0].embedding

def cosine_similarity(vec1, vec2):
    # Normalize each vector to unit length
    vec1_norm = vec1 / np.linalg.norm(vec1)
    vec2_norm = vec2 / np.linalg.norm(vec2)
  
    # Calculate dot product between normalized vectors
    similarity = np.dot(vec1_norm, vec2_norm)
    return similarity

def search_false_statements(search_terms,false_text_df=false_text_df, n=number_return_values, pprint=True):
    radio_embedding = get_embedding(search_terms, model='text-embedding-3-small')
    similarities = []
    counter = 0
    total_statements = len(false_text_embedding_dict)
    for factcheck in sorted(false_text_embedding_dict):
        counter += 1
        try:
            similarities.append(cosine_similarity(factcheck, radio_embedding))
            print_text = f"Generated similarity for statement {counter} of {total_statements}. [{int((counter / total_statements * 100))}%]          "
            print("\r", print_text, end="")
        except:
            similarities.append('failed to generate similarity')
            print(f"Embedding failed for statement {counter} of {total_statements}")
    false_text_df['similarities'] = similarities
    results = (
      false_text_df.sort_values("similarities", ascending=False)
      .head(n)
   )
    if pprint:
      for r in results:
         print(r[:200])
         print()
    return results


In [65]:
false_text_df.head()

Unnamed: 0,id,author,statement,rating,datePublished,days_since_publication,reformated_date,url,time_since_publication,statement_embedding,similarities
0,e74002af-7ca3-4071-917c-0585430d459d,"{'@type': 'Organization', 'name': 'Lupa', 'url...",Benjamin Netanyahu não atendeu Lula ao telefon...,Falso,2023-10-19,226,2023-10-19,https://lupa.uol.com.br/jornalismo/2023/10/19/...,"226 days, 0:00:00","[-0.022783860564231873, 0.017445528879761696, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,64dc0acd-92c7-45fb-9255-2a5ff85cc2da,"{'@type': 'Organization', 'name': 'Boatos.org'...",LULA APROVA O USO DE PAU DE ARARA PRA TRANSPOR...,Falso,2023-10-19,226,2023-10-19,https://www.boatos.org/politica/lula-aprovou-a...,"226 days, 0:00:00","[-0.022783860564231873, 0.017445528879761696, ...","[0.007634697513986262, 0.0003454049969404672, ..."
2,3f52fcf6-3825-4c32-b6aa-854655e70128,"{'@type': 'Organization', 'name': 'Lupa', 'url...",Líder palestino defende Israel na ONU e deixa ...,Falso,2023-10-23,222,2023-10-23,https://lupa.uol.com.br/jornalismo/2023/10/23/...,"222 days, 0:00:00","[-0.03071289137005806, 0.02502615377306938, 0....","[0.007634697513986262, 0.0003454049969404672, ..."
3,f953a13b-efbe-4963-84fa-c544738fb401,"{'@type': 'Organization', 'name': 'Boatos.org'...","Após falência no Brasil, Pernambucanas se vê c...",Falso,2023-10-23,222,2023-10-23,https://www.boatos.org/tecnologia/pernambucana...,"222 days, 0:00:00","[0.06262306869029999, -0.029643438756465912, 0...","[0.007634697513986262, 0.0003454049969404672, ..."
4,64dddcbd-2444-41af-8269-de7e09d3257d,"{'@type': 'Organization', 'name': 'Boatos.org'...","Líder do Hamas, com Haddad e Gleise,Aliados?vo...",Falso,2023-10-19,226,2023-10-19,https://www.boatos.org/politica/haddad-gleisi-...,"226 days, 0:00:00","[-0.008113361895084381, 0.01009256113320589, 0...","[0.007634697513986262, 0.0003454049969404672, ..."


In [55]:

top_matches_json = []

def search_all_transcripts():
    errors = 0
    for row in exploded_transcript_df.itertuples(name='segment'):
        try:
            results = search_false_statements(row.transcription, pprint=False)
            top_match = results.statement.values
            similarity = results.similarities
            match_index = row.Index
            top_matches = {}
            top_matches["index"] = f"{match_index}"
            top_matches["filename"] = row.file_name
            top_matches["input_statement"] = row.transcription
            top_matches["checked_false_statement"] = f"{top_match}"
            top_matches["similarity"] = f"{similarity}"
            top_matches_json.append(top_matches)
            print_text = f"Finished checking {match_index + 1} of {len(exploded_transcript_df)} statements -- {int((match_index + 1)/len(exploded_transcript_df)*100)}% complete         "
            print("\r", print_text, end="")
        except:
            errors += 1
            print(f"Error checking statement {match_index + 2}")
    print(f"Finished checking all statements with {errors} error(s)")

print("Starting search of all transcripts...")

search_all_transcripts()

Starting search of all transcripts...


  vec1_norm = vec1 / np.linalg.norm(vec1)


 Generated similarity for statement 16655 of 16655. [100%]          

UnboundLocalError: cannot access local variable 'match_index' where it is not associated with a value

In [None]:

'''

def get_embedding(text, model="text-embedding-3-small"):
   return client.embeddings.create(input = [text], model=model).data[0].embedding

def cosine_similarity(vec1, vec2):
    # Normalize each vector to unit length
    vec1_norm = vec1 / np.linalg.norm(vec1)
    vec2_norm = vec2 / np.linalg.norm(vec2)
  
    # Calculate dot product between normalized vectors
    similarity = np.dot(vec1_norm, vec2_norm)
    return similarity

def search_false_statements(search_terms,false_text_df=false_text_df, n=number_return_values, pprint=True):
   embedding = get_embedding(search_terms, model='text-embedding-3-small')
   false_text_df['similarities'] = false_text_df.statement_embedding.apply(lambda x: cosine_similarity(x, embedding))
   results = (
      false_text_df.sort_values("similarities", ascending=False)
      .head(n)
   )
   if pprint:
      for r in results:
         print(r[:200])
         print()
   return results

top_matches_json = []

def search_all_transcripts():
    errors = 0
    for row in exploded_transcript_df.itertuples(name='segment'):
        try:
            results = search_false_statements(row.transcription, pprint=False)
            top_match = results.statement.values
            similarity = results.similarities
            match_index = row.Index
            top_matches = {}
            top_matches["index"] = f"{match_index}"
            top_matches["filename"] = row.file_name
            top_matches["input_statement"] = row.transcription
            top_matches["checked_false_statement"] = f"{top_match}"
            top_matches["similarity"] = f"{similarity}"
            top_matches_json.append(top_matches)
            print_text = f"Finished checking {match_index + 1} of {len(exploded_transcript_df)} statements -- {int((match_index + 1)/len(exploded_transcript_df)*100)}% complete         "
            print("\r", print_text, end="")
        except:
            errors += 1
            print(f"Error checking statement {match_index + 2}")
    print(f"Finished checking all statements with {errors} error(s)")

print("Starting search of all transcripts...")

search_all_transcripts()

'''

In [None]:

print("Creating DataFrame...")

top_matches_df = pd.DataFrame(top_matches_json)

top_matches_df.sort_values('similarity', ascending=False)

lt = LibreTranslateAPI("http://localhost:5000")

def libretranslate_spanish(input_text):
    return lt.translate(f"{input_text}", "es", "en")

def libretranslate_french(input_text):
    return lt.translate(f"{input_text}", "fr", "en")

print("Adding translation...")

top_matches_df['similarity_value'] = top_matches_df['similarity'].apply(lambda x: x.split()[1]).astype(float)
top_matches_df['factcheck_index'] = top_matches_df['similarity'].apply(lambda x: x.split()[0])

over_50 = top_matches_df.loc[top_matches_df['similarity_value'] >= .50]

over_50['translation'] = over_50['input_statement'].apply(lambda x: libretranslate_french(x))
over_50['translation'] = over_50['translation'].apply(lambda x: libretranslate_spanish(x))

over_50 = over_50.sort_values('similarity_value', ascending=False)

over_50 = over_50[['filename', 'input_statement', 'translation', 'checked_false_statement', 'similarity_value', 'factcheck_index']]

over_50_csv_filepath = Path().cwd().parent.joinpath('data/output_csv/potential_misinformation.csv')

print("Generating csv file...")

over_50.to_csv(f"{over_50_csv_filepath}")