In [8]:
import pandas as pd

# Load your data (adjust the path as needed)
wapo_data = pd.read_csv('../benchmark_data/fatal-police-shootings-data.csv')
police_dept = pd.read_csv('../benchmark_data/fatal-police-shootings-agencies.csv')
# print(wapo_data)
# Define common suffixes to ignore
common_suffixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']

# Define a function that processes each name
def process_name(name):
    # Ensure name is a string and not NaN or any float value
    if pd.isnull(name):
        # Handle NaN values or any other non-string values
        return [None, None]  # Or return ['Unknown', 'Unknown'] based on your preference
    else:
        # Convert to string in case it's not (handles numeric values)
        name = str(name)
        # Split the name into parts
        parts = name.split()
        # Check and construct the name ignoring common suffixes and middle names
        if parts[-1] in common_suffixes and len(parts) > 2:
            first_last_name = f"{parts[0]} {parts[-2]}"
        elif len(parts) > 2:
            first_last_name = f"{parts[0]} {parts[-1]}"
        else:
            first_last_name = name
        # Return a list with the original name, the first-last name version, and the version ignoring suffixes
        return [name, first_last_name]

# Apply the function to the 'name' column and create a new column with the results
wapo_data['names'] = wapo_data['name'].apply(process_name)
# wapo_data['date'] = pd.to_datetime(wapo_data['date'])
wapo_data = wapo_data[(wapo_data['date'] >= '2023-12-01') & (wapo_data['date'] <= '2023-12-15')]

result = pd.read_csv('../data_storage/benchmark/benchmark_large.csv')
# result['publication_date'] = pd.to_datetime(result['publication_date'])
wapo_data.reset_index(drop=True, inplace=True)
result_hit = result[result['Hit?']=="Y"]
print(result_hit.shape)
print(result.shape)
# wapo_data

import numpy as np
from dateutil import tz


def parse_closest_date(wapo_date, publication_dates_str):
    # Ensure wapo_date is tz-naive
    wapo_date = wapo_date.replace(tzinfo=None)
    
    # Split the string into individual dates and remove any whitespace
    date_str_list = publication_dates_str.strip().replace('(', '').replace(')', '').split(',')
    
    # Convert string dates to datetime and ensure they are tz-naive
    date_diffs = [abs(wapo_date - pd.to_datetime(date_str.strip(), utc=True).replace(tzinfo=None)).days for date_str in date_str_list]
    
    # Return the date with the minimum difference
    min_diff_index = np.argmin(date_diffs)
    return pd.to_datetime(date_str_list[min_diff_index].strip(), utc=True).replace(tzinfo=None)

In [9]:
# Assuming the preparation steps have been done as in your provided code

# Helper function to check conditions
def check_conditions(row, results_df, police_dept):
    # Ensure 'names' is a list of strings, filtering out any None values
    names = [str(name) for name in row['names'] if pd.notnull(name)]
    
    # Check if any name in 'names' appears in 'snippet'
    for name in names:
        if any(name in str(snippet) for snippet in results_df['snippet']):
            return True
    
    # Convert 'date' to datetime for comparison
    wapo_date = pd.to_datetime(row['date'])
    
    # Ensure 'city' and 'county' are strings, handling potential NaN values
    city = str(row['city']) if pd.notnull(row['city']) else ""
    county = str(row['county']) if pd.notnull(row['county']) else ""
    
    # Check if 'city' appears in 'snippet' and dates within 7 days
    city_condition = results_df.apply(lambda x: city in str(x['snippet']) and abs((wapo_date - parse_closest_date(wapo_date, x['publication_date'])).days) <= 7, axis=1)
    if city_condition.any():
        return True
    
    # Check if 'county' appears in 'snippet' and dates within 5 days
    county_condition = results_df.apply(lambda x: county in str(x['snippet']) and abs((wapo_date - parse_closest_date(wapo_date, x['publication_date'])).days) <= 5, axis=1)
    if county_condition.any():
        return True
    
    agency_id = row['agency_ids']
    if pd.notnull(agency_id):
        # Find the corresponding name in 'police_dept' for the given 'agency_id'
        agency_name = police_dept.loc[police_dept['id'] == agency_id, 'name'].values
        if len(agency_name) > 0:  # Ensure there is a match
            agency_name = str(agency_name[0])  # Convert to string in case it's not
            # Check if the agency name appears in 'snippet' and dates within 5 days
            agency_condition = results_df.apply(lambda x: agency_name in str(x['snippet']) and abs((wapo_date - parse_closest_date(wapo_date, x['publication_date'])).days) <= 5, axis=1)
            if agency_condition.any():
                return True

    # If none of the conditions are met
    return False


# Apply the helper function to each row in wapo_data
wapo_data['condition_met'] = wapo_data.apply(check_conditions, results_df=result, police_dept=police_dept, axis=1)

idx = wapo_data.index[wapo_data['condition_met']]
print(idx)

# Print summary
true_count = wapo_data['condition_met'].sum()
total_rows = len(wapo_data)
print(f"True conditions: {true_count} out of {total_rows} rows.")


Index([ 2,  5, 11, 13, 14, 20, 21, 22, 24, 26, 29, 33, 34, 37, 38, 40, 42, 45,
       48, 50, 52, 54, 55],
      dtype='int64')
True conditions: 23 out of 57 rows.


In [None]:

from waybacknews.searchapi import SearchApiClient
from datetime import datetime
import pandas as pd
import requests
from retrying import retry
import requests_cache
from tqdm import tqdm
# import mediacloud.api
# from newspaper import Article
# import unicodedata
import concurrent.futures
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
from openai import OpenAI

import weaviate
import json
import os
from dotenv import load_dotenv


load_dotenv()
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

weaviate_api_key = os.getenv('WEAVIATE_API_KEY')
weaviate_url = os.getenv('WEAVIATE_URL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = weaviate.Client(
    url = weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key), 
    additional_headers = {
        "X-OpenAI-Api-Key": OPENAI_API_KEY
    }
)

openai_client = OpenAI()


query_text = """
Breaking news coverage on recent police shooting incidents in the United States leading to fatalities of the victims. The incidents are about police officer, deputy, sheriff, trooper, cop who fired shots and killed someone.
Do not include aggregated summary, list, or archive of incidents happened in the past. Do not include if it's about a past, not recent incident. Only include if the story mentioned the death of the victim.
Do not include if it's coverage on legal proceedings, court cases, or trials of a past incident. Do not include if it's about the aftermath of the incident, such as protests, rallies, or demonstrations on a past incident.
Do not include if it happened in a foreign country, only include if it's about the United States.
"""
instruction_prompt = "Represent the news articles for retrieval:"

# # Using the text-embedding-3-large model to generate embeddings
# response = openai_client.embeddings.create(
#     input=query_text,
#     model="text-embedding-3-small"
# )
# query_vector = response.data[0].embedding

# query_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1")

# query_vector = query_model.encode(query_text).tolist()


query_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1")
query_vector = query_model.encode(query_text, convert_to_tensor=True)

# query_vector = embed_model.encode([[instruction_prompt,query_text]]).tolist()
# query_vector = [item for sublist in query_vector for item in sublist]
print("vector:", query_vector.__len__())

get_articles_group = f"""
{{
  Get {{
    Article(
      nearVector: {{
        vector: {query_vector}
      }},
      group: {{
        type: merge,
        force: 0
      }},
      limit: 80
    ) {{
      title,
      publication_date,
      snippet
    }}
  }}
}}
"""


query_result = client.query.raw(get_articles_group)
# save to csv
df = pd.DataFrame(query_result['data']['Get']['Article'])
df.to_csv(f'./data_storage/{timestamp}_test_weaviate_result.csv', index=False)