In [8]:
import pandas as pd

# Load the benchmark inputs. Both paths are relative to the notebook's working
# directory, so adjust them if the notebook is run from somewhere else.
# wapo_data: the incident table; later code in this notebook reads its 'name',
# 'date', 'city', 'county' and 'agency_ids' columns.
wapo_data = pd.read_csv('../benchmark_data/fatal-police-shootings-data.csv')
# police_dept: the agency lookup table; later code reads its 'id' and 'name' columns.
police_dept = pd.read_csv('../benchmark_data/fatal-police-shootings-agencies.csv')
# print(wapo_data)
# Generational suffixes that are ignored when building the "first last" variant.
# They are compared after trailing '.' / ',' is stripped, so "Jr." matches "Jr".
common_suffixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']


def process_name(name):
    """Build the name variants used to search for a person in free text.

    Parameters
    ----------
    name : object
        Raw value from the 'name' column. May be NaN/None or a non-string
        value; non-strings are converted with ``str``.

    Returns
    -------
    list
        ``[full_name, first_last_name]`` where ``first_last_name`` keeps only
        the first token and the surname (middle names and a trailing
        generational suffix are dropped). Names with one or two tokens are
        returned unchanged in both positions.
        ``[None, None]`` when the value is missing or contains no tokens.
    """
    if pd.isnull(name):
        return [None, None]

    # Convert to string in case a numeric value slipped into the column.
    name = str(name)
    parts = name.split()

    if not parts:
        # Blank / whitespace-only value: there is nothing to search for, and
        # indexing parts[-1] below would raise IndexError.
        return [None, None]

    if len(parts) > 2:
        # Tolerate punctuation on the suffix ("Jr.", "Jr,").
        has_suffix = parts[-1].rstrip('.,') in common_suffixes
        if has_suffix:
            # "John Smith, Jr." -> surname token is "Smith," ; drop the comma.
            surname = parts[-2].rstrip(',')
        else:
            surname = parts[-1]
        first_last_name = f"{parts[0]} {surname}"
    else:
        first_last_name = name

    return [name, first_last_name]

# Attach the [full name, first-last name] pair produced by process_name to every record.
wapo_data['names'] = wapo_data['name'].apply(process_name)

# Keep only the incidents inside the benchmark window (both bounds inclusive).
# NOTE: 'date' is used exactly as loaded from the CSV; no pd.to_datetime
# conversion is applied before the comparison.
wapo_data = wapo_data.loc[wapo_data['date'].between('2023-12-15', '2023-12-30')]

# Search results to evaluate against the incidents above.
# NOTE: 'publication_date' is likewise left as loaded (not converted to datetime).
result = pd.read_csv('../data_storage/20240306143757_test_weaviate_result.csv')

# Renumber the surviving rows 0..n-1 so positions and index labels coincide.
wapo_data = wapo_data.reset_index(drop=True)

# wapo_data

import numpy as np
from dateutil import tz


def parse_closest_date(wapo_date, publication_dates_str):
    """Return the publication date that lies closest to ``wapo_date``.

    Parameters
    ----------
    wapo_date : pandas.Timestamp or datetime
        Reference date. Any timezone info is discarded before comparing.
    publication_dates_str : str
        One or more dates separated by commas, optionally wrapped in
        parentheses, e.g. ``"(2023-12-19, 2023-12-21)"``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The tz-naive candidate (parsed as UTC, then stripped of its timezone)
        with the smallest absolute distance to ``wapo_date`` measured in whole
        days; the earliest-listed candidate wins ties. ``NaT`` is returned when
        ``wapo_date`` is missing, when ``publication_dates_str`` is not a
        string (e.g. a NaN cell), or when it contains no usable date.

    Raises
    ------
    Whatever ``pd.to_datetime`` raises for a non-empty token that is not a
    valid date; malformed input is not silently ignored.
    """
    if pd.isna(wapo_date) or not isinstance(publication_dates_str, str):
        # Nothing to compare against; callers that take `.days` of the
        # difference get NaN, which fails any "<= n days" test.
        return pd.NaT

    # Ensure the reference is tz-naive so it can be subtracted from the
    # tz-naive candidates built below.
    wapo_date = wapo_date.replace(tzinfo=None)

    tokens = publication_dates_str.replace('(', '').replace(')', '').split(',')

    # Parse each token exactly once; skip blanks (e.g. after a trailing comma)
    # and anything that parses to NaT so they cannot poison the comparison.
    candidates = []
    for token in tokens:
        token = token.strip()
        if not token:
            continue
        parsed = pd.to_datetime(token, utc=True)
        if pd.isna(parsed):
            continue
        candidates.append(parsed.replace(tzinfo=None))

    if not candidates:
        return pd.NaT

    # min() keeps the first of equally distant candidates.
    return min(candidates, key=lambda candidate: abs(wapo_date - candidate).days)

In [9]:
# Assuming the preparation steps have been done as in your provided code

# Helper function to check conditions
def check_conditions(row, results_df, police_dept):
    """Decide whether any search result appears to cover the incident in ``row``.

    The checks run in this order and the first hit returns ``True``:

    1. any entry of ``row['names']`` occurs in a result snippet (no date limit);
    2. ``row['city']`` occurs in a snippet published within 7 days of ``row['date']``;
    3. ``row['county']`` occurs in a snippet published within 5 days;
    4. the name of an agency referenced by ``row['agency_ids']`` occurs in a
       snippet published within 5 days.

    Missing or empty names, city, county and agency names are skipped: an
    empty string is a substring of every snippet and would otherwise match
    every result inside the date window.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'names' (iterable of strings/None), 'date', 'city',
        'county' and 'agency_ids'.
    results_df : pandas.DataFrame
        Must provide 'snippet' and 'publication_date' columns;
        'publication_date' is passed to ``parse_closest_date``.
    police_dept : pandas.DataFrame
        Must provide 'id' and 'name' columns.

    Returns
    -------
    bool
    """
    snippets = [str(snippet) for snippet in results_df['snippet']]

    # 1. Name match, independent of dates.
    names = [str(name) for name in row['names'] if pd.notnull(name)]
    if any(name and name in snippet for name in names for snippet in snippets):
        return True

    wapo_date = pd.to_datetime(row['date'])

    # Missing values become "" and are then skipped by _mentioned_within.
    city = str(row['city']) if pd.notnull(row['city']) else ""
    county = str(row['county']) if pd.notnull(row['county']) else ""

    # 2. City within 7 days.
    if _mentioned_within(city, 7, wapo_date, results_df):
        return True

    # 3. County within 5 days.
    if _mentioned_within(county, 5, wapo_date, results_df):
        return True

    # 4. Any referenced agency within 5 days.
    for agency_name in _agency_names(row['agency_ids'], police_dept):
        if _mentioned_within(agency_name, 5, wapo_date, results_df):
            return True

    return False


def _mentioned_within(term, max_days, wapo_date, results_df):
    """True if a snippet contains ``term`` and its closest publication date is within ``max_days`` days."""
    if not term:
        return False
    for snippet, publication_dates in zip(results_df['snippet'], results_df['publication_date']):
        if term not in str(snippet):
            # Dates are only parsed for results that mention the term.
            continue
        closest = parse_closest_date(wapo_date, publication_dates)
        # A NaT result gives NaN days, which never satisfies the comparison.
        if abs((wapo_date - closest).days) <= max_days:
            return True
    return False


def _agency_names(agency_ids, police_dept):
    """Return the non-empty 'name' values of the agencies referenced by ``agency_ids``."""
    if pd.isnull(agency_ids):
        return []
    # NOTE(review): the column name is plural; presumably several ids can be
    # joined with ';' in one cell - confirm against the CSV schema. A single
    # id (number or string) is handled by the same path.
    wanted = {_normalise_id(piece) for piece in str(agency_ids).split(';')}
    wanted.discard('')
    if not wanted:
        return []
    known_ids = police_dept['id'].map(_normalise_id)
    matched = police_dept.loc[known_ids.isin(wanted), 'name']
    return [str(name) for name in matched if pd.notnull(name) and str(name)]


def _normalise_id(value):
    """Render an id as text so that 73, 73.0 and '73' all compare equal."""
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text
    return str(int(number)) if number.is_integer() else text


# Evaluate every incident against the search results; the per-row boolean
# returned by check_conditions is stored in 'condition_met'.
wapo_data['condition_met'] = wapo_data.apply(
    lambda incident: check_conditions(incident, results_df=result, police_dept=police_dept),
    axis=1,
)

# Index labels of the incidents for which a condition was met.
idx = wapo_data.index[wapo_data['condition_met']]
print(idx)

# Summary: number of matched incidents versus all incidents in the window.
total_rows = len(wapo_data)
true_count = wapo_data['condition_met'].sum()
print(f"True conditions: {true_count} out of {total_rows} rows.")


Index([ 1,  2,  4,  5,  7,  8,  9, 10, 14, 15, 16, 18, 23, 25, 26, 32, 33, 35,
       36, 38, 39, 41, 42, 43, 44, 45, 47, 48, 51],
      dtype='int64')
True conditions: 29 out of 53 rows.
