In [4]:
import os
import pandas as pd
from datetime import datetime, timedelta

# Step 1: Get current directory
current_dir = os.getcwd()

# Step 2: Load the CSV
csv_path = os.path.join(current_dir, '..', 'data', 'checkpoints', 'checkpoint9_no_2025.csv')
df = pd.read_csv(csv_path)

# Step 3: Parse the date column
print("[INFO] Parsing date column...")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])  # Drop rows where the date couldn't be parsed
df['Date'] = df['Date'].dt.date

# Step 4: Generate all expected dates from 2018 to 2024
start_date = datetime(2018, 1, 1).date()
end_date = datetime(2024, 12, 31).date()
expected_dates = set(start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1))

# Step 5: Get actual dates in the dataset
actual_dates = set(df['Date'])

# Step 6: Find missing dates
print("[INFO] Finding missing dates...")
missing_dates = sorted(list(expected_dates - actual_dates))

# Step 7: Save missing dates to a text file
output_file = os.path.join(current_dir, 'missing_dates.txt')
print(f"[INFO] Writing missing dates to: {output_file}")
with open(output_file, 'w') as f:
    for date in missing_dates:
        f.write(str(date) + '\n')

print(f"[DONE] Found {len(missing_dates)} missing dates.")


  df = pd.read_csv(csv_path)


[INFO] Parsing date column...
[INFO] Finding missing dates...
[INFO] Writing missing dates to: c:\Users\Indel\Documents\gatewayinitiative-lawrencepd\scripts\missing_dates.txt
[DONE] Found 96 missing dates.


### After transformation of missing data (md) dataset (md_checkpoint8), combine with most recent checkpoint

In [25]:
import os
import pandas as pd

In [26]:
notebook_dir = os.getcwd()

In [27]:
md_file_path = os.path.join(notebook_dir, '..', 'data', 'missing_dates_csv')

In [28]:
cp_file_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints')

In [29]:
md_df = pd.read_csv(os.path.join(md_file_path, 'md_checkpoints', 'md_checkpoint8_mass_filtered.csv'))
cp_df = pd.read_csv(os.path.join(cp_file_path, 'checkpoint9_no_2025.csv'))

In [30]:
md_df.columns = md_df.columns.str.strip()
cp_df.columns = cp_df.columns.str.strip()

In [31]:
combined_df = pd.concat([cp_df, md_df], ignore_index=True)

In [36]:
# Check what the date values look like
print("[INFO] Sample raw md_df['Date'] values:")
print(md_df['Date'].head(10))

[INFO] Sample raw md_df['Date'] values:
0    2024-01-07 00:04:59
1    2024-01-07 00:07:28
2    2024-01-07 00:07:41
3    2024-01-07 00:29:26
4    2024-01-07 00:29:38
5    2024-01-07 00:42:11
6    2024-01-07 01:17:00
7    2024-01-07 01:21:27
8    2024-01-07 01:31:53
9    2024-01-07 02:02:43
Name: Date, dtype: object


In [37]:
print("[INFO] Sample raw cp_df['Date'] values:")
print(cp_df['Date'].head(10))

[INFO] Sample raw cp_df['Date'] values:
0    4/1/2018 0:02
1    4/1/2018 0:03
2    4/1/2018 0:11
3    4/1/2018 0:13
4    4/1/2018 0:23
5    4/1/2018 0:23
6    4/1/2018 0:27
7    4/1/2018 0:28
8    4/1/2018 0:45
9    4/1/2018 0:46
Name: Date, dtype: object


In [38]:
# Fix cp_df date format (Month/Day/Year Hour:Minute)
cp_df['Date'] = pd.to_datetime(cp_df['Date'], format='%m/%d/%Y %H:%M', errors='coerce')

# Fix md_df date format (ISO format should work without format)
md_df['Date'] = pd.to_datetime(md_df['Date'], errors='coerce')


In [39]:
# Combine after fixing dates
combined_df = pd.concat([cp_df, md_df], ignore_index=True)

# Drop any rows with bad dates and sort
before = len(combined_df)
combined_df = combined_df.sort_values(by='Date').reset_index(drop=True)
combined_df = combined_df.dropna(subset=['Date'])
after = len(combined_df)
print(f"[INFO] Dropped {before - after} rows due to invalid dates.")

[INFO] Dropped 0 rows due to invalid dates.


In [40]:
print(combined_df[combined_df['Date'].isna()])

Empty DataFrame
Columns: [Incident #, Date, Type, Location, Arrested, Location Prefix, DOB, Charges, latitude, longitude, Cleaned Location, person_id, category, Year, crime_severity]
Index: []


In [41]:
# Save
combined_output = os.path.join(cp_file_path, 'checkpoint10_combined_data.csv')
combined_df.to_csv(combined_output, index=False)
print(f"[DONE] Combined dataset saved to: {combined_output}")

[DONE] Combined dataset saved to: c:\Users\Indel\Documents\gatewayinitiative-lawrencepd\scripts\..\data\checkpoints\checkpoint10_combined_data.csv
