In [3]:
import pandas as pd
import numpy as np
import json

# Load the Excel file
file_path = '../../data/raw_data/cali_hiv_aids/persons-living-with-hiv-aids.xlsx'
df = pd.read_excel(file_path)

# Filter rows where Category contains "Transmission Category:"
transmission_data = df[df['Category'].str.contains('Transmission Category:', na=False)]

# Function to extract population type from Category
def extract_population(category):
    if 'Male Adult' in category:
        return 'Male'
    elif 'Female Adult' in category:
        return 'Female'
    elif 'Child' in category:
        return 'Child'
    else:
        return 'Unknown'

# Add a Population column
transmission_data['Population'] = transmission_data['Category'].apply(extract_population)

# Make sure Year is integer type
transmission_data['Year'] = transmission_data['Year'].astype(int)

# Make sure Count is numeric
transmission_data['Count'] = pd.to_numeric(transmission_data['Count'], errors='coerce')

# Group by Year, Population, and Group to get totals
grouped_data = transmission_data.groupby(['Year', 'Population', 'Group'])['Count'].sum().reset_index()

# Create a pivot table to better visualize the data
pivot_table = grouped_data.pivot_table(
    index=['Year', 'Population'],
    columns='Group',
    values='Count',
    fill_value=0
).reset_index()

# Optional: Calculate percentage by year and population
# Get total by year and population
totals = grouped_data.groupby(['Year', 'Population'])['Count'].sum().reset_index()
totals.columns = ['Year', 'Population', 'Total']

# Merge with grouped data
grouped_data = pd.merge(grouped_data, totals, on=['Year', 'Population'])
grouped_data['Percentage'] = (grouped_data['Count'] / grouped_data['Total']) * 100

# Print some statistics
print("Data summary:")
print(f"Years range: {grouped_data['Year'].min()} to {grouped_data['Year'].max()}")
print(f"Total number of records: {len(grouped_data)}")
print("\nTotal cases by population:")
population_totals = grouped_data.groupby('Population')['Count'].sum()
for pop, count in population_totals.items():
    print(f"{pop}: {count:,}")

print("\nTop transmission categories overall:")
top_groups = grouped_data.groupby('Group')['Count'].sum().sort_values(ascending=False)
for group, count in top_groups.items():
    print(f"{group}: {count:,}")

# Calculate non-MSM percentage
msm_related = ['Male-to-male sexual contact (MMSC)', 'MMSC and IDU']
msm_total = grouped_data[grouped_data['Group'].isin(msm_related)]['Count'].sum()
total = grouped_data['Count'].sum()
non_msm_total = total - msm_total

print("\nMSM vs Non-MSM breakdown:")
print(f"MSM-related cases: {msm_total:,} ({msm_total/total*100:.1f}%)")
print(f"Non-MSM cases: {non_msm_total:,} ({non_msm_total/total*100:.1f}%)")

# Save the processed data to a CSV file for easy import
grouped_data.to_csv('../../data/clean_data/clean_cali_hiv_data.csv', index=False)
print("\nData saved to clean_cali_hiv_data.csv")

# Optional: Create a JSON file for direct use in visualization
json_data = grouped_data.to_dict(orient='records')
with open('../../data/clean_data/hiv_data.json', 'w') as f:
    json.dump(json_data, f)
print("Data also saved to hiv_data.json")

# Analyze trends over time
print("\nTrends in top transmission categories over time:")
for group in top_groups.head(5).index:
    group_trends = grouped_data[grouped_data['Group'] == group].groupby('Year')['Count'].sum()
    first_year = group_trends.index.min()
    last_year = group_trends.index.max()
    change = ((group_trends[last_year] / group_trends[first_year]) - 1) * 100
    print(f"{group}: {change:.1f}% change from {first_year} to {last_year}")

# For female-specific analysis
female_data = grouped_data[grouped_data['Population'] == 'Female']
print("\nTop transmission categories for females:")
female_top = female_data.groupby('Group')['Count'].sum().sort_values(ascending=False)
for group, count in female_top.items():
    print(f"{group}: {count:,}")

# For heterosexual contact analysis (both high-risk and non-high-risk)
hetero_groups = ['High-risk heterosexual contact (HRH)', 'Heterosexual contact (Non-HRH)']
hetero_data = grouped_data[grouped_data['Group'].isin(hetero_groups)]
hetero_total = hetero_data['Count'].sum()
print(f"\nTotal heterosexual contact cases: {hetero_total:,} ({hetero_total/total*100:.1f}%)")

# Percentage of all cases that are NOT male-to-male sexual contact
print("\nThis analysis highlights that HIV/AIDS affects diverse populations, not just men who have sex with men.")

Data summary:
Years range: 2011 to 2017
Total number of records: 119

Total cases by population:
Child: 1,004
Female: 104,161
Male: 782,826

Top transmission categories overall:
Male-to-male sexual contact (MMSC): 598,399
High-risk heterosexual contact (HRH)**: 79,087
MMSC and IDU: 67,603
Injection drug use (IDU): 57,245
Heterosexual contact (Non-HRH)***: 45,782
Unknown risk: 31,917
Perinatal: 5,089
Other****: 2,717
Other***: 152

MSM vs Non-MSM breakdown:
MSM-related cases: 666,002 (75.0%)
Non-MSM cases: 221,989 (25.0%)

Data saved to clean_cali_hiv_data.csv
Data also saved to hiv_data.json

Trends in top transmission categories over time:
Male-to-male sexual contact (MMSC): 15.6% change from 2011 to 2017
High-risk heterosexual contact (HRH)**: 8.9% change from 2011 to 2017
MMSC and IDU: -7.3% change from 2011 to 2017
Injection drug use (IDU): -5.6% change from 2011 to 2017
Heterosexual contact (Non-HRH)***: 47.6% change from 2011 to 2017

Top transmission categories for females:
High

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transmission_data['Population'] = transmission_data['Category'].apply(extract_population)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transmission_data['Year'] = transmission_data['Year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transmission_data['Count'] = pd.to_numeric(transm