In [None]:
!pip install pandas==2.0.3
import pandas as pd

# Download our datasets and load them to dataframes
!wget -O connecticut_offense_type_by_agency_2019.xls "https://ucr.fbi.gov/nibrs/2019/tables/state-excels/connecticut_offense_type_by_agency_2019.xls/output.xls"
CT_Crime_df = pd.read_excel('connecticut_offense_type_by_agency_2019.xls', skiprows=4, nrows=87)

# Flat, human-readable column names for the crime dataframe. They are applied
# positionally below, so they must be listed in the same left-to-right order as
# the columns that were read from the spreadsheet.
crime_columns = [
    "Agency Type", "Agency Name", "Population", "Total Offenses",
    "Crimes Against Persons", "Crimes Against Property", "Crimes Against Society",
    "Assault Offenses", "Aggravated Assault", "Simple Assault", "Intimidation",
    "Homicide Offenses", "Murder and Nonnegligent Manslaughter", "Negligent Manslaughter",
    "Justifiable Homicide", "Human Trafficking Offenses", "Commercial Sex Acts",
    "Involuntary Servitude", "Kidnapping/Abduction", "Sex Offenses", "Rape",
    "Sodomy", "Sexual Assault With an Object", "Fondling", "Incest", "Statutory Rape",
    "Arson", "Bribery", "Burglary/Breaking & Entering", "Counterfeiting/Forgery",
    "Destruction/Damage/Vandalism of Property", "Embezzlement", "Extortion/Blackmail",
    "Fraud Offenses", "False Pretenses/Swindle/Confidence Game",
    "Credit Card/Automated Teller Machine Fraud", "Impersonation", "Welfare Fraud",
    "Wire Fraud", "Identity Theft", "Hacking/Computer Invasion", "Larceny/Theft Offenses",
    "Pocket-picking", "Purse-snatching", "Shoplifting", "Theft From Building",
    "Theft From Coin Operated Machine or Device", "Theft From Motor Vehicle",
    "Theft of Motor Vehicle Parts or Accessories", "All Other Larceny", "Motor Vehicle Theft",
    "Robbery", "Stolen Property Offenses", "Animal Cruelty", "Drug/Narcotic Offenses",
    "Drug/Narcotic Violations", "Drug Equipment Violations", "Gambling Offenses",
    "Betting/Wagering", "Operating/Promoting/Assisting Gambling",
    "Gambling Equipment Violations", "Sports Tampering", "Pornography/Obscene Material",
    "Prostitution Offenses", "Prostitution", "Assisting or Promoting Prostitution",
    "Purchasing Prostitution", "Weapon Law Violations"
]

# Replace the parsed header labels with the names above. pandas raises a
# ValueError if len(crime_columns) differs from the number of columns read.
CT_Crime_df.columns = crime_columns

!wget -O FoodAccessResearchAtlasData2019.xlsx "https://www.ers.usda.gov/webdocs/DataFiles/80591/FoodAccessResearchAtlasData2019.xlsx?v=8276.1"
# Read a contiguous slice of the atlas sheet. skiprows uses 0-indexed file lines:
# line 0 (the header) is kept and lines 1..12817 are skipped, so the first data
# row read is spreadsheet row 12819.
# NOTE(review): presumably these are the Connecticut tracts - confirm the row
# bounds against the workbook, since they are hard-coded.
CT_Food_df = pd.read_excel('FoodAccessResearchAtlasData2019.xlsx',
                                     sheet_name='Food Access Research Atlas',
                                     skiprows=range(1, 12818),  # Keep the header; skip 0-indexed lines 1-12817
                                     nrows=828)                 # Read the next 828 data rows
# Same technique for a second slice: lines 1..61236 are skipped, so the six rows
# read are spreadsheet rows 61238-61243.
Austin_County_Food_df = pd.read_excel('FoodAccessResearchAtlasData2019.xlsx',
                                     sheet_name='Food Access Research Atlas',
                                     skiprows=range(1, 61237),  # Keep the header; skip 0-indexed lines 1-61236
                                     nrows=6)                 # Read the next 6 data rows

!wget -O tract2town-2020.xlsx "https://github.com/CT-Data-Collaborative/ct-census-tract-to-town/blob/master/2020/tract2town-2020.xlsx?raw=true"
CT_Town_df = pd.read_excel('tract2town-2020.xlsx')

# Snapshot every raw dataframe into a single workbook, one sheet per dataset.
# The context manager saves and closes the file when the block exits.
with pd.ExcelWriter('raw_output.xlsx') as writer:
    CT_Crime_df.to_excel(writer, sheet_name='CT_Crime')
    CT_Food_df.to_excel(writer, sheet_name='CT_Food')
    CT_Town_df.to_excel(writer, sheet_name='CT_Town')
    Austin_County_Food_df.to_excel(writer, sheet_name='Austin_County_Food')


Collecting pandas==2.0.3
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.0.3 which is incompatible.
mizani 0.13.0 requires pandas>=2.2.0, but you have pandas 2.0.3 which is incompatible.
plotnine 0.14.1 requires pandas>=2.2.0, but you have pandas 2.0.3 which is incompatible.
xarray 2024.10.

# Data Cleanup / Sanitization
- CT Town / Fips Conversion sheet
 - Add missing Willimantic and Groton Town data based on externally researched data
 -

- CT Food Data
 - Add values for blanks (mostly 0's from data analysis)
 - Remove columns where all values are blank
 - Remove columns where there is population % next to raw data
 - Add Town column based on FIPS code
    - Reorder town to be 3rd column

 - Aggregate Tract by tract data into Town by Town Data

- CT Crime Data
  - Rename Columns to sensible names
  - Drop the agency type column
 - Make a version that is normalized via the population
    - Divide each number of crimes by the town population to get per capita crimes

In [None]:
import pandas as pd

### CT Town / FIPS conversion sheet cleanup ###

# Column names of the tract-to-town sheet (kept for reference).
Town_columns = ['tract_fips', 'tract_name', 'town', 'county', 'town_fips']

# Tract FIPS codes whose town label is overridden below.
willimantic_fips = ['9015800300', '9015800600', '9015800400', '9015800700']
grottontown_fips = ['9011702100', '9011702800', '9011702900', '9011703000', '9011702300', '9011702600', '9011702700', '9011980000', '9011870200']
# These tracts are only displayed in the before/after reports; they are not relabelled.
grottonpropper_fips = ['9011702400', '9011702500']

# Compare FIPS codes as strings so they match the literals above.
CT_Town_df['tract_fips'] = CT_Town_df['tract_fips'].astype(str)

# Every tract shown in the before/after verification printouts.
tracts_of_interest = willimantic_fips + grottontown_fips + grottonpropper_fips

print("Before updates:")
print(CT_Town_df[CT_Town_df['tract_fips'].isin(tracts_of_interest)])

# Relabel the listed tracts, one town at a time.
for town_label, tract_codes in (('Groton Town', grottontown_fips),
                                ('Willimantic', willimantic_fips)):
    CT_Town_df.loc[CT_Town_df['tract_fips'].isin(tract_codes), 'town'] = town_label

print("After updates:")
print(CT_Town_df[CT_Town_df['tract_fips'].isin(tracts_of_interest)])

# Persist the corrected lookup table.
CT_Town_df.to_excel('townrevised.xlsx', index=False)


In [None]:
### Data Cleanup Functions ###
## Functions:
def fill_blanks_with_zero(df):
  """Return a new DataFrame in which every missing value of ``df`` is replaced by 0."""
  return df.fillna(0)

def remove_all_zero_columns(df):
  """Drop every column whose values are all zero (or blank).

  Args:
    df: DataFrame to filter; it is not modified.

  Returns:
    A new DataFrame holding only the columns that contain at least one value
    that is neither 0 nor missing. Column order and dtypes are unchanged.
    A frame without rows or columns is returned as an unchanged copy.
  """
  # With no rows every column is vacuously "all zero"; keep the schema intact.
  if df.empty:
    return df.copy()

  # Test each column over *all* of its rows. Working column-wise (instead of
  # transposing the frame) keeps every column's dtype, and comparing values
  # (instead of summing) cannot be fooled by entries that cancel out.
  # Non-numeric values simply compare unequal to 0, so text columns are kept.
  all_zero = (df.eq(0) | df.isna()).all(axis=0)

  return df.loc[:, ~all_zero].copy()

def add_town_by_fips(Food_df, Town_df):
    """Attach a 'town' column to the food data by matching census-tract FIPS codes.

    Args:
        Food_df: DataFrame with at least 'CensusTract' and 'State' columns.
        Town_df: DataFrame with at least 'tract_fips' and 'town' columns.

    Returns:
        A new DataFrame containing the rows of ``Food_df`` whose tract has a town
        in ``Town_df``. 'CensusTract' is returned as a string, and 'town' is
        inserted at the position 'State' occupied (immediately before 'State').
        Neither input frame is modified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Cast both join keys to str on copies, so the callers' frames keep their dtypes.
    food = Food_df.assign(CensusTract=Food_df['CensusTract'].astype(str))
    town_lookup = Town_df[['tract_fips', 'town']].assign(
        tract_fips=Town_df['tract_fips'].astype(str)
    )

    # Left join keeps every food row; tracts without a match get a missing town.
    merged = food.merge(town_lookup, left_on='CensusTract', right_on='tract_fips', how='left')

    # Drop the duplicate key column, then the rows that found no town.
    merged = merged.drop(columns=['tract_fips']).dropna(subset=['town'])

    # Move 'town' from the end of the frame to where 'State' currently sits.
    state_index = merged.columns.get_loc('State')
    merged.insert(state_index, 'town', merged.pop('town'))

    return merged

def remove_population_share_columns(df):
  """Return ``df`` without any column whose name contains the substring "share"."""
  return df.drop(columns=[name for name in df.columns if 'share' in name])

def aggregate_town_rows(df):
  """Collapse tract-level rows into one row per town.

  'State' and 'County' take their most frequent value within the town, the
  columns listed under 'mean' are averaged, and the columns listed under 'sum'
  are totalled. Rows with a missing 'town' are excluded.

  Args:
    df: DataFrame with a 'town' column plus every column named in the plan below.

  Returns:
    A DataFrame with one row per town. Columns follow the plan order, with
    'town' placed third (after 'State' and 'County').
  """
  def most_frequent(values):
    # First mode of the group, or None when the group has no mode at all.
    modes = values.mode()
    return None if modes.empty else modes[0]

  # (aggregation, columns) groups, in the exact order the output columns appear.
  plan = [
      (most_frequent, ['State', 'County']),
      ('mean', ['Urban']),
      ('sum', ['Pop2010', 'OHU2010']),
      ('mean', ['GroupQuartersFlag']),
      ('sum', ['NUMGQTRS']),
      ('mean', [
          'PCTGQTRS',
          'LILATracts_1And10', 'LILATracts_halfAnd10', 'LILATracts_1And20',
          'LILATracts_Vehicle', 'HUNVFlag', 'LowIncomeTracts', 'PovertyRate',
          'MedianFamilyIncome', 'LA1and10', 'LAhalfand10', 'LA1and20',
          'LATracts_half', 'LATracts1', 'LATractsVehicle_20',
      ]),
      ('sum', [
          'LAPOP1_10', 'LAPOP05_10', 'LAPOP1_20',
          'LALOWI1_10', 'LALOWI05_10', 'LALOWI1_20',
          'lapophalf', 'lalowihalf', 'lakidshalf', 'laseniorshalf',
          'lawhitehalf', 'lablackhalf', 'laasianhalf', 'lanhopihalf',
          'laomultirhalf', 'lahisphalf', 'lahunvhalf', 'lasnaphalf',
          'lapop1', 'lalowi1', 'lakids1', 'laseniors1',
          'lawhite1', 'lablack1', 'laasian1', 'lanhopi1', 'laaian1',
          'laomultir1', 'lahisp1', 'lahunv1', 'lasnap1',
          'TractLOWI', 'TractKids', 'TractSeniors',
          'TractWhite', 'TractBlack', 'TractAsian', 'TractNHOPI', 'TractAIAN',
          'TractOMultir', 'TractHispanic', 'TractHUNV', 'TractSNAP',
      ]),
  ]
  how_by_column = {column: how for how, columns in plan for column in columns}

  by_town = df.groupby('town', as_index=False, dropna=True).agg(how_by_column)

  # 'town' comes out first; move it to the third position.
  names = by_town.columns.tolist()
  reordered = names[1:3] + names[:1] + names[3:]

  return by_town.reindex(reordered, axis=1)

In [None]:
 ### CT Food Data Cleanup ###

import pandas as pd

def clean_CT_Food(df, town_df):
  """Clean the tract-level food data and aggregate it per town.

  Runs fill_blanks_with_zero, remove_all_zero_columns,
  remove_population_share_columns and add_town_by_fips on a copy of ``df``,
  printing the frame's shape after each step, then aggregates the result with
  aggregate_town_rows.

  Side effects: prints progress to stdout and writes 'foodrevised.xlsx' and
  'foodaggregated.xlsx' to the working directory.

  Args:
    df: raw food-access DataFrame.
    town_df: tract-to-town lookup DataFrame, passed on to add_town_by_fips.

  Returns:
    Tuple ``(revised, aggregated)``: the cleaned tract-level frame and the
    town-level aggregate.
  """
  # Show the input before anything is changed.
  print("Original DataFrame:")
  print(df)

  print("Original DataFrame shape:", df.shape)

  # Work on a copy; every step below hands back a new frame.
  revised = fill_blanks_with_zero(df.copy())
  print("Shape after fill_blanks_with_zero:", revised.shape)

  revised = remove_all_zero_columns(revised)
  print("Shape after remove_all_zero_columns:", revised.shape)

  revised = remove_population_share_columns(revised)
  print("Shape after remove_population_share_columns:", revised.shape)

  # Rows whose tract has no town are dropped by this step, so the row count can shrink.
  revised = add_town_by_fips(revised, town_df)
  print("Shape after add_town_by_fips:", revised.shape)  # Check shape here!
  print("Dropped Unincorperated tracts / Tracts without Towns")

  print(revised.head())

  revised.to_excel('foodrevised.xlsx', index=False)

  aggregated = aggregate_town_rows(revised)
  aggregated.to_excel('foodaggregated.xlsx', index=False)

  return revised, aggregated

# Run the food-data cleanup on the raw frames; the call also writes
# foodrevised.xlsx and foodaggregated.xlsx.
CT_Food_revised, CT_Food_Aggregated = clean_CT_Food(CT_Food_df, CT_Town_df)

In [None]:
### CT Crime Data Cleanup

def clean_CT_Crime(df):
    """Return a new DataFrame equal to ``df`` minus its 'Agency Type' column."""
    return df.drop(columns='Agency Type')

def normalize_crime_data(df):
  """Convert raw crime counts into per-capita rates.

  Every column except 'Agency Name' and 'Population' is divided, row by row,
  by that row's 'Population'.

  Args:
    df: DataFrame with a 'Population' column and numeric count columns;
      it is not modified.

  Returns:
    A copy of ``df`` whose count columns hold count / population. Rows with a
    zero, missing or non-numeric population get NaN rates rather than inf.
    'Agency Name' and 'Population' are returned unchanged.

  Raises:
    KeyError: if ``df`` has no 'Population' column.
  """
  # Create a copy of the DataFrame to avoid modifying the original
  normalized_df = df.copy()

  # Select the columns by name rather than by position, so the result does
  # not depend on where the identifier columns sit in the frame.
  identifier_columns = ('Agency Name', 'Population')
  columns_to_normalize = [col for col in df.columns if col not in identifier_columns]
  if not columns_to_normalize:
    return normalized_df

  # A per-capita rate is undefined for an unknown or zero population; turn
  # those denominators into NaN so the division yields NaN rather than inf.
  population = pd.to_numeric(normalized_df['Population'], errors='coerce')
  population = population.where(population != 0)

  normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].div(population, axis=0)

  return normalized_df

# Build the cleaned crime table and its per-capita counterpart, saving each
# to its own workbook (without the index column).
CT_Crime_revised = clean_CT_Crime(CT_Crime_df)
CT_Crime_revised.to_excel('crimerevised.xlsx', index=False)
CT_Crime_normalized = normalize_crime_data(CT_Crime_revised)
CT_Crime_normalized.to_excel('crimenormalized.xlsx', index=False)

# Data Enrichment (Combine the data)
- Combine CT Crime Revised + CT Food Aggregated
    - CT Crime data covers fewer towns than CT Food Aggregated; drop towns in the aggregated data that we don't have crime statistics for
    - Combine the rows of data where agency name in crime_revised = town in ct_food_aggregated
- Stretch CT_Crime_normalized over CT_Food_Revised

In [None]:
## Combine CT Crime Data and Aggregated
def combine_crime_food_data(crime_df, food_df):
  """Join town-level food data with crime data on the town / agency name.

  Args:
    crime_df: DataFrame with an 'Agency Name' column.
    food_df: DataFrame with a 'town' column.

  Returns:
    A DataFrame with one row per match between ``food_df['town']`` and
    ``crime_df['Agency Name']``. The food columns come first in their original
    order, followed by the crime columns; 'Agency Name' itself is dropped.
  """
  # Keep only the towns for which crime statistics exist.
  has_crime_stats = food_df['town'].isin(crime_df['Agency Name'].unique())
  matched_food = food_df[has_crime_stats]

  # Inner join on the town name, then discard the redundant key column.
  joined = matched_food.merge(crime_df, how='inner', left_on='town', right_on='Agency Name')
  joined = joined.drop(columns=['Agency Name'])

  # Food columns on the left, everything contributed by the crime data on the right.
  food_side = matched_food.columns.tolist()
  crime_side = [name for name in joined.columns if name not in food_side]

  return joined[food_side + crime_side]

# Join the raw-count crime table with the town-level food aggregate and save it.
combined_crime_food_df = combine_crime_food_data(CT_Crime_revised, CT_Food_Aggregated)
combined_crime_food_df.to_excel('combined.xlsx', index=False)



# Exploratory Data Analysis


## Data Transformation
- SVD to reduce the dimensionality of the data
- t-SNE to explore data relationships

## Prediction / Machine Learning
- Use the Food Data to predict Total Crime numbers
- Use the Food Data to predict likelihoods of specific types of crimes

## Plotting Data / Visualization
- Crime Plot
- Relationship diagram

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Assuming combined_crime_food_df is already defined from the previous code

# 1. Correlation Matrix Heatmap:
# Visualize the pairwise correlation between the selected food-access and crime columns.
selected_columns = ['LowIncomeTracts', 'PovertyRate', 'MedianFamilyIncome', # Food security financial indicators
                    'Urban', 'OHU2010', 'PCTGQTRS', 'lapophalf', 'lapop1', 'lahunv1', # Food security locational indicators
                    'TractLOWI', 'TractKids', 'TractSeniors', 'TractWhite', 'TractBlack', 'TractAsian', 'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic', 'TractHUNV', 'TractSNAP', # Food security demographic indicators
                    'Total Offenses', 'Crimes Against Persons', 'Crimes Against Property', 'Crimes Against Society', 'Assault Offenses', 'Homicide Offenses', 'Human Trafficking Offenses', 'Sex Offenses', 'Fraud Offenses', 'Robbery', 'Drug/Narcotic Offenses', 'Drug/Narcotic Violations', 'Gambling Offenses', 'Prostitution Offenses', 'Weapon Law Violations']  # Crime statistics (add more as needed)

# Restrict the combined town-level table to the columns of interest
subset_df = combined_crime_food_df[selected_columns]

# Pairwise correlation between every pair of selected columns
correlation_matrix = subset_df.corr()

# Boolean mask that is True on and above the diagonal, so the heatmap
# draws only the lower triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Large canvas so the per-cell annotations stay legible
plt.figure(figsize=(20, 16))  # Adjust the dimensions as needed

# Annotated heatmap; cells selected by the mask are left blank
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f",
            annot_kws={"size": 10},
            linewidths=.5,
            mask=mask)


plt.title('Correlation Matrix of Food Security and Crime Statistics', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)  # Rotate x-axis labels so they do not collide
plt.yticks(fontsize=12)
plt.tight_layout()  # Adjust layout to prevent overlapping labels
plt.show()

# Apply t-SNE to the correlation matrix. Each embedded point is one variable
# (one row of correlation_matrix), not a town.
# NOTE(review): a NaN anywhere in correlation_matrix would presumably make
# the fit fail - confirm the selected columns are all non-constant.
tsne = TSNE(n_components=2, random_state=42)  # 2-D embedding with a fixed seed
tsne_results = tsne.fit_transform(correlation_matrix)

# Create a DataFrame for the t-SNE results, indexed by variable name
tsne_df = pd.DataFrame(data=tsne_results, index=correlation_matrix.index, columns=['TSNE1', 'TSNE2'])

# Visualize the t-SNE results
plt.figure(figsize=(20, 16))
for index, row in tsne_df.iterrows():
    # One scatter call per variable, each annotated with the variable's name
    plt.scatter(row['TSNE1'], row['TSNE2'])
    plt.annotate(index, (row['TSNE1'], row['TSNE2']), fontsize=12)

plt.title('t-SNE Visualization of Food Security and Crime Statistics (Labels on Points)')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.show()

# Standardize the data (important for SVD): zero mean, unit variance per column
scaler = StandardScaler()
scaled_data = scaler.fit_transform(subset_df)

# Project the standardized table onto its first two SVD components
svd = TruncatedSVD(n_components=2, random_state=42)  # You can adjust n_components and random_state
svd_results = svd.fit_transform(scaled_data)

# Index the projection by town name (subset_df has the same rows, in the same
# order, as combined_crime_food_df), so plot labels and any later listing of
# svd_df.index identify towns rather than bare row numbers.
svd_df = pd.DataFrame(
    data=svd_results,
    index=pd.Index(combined_crime_food_df['town'].to_numpy(), name='town'),
    columns=['SVD1', 'SVD2'],
)

plt.figure(figsize=(20, 16))
sns.scatterplot(x='SVD1', y='SVD2', data=svd_df)

# Label every point with its town. Iterating over the rows avoids integer
# lookups on a string index.
for town_name, svd1, svd2 in svd_df[['SVD1', 'SVD2']].itertuples(index=True, name=None):
    plt.annotate(town_name, (svd1, svd2), fontsize=8)

plt.title('SVD Visualization of Food Security and Crime Statistics')
plt.show()

# Share of the total variance captured by the two retained components
print(svd.explained_variance_ratio_.sum())



In [None]:
# Clustering with KMeans on the 2-D SVD projection

# Elbow method: within-cluster sum of squares (KMeans inertia) for k = 1..10
svd_points = svd_df[['SVD1', 'SVD2']]
k_values = range(1, 11)
wcss = [
    KMeans(n_clusters=k, random_state=42).fit(svd_points).inertia_
    for k in k_values
]

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()

# k is chosen by visually inspecting the elbow curve above
optimal_k = 3

# Fit the final model and store each row's cluster label
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
svd_df['cluster'] = kmeans.fit_predict(svd_points)

# Visualize clusters with different colors
plt.figure(figsize=(8, 6))
sns.scatterplot(x='SVD1', y='SVD2', data=svd_df, hue='cluster', palette='viridis')
plt.title('SVD Visualization with KMeans Clustering')
plt.show()

# List the members (index labels of svd_df) of every cluster
for cluster in svd_df['cluster'].unique():
    members = svd_df.index[svd_df['cluster'] == cluster]
    print(f"Cluster {cluster}:")
    print(members.tolist())
    print("\n")

# Silhouette score of the final clustering on the same two components
score = silhouette_score(svd_points, svd_df['cluster'])
print(f'Silhouette Score: {score}')

# Create a DataFrame for the t-SNE results. The embedding was fitted on
# correlation_matrix, so every row is one *variable* (indexed by its name).
tsne_df = pd.DataFrame(data=tsne_results, index=correlation_matrix.index, columns=['TSNE1', 'TSNE2'])

# Elbow Method for Optimal k (using t-SNE data)
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(tsne_df[['TSNE1', 'TSNE2']])  # Fit to t-SNE data
    wcss.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method for Optimal k (t-SNE Data)')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.show()

# k is chosen by visually inspecting the elbow curve above
optimal_k = 4

# Apply KMeans (using t-SNE data)
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
tsne_df['cluster'] = kmeans.fit_predict(tsne_df[['TSNE1', 'TSNE2']])

# Analyze cluster characteristics. These clusters group variables, not towns:
# tsne_df is indexed by variable name, so the labels cannot be looked up by
# town name (every lookup would miss and yield NaN). They are therefore
# reported per variable and not written onto combined_crime_food_df.
for cluster in sorted(tsne_df['cluster'].unique()):
    print(f"Cluster {cluster}:")
    print(tsne_df.index[tsne_df['cluster'] == cluster].tolist())
    print("\n")

# Visualize the t-SNE results with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x='TSNE1', y='TSNE2', hue='cluster', data=tsne_df, palette='viridis')
plt.title('t-SNE Visualization with KMeans Clustering')
plt.show()

# Calculate Silhouette Score (using t-SNE data)
score = silhouette_score(tsne_df[['TSNE1', 'TSNE2']], tsne_df['cluster'])
print(f'Silhouette Score (t-SNE Data): {score}')


In [None]:
# 2. Scatter Plots:
# Explore the relationship between specific food security indicators and crime rates.
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

# Each entry is ((x_column, y_column), explanation): the two columns of
# combined_crime_food_df to plot against each other, and the hypothesis text
# drawn inside that subplot.
scatter_pairs = [
    (('PovertyRate', 'Total Offenses'), "Higher poverty rates are often associated with increased crime rates due to socioeconomic factors."),
    (('MedianFamilyIncome', 'Crimes Against Property'), "Areas with lower median family incomes might experience higher rates of property crimes due to economic disparities."),
    (('Urban', 'Drug/Narcotic Offenses'), "Urban areas may have higher drug-related offenses due to factors like population density and greater access to drug markets."),
    (('TractHUNV', 'Motor Vehicle Theft'), "Limited access to reliable vehicles might be correlated with the likelihood of motor vehicle theft."),
    (('TractSNAP', 'Larceny/Theft Offenses'), "Households facing food insecurity (indicated by SNAP benefits) might be more prone to larceny/theft offenses."),
    (('TractLOWI', 'Motor Vehicle Theft'), "Low income areas may have higher incidence of vehicle theft"),
    (('lakidshalf', 'Crimes Against Property'), "Children with poor access to food (Kids population count beyond 1/2 mile from supermarket) and other resources may be more likely to commit crimes against property"),
    (('lakidshalf', 'Destruction/Damage/Vandalism of Property'), "Children with poor access to food (Kids population count beyond 1/2 mile from supermarket) and other resources may be more likely to vandalize property or be unsupervised"),
    (('PovertyRate', 'Shoplifting'), "As the poverty rate in an area increases, it may lead to higher rates of shoplifting")
]

# Create a figure with one subplot row per scatter pair
fig, axes = plt.subplots(nrows=len(scatter_pairs), ncols=1, figsize=(8, 8 * len(scatter_pairs)))
# plt.subplots returns a bare Axes (not an array) when there is a single pair;
# normalise so that axes[i] below works for any number of pairs.
axes = np.atleast_1d(axes)

# Create a custom colormap with a green-to-red gradient
cmap_urban_rural_gradient = mcolors.LinearSegmentedColormap.from_list(
    "urban_rural_gradient", ["lightgreen", "red"]  # Green for rural, red for urban
)

# Draw every pair: points coloured by 'Urban', a linear trendline, and the explanation text
for i, ((x_col, y_col), explanation) in enumerate(scatter_pairs):
    ax = axes[i]  # Get the current subplot

    scatter = ax.scatter(
        combined_crime_food_df[x_col],
        combined_crime_food_df[y_col],
        c=combined_crime_food_df["Urban"],
        cmap=cmap_urban_rural_gradient,  # Use custom gradient colormap
        label="Urban (Color)",
    )

    # Fit the trendline only on rows where both values are real numbers:
    # a NaN or non-numeric entry would otherwise give an unusable fit.
    pair = (
        combined_crime_food_df[[x_col, y_col]]
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
    )
    x = pair[x_col].to_numpy(dtype=float)
    y = pair[y_col].to_numpy(dtype=float)

    # A straight line needs at least two points with different x values.
    if len(pair) >= 2 and np.unique(x).size > 1:
        try:
            # Get coefficients of the degree-1 least-squares fit
            coefficients = np.polyfit(x, y, 1)
            # Create a function representing the trendline
            polynomial = np.poly1d(coefficients)
            # Evaluate the line across the observed x range
            x_trendline = np.linspace(x.min(), x.max(), 100)
            y_trendline = polynomial(x_trendline)
            # Plot the trendline
            ax.plot(x_trendline, y_trendline, color='red', label='Trendline')
        except (np.linalg.LinAlgError, ValueError) as e:
            # Report a failed fit and keep drawing the remaining subplots
            print(f"Error calculating or plotting trendline for {x_col} vs {y_col}: {e}")
    else:
        print(f"Skipping trendline for {x_col} vs {y_col}: not enough valid data points.")

    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(f'Relationship between {x_col} and {y_col}')
    ax.text(0.05, 0.95, explanation, transform=ax.transAxes, va='top', fontsize=10, wrap=True)  # Add explanation text

    # Add colorbar with custom label
    fig.colorbar(scatter, ax=ax, label="Urban (Green: Rural, Red: Urban)")

# Adjust layout to prevent overlapping
plt.tight_layout()
plt.show()

In [None]:
# 3. Pairplot:
# Visualize pairwise relationships between a hand-picked set of variables.
numeric_cols = ['LowIncomeTracts', 'PovertyRate', 'Total Offenses',
                 'MedianFamilyIncome', 'Larceny/Theft Offenses', 'Assault Offenses',
                 'Homicide Offenses', 'Drug/Narcotic Offenses',
                 'TractKids', 'Urban', 'TractHUNV', 'TractSNAP']
# Coerce the selected columns to numeric in place; unparseable values become NaN.
# Note that this modifies combined_crime_food_df itself.
combined_crime_food_df[numeric_cols] = combined_crime_food_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# kind='reg' adds a regression fit to every off-diagonal panel
sns.pairplot(combined_crime_food_df[numeric_cols], kind='reg')
plt.show()

# Bin 'Urban' into three labelled categories: (-0.1, 0.3] -> Rural,
# (0.3, 0.7] -> Mixed, (0.7, 1.1] -> Urban. The new 'Urban_Category' column
# is added to combined_crime_food_df.
combined_crime_food_df['Urban_Category'] = pd.cut(combined_crime_food_df['Urban'], bins=[-0.1, 0.3, 0.7, 1.1],
                                                  labels=['Rural', 'Mixed', 'Urban'])

plt.figure(figsize=(8, 6))
sns.scatterplot(x='LowIncomeTracts', y='Total Offenses', data=combined_crime_food_df, hue='Urban_Category')
plt.xlabel('Low Income Tracts')
plt.ylabel('Total Offenses')
plt.title('Relationship between Low Income Tracts and Total Offenses by Urban Category')
plt.show()

plt.figure(figsize=(8, 6))
# With data=..., the strings passed as x, y and c are looked up as column names;
# 'Urban' drives the point colour through the colormap.
scatter = plt.scatter(x='Population', y='Total Offenses', c='Urban', data=combined_crime_food_df, cmap='coolwarm')
plt.xlabel('Population')
plt.ylabel('Total Offenses')
plt.title('Relationship between Population and Total Offenses by Percentage Urban')
# Add the colorbar, referencing the scatter plot
plt.colorbar(scatter, label='Percentage Urban')
plt.show()

In [None]:
# 4. Histogram:
# Distribution of the 'Total Offenses' column of combined_crime_food_df, in 20 bins.
plt.figure(figsize=(8, 6))
plt.hist(combined_crime_food_df['Total Offenses'], bins=20)
plt.xlabel('Total Offenses')
plt.ylabel('Frequency')
plt.title('Distribution of Total Offenses')
plt.show()

In [None]:
# 5. Boxplots:
# Compare the distribution of 'Total Offenses' across the 'Urban_Category' bins
# (the categorical column must already exist on combined_crime_food_df).

# Calculate 100 evenly spaced quantiles of 'Total Offenses' for each 'Urban_Category'
urban_offenses_dist = combined_crime_food_df.groupby('Urban_Category')['Total Offenses'].quantile(np.linspace(0, 1, 100)).reset_index()

# Rename columns for clarity
urban_offenses_dist.columns = ['Urban_Category', 'Quantile', 'Total_Offenses']

# Quantile curves, one line per category
plt.figure(figsize=(8, 6))
sns.lineplot(x='Total_Offenses', y='Quantile', hue='Urban_Category', data=urban_offenses_dist)
plt.xlabel('Total Offenses')
plt.ylabel('Quantile')
plt.title('Distribution of Total Offenses across Urban/Rural Areas')
plt.tight_layout()
plt.show()

# Letter-value (boxen) plot per category
plt.figure(figsize=(8, 6))
sns.boxenplot(x='Urban_Category', y='Total Offenses', data=combined_crime_food_df, showfliers=False)
plt.xlabel('Urban/Rural Category')
plt.ylabel('Total Offenses')
plt.title('Distribution of Total Offenses across Urban/Rural Areas')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels if needed
plt.tight_layout()
plt.show()

# Jittered strip plot: one figure only, so no empty canvas is rendered.
plt.figure(figsize=(8, 6))

# Integer code of each row's category (-1 marks a row without a category)
x_values = combined_crime_food_df['Urban_Category'].cat.codes.astype(float)

# Small horizontal jitter so overlapping points stay visible; the generator
# is seeded so the figure is identical on every run.
jitter_rng = np.random.default_rng(42)
x_jitter_values = jitter_rng.uniform(-0.1, 0.1, size=x_values.shape)

# Apply jitter
jittered_x_values = x_values + x_jitter_values

sns.scatterplot(x=jittered_x_values,  # Use jittered x-values
                y='Total Offenses',
                data=combined_crime_food_df,
                alpha=0.5,
                hue='Urban_Category',  # Use hue for color mapping
                legend='full')  # Show the legend

# One tick per declared category, placed at that category's code. Taking the
# ticks from the category list (not from the codes present in the data) keeps
# ticks and labels the same length even when a category has no rows or some
# rows have no category.
urban_categories = combined_crime_food_df['Urban_Category'].cat.categories
plt.xticks(ticks=np.arange(len(urban_categories)), labels=urban_categories, rotation=45, ha='right')

plt.xlabel('Urban/Rural Category')
plt.ylabel('Total Offenses')
plt.title('Distribution of Total Offenses across Urban/Rural Areas')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

# Keep only rows with at least one non-NaN value
combined_crime_food_df = combined_crime_food_df.dropna(how='all')
# Features: every column from "Urban" through "TractSNAP" (label-based, inclusive slice)
X = combined_crime_food_df.loc[:, 'Urban':'TractSNAP']

# Convert all columns to numeric, coercing errors to NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Ensure only numerical data is included in the features
X = X.select_dtypes(include=np.number)

# Print the selected feature DataFrame
print("Features DataFrame (X):")
print(X.head())  # Display the first few rows of the DataFrame
print(X.dtypes)  # Confirm the data types of each column

# Targets: every numeric column from "Total Offenses" onwards. The start is
# located by name rather than by a fixed position, so "Total Offenses" itself
# is always included however many feature columns precede it. Helper columns
# added during the exploratory analysis are not crime statistics and are
# excluded by name.
first_target_position = combined_crime_food_df.columns.get_loc('Total Offenses')
non_target_columns = {'cluster', 'Urban_Category'}
target_columns = [
    col for col in combined_crime_food_df.columns[first_target_position:]
    if col not in non_target_columns
    and pd.api.types.is_numeric_dtype(combined_crime_food_df[col])
]

# Empty results table; one row is appended per (model, target variable) evaluation
results_matrix = pd.DataFrame(columns=['Model', 'Target Variable', 'R-squared', 'RMSE', 'Cross-Validation R-squared'])

# Function to train and evaluate models
def evaluate_models(X, y, target_name):
    """Fit six regressors for one target and append their scores to ``results_matrix``.

    For each model (linear regression, random forest, decision tree, RBF SVR,
    XGBoost, KNN) the function fits on an 80/20 train split, scores the held-out
    20% (R-squared and RMSE), runs 5-fold cross-validated R-squared, appends one
    row to the module-level ``results_matrix`` and prints that row.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, row-aligned with ``y``.
    y : pandas.Series
        Target values; rows where ``y`` is NaN are excluded from every step.
    target_name : str
        Label stored in the 'Target Variable' column and used in messages.

    Returns
    -------
    None
        Results are written to the global ``results_matrix``.
    """
    global results_matrix

    # One boolean mask applied to both sides keeps X and y row-aligned
    has_target = y.notna()
    X_filtered = X.loc[has_target]
    y_filtered = y.loc[has_target]

    if X_filtered.empty or y_filtered.empty:
        print(f"Skipping {target_name} due to insufficient data after removing NaN values.")
        return  # Skip this target variable if data is insufficient

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_filtered, test_size=0.2, random_state=42
    )

    # (label, estimator) pairs, evaluated in this order
    models = [
        ('Linear Regression', LinearRegression()),
        ('Random Forest Regression', RandomForestRegressor(random_state=42)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=42)),
        ('Support Vector Regression', SVR(kernel='rbf')),
        ('XGBoost Regression', XGBRegressor(random_state=42)),
        ('KNN Regression', KNeighborsRegressor(n_neighbors=5)),
    ]

    for model_name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Cross-validate on the NaN-filtered rows, not the raw X/y: the raw target
        # may contain NaN, which the estimators cannot be fitted or scored on.
        cv_scores = cross_val_score(model, X_filtered, y_filtered, cv=5, scoring='r2')

        # RMSE via an explicit square root: the `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

        results_matrix.loc[len(results_matrix)] = [
            model_name,
            target_name,
            r2_score(y_test, y_pred),
            rmse,
            cv_scores.mean()
        ]
        print(f"Results after {model_name} for {target_name}:")
        print(results_matrix.tail(1))  # Print the latest row of results

# Run the full model comparison once per entry in target_columns; each call
# appends its rows to the global results_matrix
for target_name in target_columns:
    y = combined_crime_food_df[target_name]
    evaluate_models(X, y, target_name)

# Display the final results matrix (one row per model/target evaluation)
print("\nFinal Results Matrix:")
print(results_matrix)


In [None]:
def visualize_model_performance(results_matrix):
    """Plot the best test-set R-squared obtained for each target variable.

    For every 'Target Variable' the row with the highest 'R-squared' is kept,
    and the result is shown three ways: a heatmap (model x target), a grouped
    bar chart and a strip plot.

    Parameters
    ----------
    results_matrix : pandas.DataFrame
        Must contain the columns 'Model', 'Target Variable' and 'R-squared'.
        The frame itself is not modified.

    Returns
    -------
    None
        The figures are shown with ``plt.show()``.
    """
    # The score column may not have a numeric dtype (e.g. when the frame was
    # grown row by row from an empty one), and may contain NaN. Coerce and drop
    # so that idxmax and the heatmap operate on plain floats.
    scores = results_matrix.copy()
    scores['R-squared'] = pd.to_numeric(scores['R-squared'], errors='coerce')
    scores = scores.dropna(subset=['R-squared'])

    if scores.empty:
        print("No valid R-squared values to visualize.")
        return

    # Find the best model for each target variable based on R-squared
    best_models = scores.loc[
        scores.groupby('Target Variable')['R-squared'].idxmax()
    ]

    # Reshape the data for visualization: rows are models, columns are targets;
    # a cell is filled only where that model is the best one for that target
    model_r2_pivot = best_models.pivot(
        index='Model', columns='Target Variable', values='R-squared'
    )

    # Create a custom colormap (red to green) for R-squared values
    r2_colormap = mcolors.LinearSegmentedColormap.from_list(
        "rg", ["red", "orange", "yellow", "lightgreen"], N=256
    )

    # Create the heatmap using seaborn with text offsets
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        model_r2_pivot,
        annot=True,
        cmap=r2_colormap,
        fmt=".2f",
        annot_kws={'size': 10}  # Adjust annotation font size
    )

    # Nudge every annotation slightly along y
    for t in ax.texts:
        t.set_y(t.get_position()[1] + 0.01)  # Adjust offset as needed

    plt.title('Maximum R-squared for Each Target Variable by Model')
    plt.xlabel('Target Variable')
    plt.ylabel('Model')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Back to long format (one row per model/target cell) for the bar and strip plots
    melted_df = model_r2_pivot.reset_index().melt(
        id_vars=['Model'],
        value_vars=model_r2_pivot.columns,
        var_name='Target Variable',
        value_name='R-squared'
    )

    # Create the grouped bar chart (using melted_df)
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Target Variable', y='R-squared', hue='Model', data=melted_df)
    plt.title('Model Performance (R-squared) for Different Crime Types')
    plt.xlabel('Crime Type')
    plt.ylabel('R-squared')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Create the dot plot (stripplot) (using melted_df)
    plt.figure(figsize=(12, 8))
    sns.stripplot(x='Target Variable', y='R-squared', hue='Model', data=melted_df, jitter=True, dodge=True)
    plt.title('Model Performance (R-squared) for Different Crime Types')
    plt.xlabel('Crime Type')
    plt.ylabel('R-squared')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


# Render the heatmap, grouped bar chart and strip plot for the collected results
visualize_model_performance(results_matrix)

In [None]:
def visualize_model_performance_2(results_matrix):
    """Line chart of R-squared per crime type, one line per model, zoomed to [-2, 1].

    Rows whose 'R-squared' lies outside [-2, 1] (or is not a number) are left
    out. Crime types are placed on the x-axis in the order in which they first
    appear among the remaining rows.

    Parameters
    ----------
    results_matrix : pandas.DataFrame
        Must contain the columns 'R-squared', 'Target Variable' and 'Model'.
        The frame itself is not modified.

    Raises
    ------
    ValueError
        If any of the required columns is missing.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    # Check if required columns are present in results_matrix
    required_columns = {'R-squared', 'Target Variable', 'Model'}
    if not required_columns.issubset(results_matrix.columns):
        raise ValueError("results_matrix must contain 'R-squared', 'Target Variable', and 'Model' columns.")

    # Coerce scores to numbers, then keep only values in the range -2 to 1
    # (between() is inclusive on both ends; NaN never matches)
    scores = results_matrix.copy()
    scores['R-squared'] = pd.to_numeric(scores['R-squared'], errors='coerce')
    filtered_results = scores[scores['R-squared'].between(-2, 1)].copy()

    if filtered_results.empty:
        print("No R-squared values in the range [-2, 1] to plot.")
        return

    # Give every crime type a fixed integer x position. Plotting the labels
    # themselves would let matplotlib order the axis by first appearance across
    # plot() calls, so a model that lacks some crime types would scramble the
    # axis and make later lines zig-zag.
    crime_type_order = list(filtered_results['Target Variable'].unique())
    x_position = {name: pos for pos, name in enumerate(crime_type_order)}
    filtered_results['x_pos'] = filtered_results['Target Variable'].map(x_position)

    # Sort left-to-right so each model's line is drawn in axis order
    filtered_results = filtered_results.sort_values(by='x_pos', kind='stable')

    # Start plotting
    plt.figure(figsize=(12, 8))

    # One line per model, connecting that model's available points in axis order
    for model in filtered_results['Model'].unique():
        model_data = filtered_results[filtered_results['Model'] == model]
        plt.plot(
            model_data['x_pos'],
            model_data['R-squared'],
            label=model,
            marker='o',
            linestyle='-',
            alpha=0.7  # Optional transparency for clarity
        )

    plt.title('Model Performance (R-squared) for Different Crime Types (Zoomed)')
    plt.xlabel('Crime Type')
    plt.ylabel('R-squared')
    plt.ylim([-2, 1])  # Set Y-axis limits
    plt.xticks(
        ticks=list(range(len(crime_type_order))),
        labels=crime_type_order,
        rotation=45,
        ha='right',
        fontsize=10
    )
    plt.legend(title='Model', loc='upper right')  # Adjust legend position for visibility
    plt.tight_layout()
    plt.show()

# Draw the per-model R-squared line chart, limited to the [-2, 1] range
visualize_model_performance_2(results_matrix)

From the data above, we can see a few important things. Firstly, note that Linear Regression and Decision Tree Regression (both simpler models) tended to perform better on crimes for which there was less data. This makes sense, because many of those targets will be 0 or a constant value in every row, making the function to predict extremely simple. This leads to high accuracy for these types of models, but may indicate that this data is not very useful without more crime data for less common crimes, since things like computer hacking, sports tampering, and betting/wagering are uncommon offenses. KNN regression appears to be the highest performer.

In [None]:
def filter_matching_columns(df1, df2):
    """Return the columns of ``df1`` that also exist in ``df2``, with NaN replaced by 0.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to take columns and values from; it is not modified.
    df2 : pandas.DataFrame
        Frame whose column labels decide which columns of ``df1`` are kept.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the shared columns, every missing cell set to 0.
    """
    shared_labels = df1.columns.intersection(df2.columns)
    return df1.loc[:, shared_labels].fillna(0)

Pick the best-performing model and train it again on the dataset.

In [None]:
# CT data: persist the combined crime/food frame in both formats
combined_crime_food_df.to_excel('CT_combined_crime_food_df.xlsx', index=False)
combined_crime_food_df.to_csv('CT_combined_crime_food_df.csv', index=False)

# Austin data: keep only the columns shared with the CT frame (NaN -> 0), then persist
filtered_austin_df = filter_matching_columns(Austin_County_Food_df, combined_crime_food_df)
filtered_austin_df.to_excel('filtered_austin_food_df.xlsx', index=False)
filtered_austin_df.to_csv('filtered_austin_food_df.csv', index=False)

# Selecting X and y from combined_crime_food_df
X_combined = combined_crime_food_df.loc[:, "Urban":"TractSNAP"]
y_combined = combined_crime_food_df["Total Offenses"]

# Select the Austin features by the CT feature names instead of repeating the
# "Urban":"TractSNAP" label slice: filtered_austin_df keeps the Austin frame's own
# column order, so the same slice is not guaranteed to yield the same columns in
# the same order. A missing feature now fails here with a KeyError naming it.
X_austin = filtered_austin_df.loc[:, X_combined.columns]

print(X_combined.shape, y_combined.shape, X_austin.shape)

# Missing-value counts for the features and the target
print(X_combined.isnull().sum())
print(y_combined.isnull().sum())
print(X_austin.isnull().sum())


In [None]:
# Hold out 20% of the CT rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Build a 4-neighbour KNN regressor and fit it on the training rows
knn_model = KNeighborsRegressor(n_neighbors=4)  # You can tune this hyperparameter
knn_model.fit(X_train, y_train)

# Score the model on the held-out rows
y_pred_test = knn_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

# Apply the fitted model to the Austin features and store the result on the frame
y_pred_austin = knn_model.predict(X_austin)
filtered_austin_df['Predicted_Total_Crime'] = y_pred_austin

# Rebuild the column order with 'Predicted_Total_Crime' in the third position
cols = [name for name in filtered_austin_df.columns if name != 'Predicted_Total_Crime']
cols[2:2] = ['Predicted_Total_Crime']
filtered_austin_df = filtered_austin_df[cols]

# Output the DataFrame to an Excel spreadsheet
filtered_austin_df.to_excel('austin_with_predictions.xlsx', index=False)

# Actual vs. predicted on the CT hold-out rows; the dashed red diagonal marks
# where predicted equals actual
diagonal = [min(y_test), max(y_test)]
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test, alpha=0.5)
ax.plot(diagonal, diagonal, color='red', linestyle='--')
ax.set_xlabel('Actual Crime Rate')
ax.set_ylabel('Predicted Crime Rate')
ax.set_title('Actual vs. Predicted Crime Rates (CT Data)')
plt.show()

# Raw predictions for Austin
print(y_pred_austin)

In [None]:
import matplotlib.pyplot as plt

def plot_predicted_crime_rates(df, predicted_rates, relevant_columns):
    """Scatter predicted values against selected columns, with a linear trend line.

    One subplot is drawn per entry of ``relevant_columns``: the column's values
    on the x-axis, ``predicted_rates`` on the y-axis, plus a red line from a
    ``LinearRegression`` fitted on that single column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``relevant_columns``.
    predicted_rates : array-like
        One prediction per row of ``df``; 1-D or ``(n, 1)`` shaped input is
        accepted and flattened.
    relevant_columns : sequence of str
        Column names of ``df`` to plot against.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in ``df``.
    """
    relevant_columns = list(relevant_columns)
    if not relevant_columns:
        # plt.subplots(nrows=0) would raise; there is simply nothing to draw
        print("No columns supplied; nothing to plot.")
        return

    # Flatten so 1-D and (n, 1) prediction arrays are handled the same way
    y = np.ravel(predicted_rates)
    if len(y) != len(df):
        raise ValueError(
            f"predicted_rates has {len(y)} values but df has {len(df)} rows."
        )

    # Create subplots for each relevant column; squeeze=False always returns a
    # 2-D axes array, so a single plot needs no special case
    num_plots = len(relevant_columns)
    fig, axes = plt.subplots(
        nrows=num_plots, ncols=1, figsize=(8, 6 * num_plots), squeeze=False
    )

    # Iterate through relevant columns and create plots
    for ax, column in zip(axes[:, 0], relevant_columns):
        # Scatter plot
        ax.scatter(df[column], y, alpha=0.5, label='Data')

        # Fit a regression line on this single feature (needs an (n, 1) input)
        x = df[column].values.reshape(-1, 1)
        model = LinearRegression()
        model.fit(x, y)
        y_pred = model.predict(x)

        # Plot the regression line
        ax.plot(df[column], y_pred, color='red', label='Prediction Line')

        # Set labels and title
        ax.set_xlabel(column)
        ax.set_ylabel('Predicted Crime Rate')
        ax.set_title(f'Predicted Crime Rates for Austin vs. {column}')
        ax.legend()

    # Adjust layout and display plots
    plt.tight_layout()
    plt.show()

# Example usage: plot the KNN predictions for Austin against a selection of
# feature columns
relevant_cols = ['Pop2010', 'Urban', 'TractLOWI', 'TractHUNV', 'TractKids', 'PovertyRate', 'MedianFamilyIncome', 'TractSNAP', 'lapophalf', 'lalowihalf', 'lakidshalf']  # Example relevant columns
plot_predicted_crime_rates(filtered_austin_df, y_pred_austin, relevant_cols)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Input
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Ensure data is in NumPy array format with correct dtype.
# NOTE: this rebinds X_combined, y_combined and X_austin (set in an earlier cell)
# to float32 arrays, so anything executed after this point sees arrays under
# these names.
X_combined = np.array(X_combined, dtype=np.float32)
y_combined = np.array(y_combined, dtype=np.float32)
X_austin = np.array(X_austin, dtype=np.float32)

# Split the CT data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Create a neural network model
def create_model(input_dim, learning_rate=0.001, num_hidden_layers=2, neurons_per_layer=64):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    num_hidden_layers : int
        How many ReLU ``Dense`` layers to stack.
    neurons_per_layer : int
        Units in each hidden layer.

    Returns
    -------
    Sequential
        Model compiled with MSE loss and an MAE metric.
    """
    hidden_layers = [
        Dense(neurons_per_layer, activation='relu')
        for _ in range(num_hidden_layers)
    ]
    network = Sequential(
        [Input(shape=(input_dim,))]
        + hidden_layers
        + [Dense(1)]  # Single output for regression
    )
    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return network

# Define hyperparameters
input_dim = X_combined.shape[1]
learning_rate = 0.001
num_hidden_layers = 3
neurons_per_layer = 64
batch_size = 32
epochs = 100

# Seed Python, NumPy and the Keras backend before the model is built, so the
# weight initialisation and batch shuffling repeat from run to run
tf.keras.utils.set_random_seed(42)

# Create and train the model.
# The held-out rows are passed as validation_data only to monitor each epoch;
# they are not used to update the weights.
model = create_model(input_dim, learning_rate, num_hidden_layers, neurons_per_layer)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    batch_size=batch_size, epochs=epochs, verbose=1)

# Evaluate the model on the held-out rows
y_pred_test_nn = model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_test_nn)
r2_nn = r2_score(y_test, y_pred_test_nn)
print("Neural Network Mean Squared Error:", mse_nn)
print("Neural Network R-squared Score:", r2_nn)

# Predict crime rates for Austin
y_pred_austin_nn = model.predict(X_austin)

# Visualize the actual vs. predicted crime rates for CT; the dashed red
# diagonal marks where predicted equals actual
plt.scatter(y_test, y_pred_test_nn, alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Crime Rate')
plt.ylabel('Predicted Crime Rate')
plt.title('Actual vs. Predicted Crime Rates (CT Data) - Neural Network')
plt.show()

# Analyze the predicted crime rates for Austin
print(y_pred_austin_nn)

# Plot the network's Austin predictions against the same feature columns
relevant_cols = ['Pop2010', 'Urban', 'TractLOWI', 'TractHUNV', 'TractKids', 'PovertyRate', 'MedianFamilyIncome', 'TractSNAP', 'lapophalf', 'lalowihalf', 'lakidshalf']  # Example relevant columns
plot_predicted_crime_rates(filtered_austin_df, y_pred_austin_nn, relevant_cols)