# Crime Data Analysis

- Your analysis here
  
---

In [None]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import hvplot.pandas

# Source CSV exports: one covering 2010-2019, the other 2020 onward
data_2010_2019 = Path("Crime_Data_from_2010_to_2019 (1).csv")
data_2020_2023 = Path("Crime_Data_from_2020_to_Present_20231016.csv")

# Load each export into its own DataFrame
pre_covid_data, post_covid_data = (
    pd.read_csv(csv_path) for csv_path in (data_2010_2019, data_2020_2023)
)

# Stack the two periods row-wise and renumber the rows 0..n-1
Data_complete = pd.concat(
    [pre_covid_data, post_covid_data],
    ignore_index=True,
)
Data_complete.head()

## Data Cleaning


In [None]:
# Parse 'Date Rptd' as datetimes, then store the calendar year of each
# record in a new 'Crime Year' column.
# NOTE(review): 'Date Rptd' is presumably the report date rather than the
# occurrence date — confirm that is the intended basis for the year.
reported_dates = pd.to_datetime(Data_complete['Date Rptd'])
Data_complete['Crime Year'] = reported_dates.dt.year

In [None]:
# Remove unused columns, selecting what to keep by NAME.
# Dropping by position depends on the exact column order of the combined
# CSVs and removes different columns if this cell is run a second time.
# The names below are the ones the rest of the notebook renames and uses.
columns_to_keep = [
    'DR_NO',
    'AREA NAME',
    'Crm Cd',
    'Crm Cd Desc',
    'Vict Age',
    'Vict Sex',
    'Vict Descent',
    'Premis Desc',
    'LAT',
    'LON',
    'Crime Year',
]

# Fail loudly if the input files do not carry an expected column, instead
# of silently continuing with the wrong set of columns.
missing_columns = [name for name in columns_to_keep if name not in Data_complete.columns]
if missing_columns:
    raise KeyError(f"Expected columns not found in Data_complete: {missing_columns}")

# Drop everything else in place; the kept columns retain their original order.
unused_columns = Data_complete.columns.difference(columns_to_keep)
Data_complete.drop(columns=unused_columns, inplace=True)

In [None]:
# Cleaning pipeline, applied in this order:
#   1. drop every row that has a missing value in any column
#   2. keep only the first row for each 'DR_NO' (duplicate crime records)
#   3. keep only rows whose 'Vict Age' is greater than zero
#      (unrecorded victim ages)
Data_complete = (
    Data_complete
    .dropna()
    .drop_duplicates(subset=['DR_NO'])
    .loc[lambda frame: frame['Vict Age'] > 0]
)

In [None]:
# Readable labels for the columns that survive cleaning
column_labels = {
    'DR_NO': 'DR Number',
    'Crime Year': 'Crime Year',
    'AREA NAME': 'Area Name',
    'Crm Cd': 'Crime Code',
    'Crm Cd Desc': 'Type of Crime',
    'Vict Age': 'Victim Age',
    'Vict Sex': 'Victim Gender',
    'Vict Descent': 'Victim Ethnicity',
    'Premis Desc': 'Scene of Crime',
    'LAT': 'Latitude',
    'LON': 'Longitude',
}
Data_complete.rename(columns=column_labels, inplace=True)

# Keep only rows whose coordinates fall strictly inside the bounding box
# -128 < Longitude < -108 and 24 < Latitude < 44.
longitude = Data_complete['Longitude']
latitude = Data_complete['Latitude']
inside_box = longitude.gt(-128) & longitude.lt(-108) & latitude.gt(24) & latitude.lt(44)
Data_complete = Data_complete[inside_box]

Data_complete.head()

## Data Analysis

In [None]:
# Number of rows left in the cleaned dataset
count = Data_complete["DR Number"].size
print(count)

In [None]:
# How many records there are for each victim age, most frequent first
unique_demo = Data_complete["Victim Age"].value_counts(sort=True, ascending=False)
print(unique_demo)

In [None]:
# How many records there are for each victim ethnicity code, most frequent first
unique_demo = Data_complete["Victim Ethnicity"].value_counts(sort=True, ascending=False)
print(unique_demo)

In [None]:
# How many records there are for each area name, most frequent first
unique_demo = Data_complete["Area Name"].value_counts(sort=True, ascending=False)
print(unique_demo)

## Yearly Summary

In [None]:
# Total number of crime records per year
yearly_total = (
    Data_complete.groupby("Crime Year")["DR Number"]
    .count()
    .reset_index()
    .rename(columns={"DR Number": "Total Crimes"})
)

# Year-over-year change in the yearly total, as a percentage
yearly_total['Percentage Change'] = yearly_total['Total Crimes'].pct_change() * 100

# The first year has no previous year to compare against; show a dash there.
# The column is cast to object first so it can hold the string next to the
# floats, and the value is written with .loc on the frame itself: the
# chained form `df[col].iloc[0] = ...` assigns into an intermediate object
# and is not guaranteed to update the DataFrame.
yearly_total['Percentage Change'] = yearly_total['Percentage Change'].astype(object)
if not yearly_total.empty:
    yearly_total.loc[yearly_total.index[0], 'Percentage Change'] = '-'

yearly_total.set_index('Crime Year', inplace=True)
yearly_total

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the yearly table
yearly_sum_stats = yearly_total.describe(percentiles=[0.25, 0.5, 0.75])
yearly_sum_stats

In [None]:
# IPython help lookup for the DataFrame plot accessor (notebook-only `?`
# syntax; this line is not valid in a plain Python script).
# NOTE(review): yearly_total_plot is first assigned in the cell below, so the
# lookup only resolves once that cell has been run.
yearly_total_plot.plot?

In [None]:
# Bar plot of the total crime count for every year in the dataset,
# with 2020 and later drawn in red and earlier years in blue.
yearly_total_plot = yearly_total
mean_total = yearly_sum_stats.loc['mean', 'Total Crimes']
covid_color = ["r" if year >= 2020 else "b" for year in yearly_total_plot.index]
yearly_total_plot.plot.bar(y="Total Crimes", color=covid_color)

# Horizontal reference line at the mean yearly total. The label is an
# f-string so the legend shows the actual number rather than the literal
# text "{mean_total}".
plt.axhline(y=mean_total, color='black', linestyle='--', label=f'Mean: {mean_total:,.0f}')
# Anchor the caption to the computed mean instead of a hard-coded height,
# so it stays on the line if the data changes.
plt.text(9.5, mean_total, "Mean", va="bottom")
# Rebuild the legend so the mean line's label is actually displayed.
plt.legend()
plt.xlabel("")
plt.xticks(rotation=45)
plt.ylabel("Total Crimes by Year")
plt.title("Totals Crimes in Los Angeles (2010-2023)")

# savefig does not create missing directories; make sure the target exists.
Path("output_data").mkdir(parents=True, exist_ok=True)
plt.savefig("output_data/TotalCrimeYTY.png")
plt.show()

In [None]:
# Line graph of the year-to-year percentage change in total crimes.
# The first row is skipped because it has no previous year to compare with.
yearly_total_index_reset = yearly_total.reset_index()
percent_plot = yearly_total_index_reset.iloc[1:]

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(
    percent_plot["Crime Year"],
    percent_plot["Percentage Change"],
    marker='o',
    linestyle='-',
)
ax.set_title("Percentage Change in Total Crimes (2010-2023)")
ax.set_xlabel("Year")
ax.set_ylabel("Percentage Change")
# One tick per plotted year, labels tilted for readability
ax.set_xticks(percent_plot["Crime Year"])
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True)

# Display the figure
plt.show()

In [None]:
# Linear regression of the yearly crime total against the year.
# yearly_total is indexed by 'Crime Year'; reset the index so the year is an
# ordinary column. (yearly_total_reg was previously used without ever being
# defined, which raised NameError.)
yearly_total_reg = yearly_total.reset_index()

slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    yearly_total_reg["Crime Year"], yearly_total_reg["Total Crimes"]
)
# Fitted values on the regression line, one per year
regress = yearly_total_reg["Crime Year"] * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x +" + str(round(intercept, 2))

plt.scatter(yearly_total_reg["Crime Year"], yearly_total_reg["Total Crimes"])
plt.plot(yearly_total_reg["Crime Year"], regress, "r-")
plt.xticks(yearly_total_reg["Crime Year"], rotation=45)
plt.ylabel("Total Crimes")
plt.title("Linear Regression Plot of Total Crime in Los Angeles (2010-2023)")
# Place the equation at the earliest year, just below the largest total
plt.annotate(
    line_eq,
    (min(yearly_total_reg["Crime Year"]), max(yearly_total_reg["Total Crimes"]) - 1),
    fontsize=12,
    color="red",
)
# The printed quantity is rvalue squared, so label it as r-squared
print(f"The r-squared is: {rvalue ** 2}")
print(f"The p-value is: {pvalue}")
plt.show()

In [None]:
# Compare yearly total crimes before (2010-2019) and from (2020-2023) COVID
# using an independent two-sample t-test that does not assume equal
# variances (equal_var=False).
# NOTE(review): the 2020-onward file name ends in 20231016, which suggests
# 2023 is a partial year — confirm before interpreting the result.
crime_years = yearly_total_index_reset['Crime Year']
pre_covid_slice = yearly_total_index_reset[crime_years.between(2010, 2019)]
post_covid_slice = yearly_total_index_reset[crime_years.between(2020, 2023)]

# Mean yearly total for each period
pre_covid_mean = pre_covid_slice['Total Crimes'].mean()
post_covid_mean = post_covid_slice['Total Crimes'].mean()
print(pre_covid_mean, post_covid_mean, sep="\n")

# Test statistic and p-value (shown as the cell output)
stats.ttest_ind(
    pre_covid_slice['Total Crimes'],
    post_covid_slice['Total Crimes'],
    equal_var=False,
)

In [None]:
# Linear regression of the yearly crime total against the year.
# NOTE(review): this cell repeats the earlier "Linear Regression plot" cell;
# consider keeping only one of them.
yearly_total_index_reset = yearly_total.reset_index()
# The regression previously read from yearly_total_reg, a name that was never
# assigned (NameError); bind it to the reset-index frame built above.
yearly_total_reg = yearly_total_index_reset

slope, intercept, rvalue, pvalue, stderr = stats.linregress(
    yearly_total_reg["Crime Year"], yearly_total_reg["Total Crimes"]
)
# Fitted values on the regression line, one per year
regress = yearly_total_reg["Crime Year"] * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x +" + str(round(intercept, 2))

plt.scatter(yearly_total_reg["Crime Year"], yearly_total_reg["Total Crimes"])
plt.plot(yearly_total_reg["Crime Year"], regress, "r-")
plt.xticks(yearly_total_reg["Crime Year"], rotation=45)
plt.ylabel("Total Crimes")
plt.title("Linear Regression Plot of Total Crime in Los Angeles (2010-2023)")
# Place the equation at the earliest year, just below the largest total
plt.annotate(
    line_eq,
    (min(yearly_total_index_reset["Crime Year"]), max(yearly_total_index_reset["Total Crimes"]) - 1),
    fontsize=12,
    color="red",
)
# The printed quantity is rvalue squared, so label it as r-squared
print(f"The r-squared is: {rvalue ** 2}")
print(f"The p-value is: {pvalue}")
plt.show()

In [None]:
def most_frequent_value(column):
    """Return the value that occurs most often in ``column``.

    Args:
        column: a pandas Series (here: one year's values of one column).

    Returns:
        The label with the largest count in ``column.value_counts()``.

    Raises:
        ValueError: if ``column`` has no non-missing values to count.
    """
    return column.value_counts().idxmax()


# Columns summarised by their most common value within each year
mode_columns = [
    'Area Name',
    'Crime Code',
    'Type of Crime',
    'Victim Age',
    'Victim Gender',
    'Victim Ethnicity',
    'Scene of Crime',
]

# Group the data by 'Crime Year' and apply most_frequent_value to each column.
# The helper has its own name so that storing the result in
# `highest_occurance` no longer overwrites the function; previously the
# second run of this cell failed because the name held a DataFrame.
highest_occurance = (
    Data_complete.groupby('Crime Year')
    .agg(dict.fromkeys(mode_columns, most_frequent_value))
    .reset_index()
)

In [None]:
# Yearly summary: the per-year totals joined with the per-year most common
# values on 'Crime Year', with the year used as the row index.
yearly_summary_df = (
    yearly_total
    .merge(highest_occurance, on='Crime Year')
    .set_index('Crime Year')
)
yearly_summary_df

## Crimes Summary

In [None]:
# Total number of crime records in the cleaned dataset
Total_crime_count = Data_complete['DR Number'].size
Total_crime_count

In [None]:
# Calculate the total number of Crimes 
##Total_crime_count = Data_complete.len['Type of Crime']


# unique kind of the crimes:
#Kind_of_crimes = clean_crime_data.groupby(["Crm Cd Desc","crime_year"], as_index = false).count()

# Pick which highest frequency (5)
#highest_frequency_crime = Kind_of_crimes.sort_values(ascending= False)

# Inside the home vs. outside the home (during COVID, people were at home)



In [None]:
# Plot a line graph showing the overall crime trend over the years.
# (The cell previously held only `plot.line`, which refers to an undefined
# name and draws nothing.)
crimes_per_year = Data_complete.groupby("Crime Year")["DR Number"].count()
crimes_per_year.plot.line(marker="o")
plt.title("Total Crimes in Los Angeles by Year")
plt.xlabel("Year")
plt.ylabel("Total Crimes")
# One tick per year so every data point is labelled
plt.xticks(crimes_per_year.index, rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Calculate the kind of crimes (e.g different kind of crimes) per year??
#the distribution of crime types over the years
#clean_crime_data.loc(2017)
#df_2017= 
#df_2018
#df_2019
#df_2020
#df_2021
#df_2022
#crime_type



In [None]:
# Create a dataframe with one row per (year, crime type) pair and the number
# of records for that pair. (The cell previously stopped at an unclosed
# `pd.DataFrame({`, which is a syntax error.)
crime_summary = (
    Data_complete.groupby(["Crime Year", "Type of Crime"])
    .size()
    .reset_index(name="Total Crimes")
)
crime_summary.head()

In [None]:
# Plot a bar plot with multiple columns over the different years for total number of crimes(value count)


In [None]:
# Plot another histogram for average of pre covid and post covid crimes (Total and one for each crime).

## Area Summary

In [None]:
#  select all of the different Areas


In [None]:
# Divide areas into Central, Valley, South, West

In [None]:
# Calculate the total crimes per area per year: one row per area, one column
# per year, 0 where an area has no records in a year. (The assignment
# previously had no right-hand side, which is a syntax error.)
per_area_crime_counts = (
    Data_complete.groupby(["Area Name", "Crime Year"])
    .size()
    .unstack(fill_value=0)
)
per_area_crime_counts

In [None]:
#calculate crime types per area
# Five highest crimes 

In [None]:
# Geoplot the area 

In [None]:
# Make a data frame with Columns for Average crime per area


# Display Data Frame

In [None]:
# Highest Crime Area(By Total Crimes)

In [None]:
# Lowest Crime Area (By Total Crime)

In [None]:
#Bar chart four areas, four years and total number of crimes

In [None]:
# Identify Hot Spots
#Crime in each area across the years

In [None]:
# Any change in the hot spots over the years


### Female Vs Male Victims

In [None]:
# Generate a pie plot showing the distribution of victims by recorded gender
# using Pandas. The counts come from the cleaned dataset's 'Victim Gender'
# column; the cell previously read an undefined `data` frame, a non-existent
# "Sex" column, and plotted an undefined `Female_male_data`.
# NOTE(review): any gender codes other than female/male present in the data
# also get a slice — filter them out first if only F vs. M is wanted.
Female_male_data = Data_complete["Victim Gender"].value_counts()
Female_male_data.plot.pie(autopct="%1.1f%%")
plt.title("Female vs. Male Victims")
plt.show()

### Victim Race 

# Child Abuse compare over the years

## Change in crime spot

In [None]:
#Splitting race by groups: Black, White, Hispanics, Asians

In [None]:
#Splitting premise into 4 categories: Commercial, residential, industrial and outdoors
# using Bins

In [None]:
# Percentage of increase and decrease in crimes over the years:
# count the records per year, then take the change from one year to the next.
# (The cell previously held the pseudo-code line
# "Total_crime_count groupbyyear (count)", which is not valid Python.)
crimes_by_year = Data_complete.groupby("Crime Year")["DR Number"].count()
yearly_percent_change = crimes_by_year.pct_change() * 100
yearly_percent_change

In [None]:
 #How has crime changed over the years?
 
 # Line graphs