# CRIME DETECTION SYSTEM

## Programming for AI TABA

In [None]:
import os
from getpass import getpass

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
from folium.plugins import HeatMap
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud

### DB Connection

In [None]:
# Connection settings are read from the environment so that no secret is stored
# in the notebook. Host, user and schema fall back to the local-development values.
host = os.environ.get("CRIME_DB_HOST", "localhost")
user = os.environ.get("CRIME_DB_USER", "root")
# Prompt interactively only when CRIME_DB_PASSWORD is unset or empty.
password = os.environ.get("CRIME_DB_PASSWORD") or getpass("MySQL password: ")
database = os.environ.get("CRIME_DB_NAME", "crime_detection")

# `database=` is the documented keyword; `db=` is only kept by PyMySQL as a deprecated alias.
connection = pymysql.connect(host=host, user=user, password=password, database=database)

### Load Data

In [None]:
# Load the crime records from the local CSV export.
csv_path = 'Crime_Data_from_2010_to_2019.csv'
crime_data = pd.read_csv(csv_path)

# Alternative source, kept for reference: read the `raw_data` table through
# `connection` and close the connection afterwards.
#sql_query = "SELECT * FROM raw_data"
#crime_data = pd.read_sql(sql_query, connection)
#connection.close()

In [41]:
# Preview the first five rows of the loaded data
crime_data.head()

Unnamed: 0,dr_no,time_occ_code,AREA,area_name,rpt_dist_no,part,crm_code,crm_desc,vict_age,vict_sex,...,location_name,cross_street,lat_coor,lon_coor,date_rptd,time_rptd,date_occ,time_occ,date,month_occ
0,1307355,1350,13,Newton,1385,2,900,VIOLATION OF COURT ORDER,48,M,...,300 E GAGE AV,,33.9825,-118.2695,02/20/2010,12:00:00,2010-02-20,12:00:00,2010,2
1,11401303,45,14,Pacific,1485,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",38,M,...,SEPULVEDA BL,MANCHESTER AV,33.9599,-118.3962,09/13/2010,12:00:00,2010-09-12,12:00:00,2010,9
2,70309629,1515,13,Newton,1324,2,946,OTHER MISCELLANEOUS CRIME,38,M,...,1300 E 21ST ST,,34.0224,-118.2524,08/09/2010,12:00:00,2010-08-09,12:00:00,2010,8
3,90631215,150,6,Hollywood,646,2,900,VIOLATION OF COURT ORDER,47,F,...,CAHUENGA BL,HOLLYWOOD BL,34.1016,-118.3295,01/05/2010,12:00:00,2010-01-05,12:00:00,2010,1
4,100100501,2100,1,Central,176,1,122,"RAPE, ATTEMPTED",47,F,...,8TH ST,SAN PEDRO ST,34.0387,-118.2488,01/03/2010,12:00:00,2010-01-02,12:00:00,2010,1


In [None]:
# Preview the last five rows of the loaded data
crime_data.tail()

In [None]:
# List every column name in the loaded frame
print(crime_data.columns)

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage
crime_data.info()

The columns of the dataset are:
1. DR_NO: Report number for the incident (int64)
2. Date Rptd: Date the incident was reported (object, might need to be converted to a datetime format)
3. DATE OCC: Date of occurrence of the incident (object, might need to be converted to a datetime format)
4. TIME OCC: Time of occurrence of the incident (int64)
5. AREA: Area code related to the incident (int64)
6. AREA NAME: Name of the area related to the incident (object)
7. Rpt Dist No: Report district number (int64)
8. Part 1-2: Classification of the incident (int64)
9. Crm Cd: Crime code (int64)
10. Crm Cd Desc: Description of the crime code (object)
11. Mocodes: Modus operandi codes (object)
12. Vict Age: Age of the victim (int64)
13. Vict Sex: Sex of the victim (object)
14. Vict Descent: Descent of the victim (object)
15. Premis Cd: Premises code (float64)
16. Premis Desc: Description of the premises (object)
17. Weapon Used Cd: Weapon code used (float64)
18. Weapon Desc: Description of the weapon used (object)
19. Status: Status of the incident (object)
20. Status Desc: Description of the status (object)
21. Crm Cd 1-4: Additional crime codes (float64)
22. LOCATION: Location of the incident (object)
23. Cross Street: Cross street of the incident (object)
24. LAT: Latitude coordinates (float64)
25. LON: Longitude coordinates (float64)

In [None]:
# (rows, columns) of the loaded data
crime_data.shape

In [None]:
# Number of missing values in each column
crime_data.isnull().sum()

### Preprocessing & Visualisation

#### Date and Time

In [None]:
# Look at the first rows again before cleaning the date and time columns
crime_data.head()

In [None]:
# Strip leading/trailing whitespace from the selected columns.
columns_to_strip = [
    'DR_NO',  'DATE OCC', 'TIME OCC',  'AREA NAME', 'Crm Cd Desc', 'Mocodes',
    'Vict Sex', 'Vict Descent', 'Premis Desc', 'Weapon Desc', 'Status', 'Status Desc', 'LOCATION',
    'Cross Street'
]

# Only string values are stripped. Numeric columns (the column overview lists
# 'DR_NO' and 'TIME OCC' as int64) keep their dtype, and missing values stay NaN.
# The previous blanket astype(str) turned numbers into text -- which would make
# 'TIME OCC' a categorical feature at modelling time -- and NaN into the literal
# string 'nan'.
for column in columns_to_strip:
    if pd.api.types.is_numeric_dtype(crime_data[column]):
        continue
    crime_data[column] = crime_data[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Split 'Date Rptd' and 'DATE OCC' on whitespace into a date part (first token)
# and a time part (second token). `.str[i]` yields NaN when the token is absent,
# so blank or missing cells no longer raise IndexError.
# The intermediate 'date_time_*' columns are kept because a later cell drops them.
for source_column, suffix in (('Date Rptd', 'rptd'), ('DATE OCC', 'occ')):
    tokens = crime_data[source_column].str.split()
    crime_data[f'date_time_{suffix}'] = tokens
    crime_data[f'date_{suffix}'] = tokens.str[0]
    crime_data[f'time_{suffix}'] = tokens.str[1]

crime_data.head()

In [None]:
# Derive the year of occurrence from 'date_occ' and store it in a new 'date' column
occurrence_dates = pd.to_datetime(crime_data['date_occ'])
crime_data['date'] = occurrence_dates.dt.year

crime_data.head()

In [None]:
# Remove the intermediate split columns and the raw date columns, then give the
# report-number and occurrence-time columns snake_case names.
columns_to_drop = ['date_time_rptd', 'date_time_occ', 'Date Rptd', 'DATE OCC']
crime_data = (
    crime_data
    .drop(columns=columns_to_drop)
    .rename(columns={'DR_NO': 'dr_no', 'TIME OCC': 'time_occ_code'})
)

crime_data.head()

In [None]:
# Keep only the records whose occurrence year lies in 2010-2019 (both ends included)
in_study_period = crime_data['date'].between(2010, 2019)
crime_data_2010_2019 = crime_data[in_study_period]

# Number of recorded cases in each year
crime_by_year = crime_data_2010_2019.groupby('date').size()

years = crime_by_year.index
plt.plot(years, crime_by_year.values, marker='o', linestyle='-', color='cadetblue')
plt.title('Total Criminal Cases from 2010 to 2019')
plt.xlabel('Year')
plt.ylabel('Total Criminal Cases')
plt.grid(True)
plt.xticks(years)  # one tick per year present in the data
plt.tight_layout()
plt.show()

In [None]:
# Convert the occurrence date to datetime and derive the calendar month from it
crime_data['date_occ'] = pd.to_datetime(crime_data['date_occ'])
crime_data['month_occ'] = crime_data['date_occ'].dt.month
crimes_per_month = crime_data['month_occ'].value_counts().sort_index()

# One distinct colour per month that appears in the data
num_months = len(crimes_per_month)
palette = sns.color_palette("husl", num_months)

plt.figure(figsize=(10, 6))
month_ax = crimes_per_month.plot(kind='bar', color=palette)
month_ax.set_title('Number of Crimes per Month')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=0)  # rotation=0 keeps the month labels horizontal
plt.show()

#### Area

In [None]:
# Give the area and location columns snake_case names
location_renames = {
    'AREA': 'area',
    'AREA NAME': 'area_name',
    'LOCATION': 'location_name',
    'Cross Street': 'cross_street',
    'LAT': 'lat_coor',
    'LON': 'lon_coor',
    'Rpt Dist No': 'rpt_dist_no',
    'Part 1-2': 'part',
}
crime_data = crime_data.rename(columns=location_renames)
crime_data.head()

In [None]:
# Count the records per 'area_name' (value_counts orders from most to fewest)
crime_by_area = crime_data['area_name'].value_counts()

area_ax = crime_by_area.plot(kind='bar', color='yellowgreen')
area_ax.set_title('Crime Distribution by Area')
area_ax.set_xlabel('Area')
area_ax.set_ylabel('Number of Crimes')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

#### Crime

In [None]:
# Drop the additional crime-code columns and the modus-operandi codes
columns_to_drop = ['Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Mocodes']
crime_data = crime_data.drop(columns=columns_to_drop)

In [None]:
# Give the crime, premises, weapon and status columns snake_case names
crime_renames = {
    'Crm Cd': 'crm_code',
    'Crm Cd Desc': 'crm_desc',
    'Premis Cd': 'premis_code',
    'Premis Desc': 'premis_desc',
    'Weapon Used Cd': 'weapon_code',
    'Weapon Desc': 'weapon_desc',
    'Status': 'status',
    'Status Desc': 'status_desc',
}
crime_data = crime_data.rename(columns=crime_renames)

# Turn the literal string 'nan' back into a real missing value
crime_data['weapon_desc'] = crime_data['weapon_desc'].replace({'nan': np.nan})
crime_data.head()

In [None]:
# The ten most frequent crime descriptions
top_crime_types = crime_data['crm_desc'].value_counts().head(10)

plt.figure(figsize=(10, 6))
crime_ax = top_crime_types.plot(kind='barh', color='orange')
crime_ax.set_title('Top 10 Crimes in LA')
crime_ax.set_xlabel('Frequency')
crime_ax.set_ylabel('Crime Type')
crime_ax.invert_yaxis()  # put the most frequent crime at the top
plt.show()

In [None]:
# Frequency of each weapon description (value_counts leaves out missing values)
crime_data['weapon_desc'].value_counts()

In [None]:
# Join every non-missing weapon description into one text for the word cloud
known_weapons = crime_data['weapon_desc'].dropna()
weapons_text = ' '.join(known_weapons)

cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_settings).generate(weapons_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.title('Word Cloud of Weapons Used')
plt.show()

#### Victim

In [None]:
# Give the victim columns snake_case names
victim_renames = {
    'Vict Age': 'vict_age',
    'Vict Sex': 'vict_sex',
    'Vict Descent': 'vict_des',
}
crime_data = crime_data.rename(columns=victim_renames)
crime_data.head()

In [None]:
# Replace non-positive victim ages with the mean of the positive ages.
# The mean is truncated to an int *before* it is written, so the integer column
# never receives a float (pandas deprecates such incompatible-dtype writes).
# For a positive mean the values are the same as the former
# "assign the float, then astype(int)" sequence.
valid_age = crime_data['vict_age'] > 0
invalid_age = crime_data['vict_age'] <= 0
mean_age = crime_data.loc[valid_age, 'vict_age'].mean()
if invalid_age.any():
    crime_data.loc[invalid_age, 'vict_age'] = int(mean_age)
crime_data['vict_age'] = crime_data['vict_age'].astype(int)

crime_data.head()

In [None]:
# Distinct victim-sex codes present in the data
crime_data['vict_sex'].unique()

In [None]:
# Turn the literal string 'nan' into a real missing value
crime_data['vict_sex'] = crime_data['vict_sex'].replace({'nan': np.nan})

In [None]:
# Share of records per victim-sex code
sex_counts = crime_data['vict_sex'].value_counts()
pie_colors = ['skyblue', 'red', 'orange', 'purple', 'pink', 'green']

plt.figure(figsize=(8, 6))
sex_counts.plot(kind='pie', autopct='%1.1f%%', colors=pie_colors)
plt.title('Crime Distribution by Sex of Victims')
plt.ylabel('')  # remove the y-axis label that the pie plot would otherwise show
plt.legend(title='Victim Sex', loc='upper right')
plt.show()

In [None]:
# Recode the victim-sex values 'H', 'N' and '-' as 'X'
codes_to_merge = ['H', 'N', '-']
crime_data['vict_sex'] = crime_data['vict_sex'].replace(codes_to_merge, 'X')
crime_data['vict_sex'].value_counts()

In [None]:
# Restrict the data to the ten areas with the most records
top_10_areas = crime_data['area_name'].value_counts().head(10).index.tolist()
in_top_area = crime_data['area_name'].isin(top_10_areas)
df_top_10_areas = crime_data[in_top_area]

# Records per area, split by victim sex
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_top_10_areas, x='area_name', hue='vict_sex', palette='rocket', alpha=0.75, ax=ax)
ax.set_title('Victims, by Sex, in the Top 10 Highest Crime Neighborhoods of Los Angeles (2010-2019)')
ax.set_xlabel('Area Name', fontsize=15)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distinct victim-descent codes present in the data
crime_data['vict_des'].unique()

In [None]:
# Treat both the literal string 'nan' and the placeholder '-' as missing
missing_markers = ['nan', '-']
crime_data['vict_des'] = crime_data['vict_des'].replace(missing_markers, np.nan)

In [None]:
# Lookup from the single-letter victim-descent code to a readable label
victims = {
    "A": "Asian",
    "B": "Black",
    "C": "Chinese",
    "D": "Cambodian",
    "F": "Filipino",
    "G": "Guamanian",
    "H": "Hispanic/Latin/Mexican",
    "I": "American Indian/Alaskan Native",
    "J": "Japanese",
    "K": "Korean",
    "L": "Laotian",
    "O": "Other",
    "P": "Pacific Islander",
    "S": "Samoan",
    "U": "Hawaiian",
    "V": "Vietnamese",
    "W": "White",
    "X": "Unknown",
    "Z": "Asian Indian",
}

In [None]:
# Order the bars from the most to the least frequent descent code
descent_order = crime_data['vict_des'].value_counts().index

plt.figure(figsize=(10, 6))
sns.countplot(data=crime_data, x='vict_des', order=descent_order)
plt.title('Distribution of Victims by Nations')
plt.xlabel('Victim Nations')
plt.ylabel('Number of Victims')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Keep only the records where both victim sex and victim descent are present
required_victim_fields = ['vict_sex', 'vict_des']
crime_data = crime_data.dropna(subset=required_victim_fields)

In [None]:
# (rows, columns) left after dropping records with missing victim sex or descent
crime_data.shape

In [None]:
# Number of records at each distinct (latitude, longitude) pair
coordinate_columns = ['lat_coor', 'lon_coor']
aggregated_data = (
    crime_data
    .groupby(coordinate_columns)
    .size()
    .reset_index(name='crime_count')
)

# Base map centred on the coordinates used here for Los Angeles
la_latitude = 34.052235
la_longitude = -118.243683
la_center = [la_latitude, la_longitude]
la_map = folium.Map(location=la_center, zoom_start=10)

# Heat-map layer: each entry is [lat, lon, weight], the weight being the record count
heat_data = aggregated_data[coordinate_columns + ['crime_count']].values.tolist()
HeatMap(heat_data, radius=15).add_to(la_map)

# Display the map
la_map

### Crime Prediction

Selecting features and target

In [47]:
# Select the model inputs and the prediction target.
# 'area' is the name given to the original 'AREA' column earlier in the notebook;
# the previous key 'AREA ' (with a trailing space) matches no column and raises KeyError.
feature_columns = ['date', 'time_occ_code', 'area']
features = crime_data[feature_columns].copy()
# 'time_occ_code' may have been cast to str during cleaning. Make it numeric so that
# get_dummies does not one-hot encode every distinct time value.
features['time_occ_code'] = pd.to_numeric(features['time_occ_code'])
target = crime_data['crm_code']

Encoding categorical data as numbers and scaling the features

In [48]:
# Encode categorical columns as indicator variables, then standardise the features
features = pd.get_dummies(features)
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics feed into the scaling -- consider fitting on the training rows only.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

Separation into training and testing sets

In [49]:
# Split into training and test sets: 30% of the rows are held out, with a fixed seed
split_parts = train_test_split(features_scaled, target, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


Creating the model (depth limited to reduce complexity)

In [None]:
# Build the model; the depth limit keeps the tree's complexity down.
# A fixed random_state makes the fitted tree reproducible from run to run, using
# the same seed value as the train/test split.
classifier = DecisionTreeClassifier(max_depth=5, random_state=42)
classifier.fit(X_train, y_train)

Evaluate the generalization performance of the model using cross-validation

In [None]:
# Assess the model's generalisation performance with 5-fold cross-validation on the training set
cross_val_accuracy = cross_val_score(classifier, X_train, y_train, cv=5)
mean_cv_accuracy = np.mean(cross_val_accuracy)
print(f"Cross Validation Accuracy Rates: {cross_val_accuracy}")
print(f"Average Cross Validation Accuracy: {mean_cv_accuracy}")

Make predictions on the test set

In [None]:
# Make predictions on the test set
y_pred = classifier.predict(X_test)

Compute the model's accuracy score on the test set

In [None]:
# Compute the model's accuracy score on the test set
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Set Accuracy Rate: {accuracy}")