# CRIME DETECTION SYSTEM

## Programming for AI TABA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import folium
from folium.plugins import HeatMap
import pymysql
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### DB Connection

In [None]:
def db():
    """Open a new connection to the local ``crime_detection`` MySQL database.

    Returns:
        A ``pymysql`` connection object. The caller is responsible for
        closing it.
    """
    # NOTE(review): credentials are hard-coded; consider reading them from the
    # environment or a config file before sharing this notebook.
    host = "localhost"
    user = "root"
    password = "root"
    database = "crime_detection"

    # ``db=`` is a deprecated alias in PyMySQL; ``database=`` is the supported
    # keyword for selecting the schema.
    return pymysql.connect(host=host, user=user, password=password, database=database)

### Load Data

In [None]:

# Module-level holder for the working DataFrame; it is assigned by
# load_by_csv() or load_by_db() through their `global crime_data` statements.
crime_data = None

In [None]:
def load_by_csv(csv_path='Crime_Data_from_2010_to_2019.csv'):
    """Load the crime records from a CSV file into the global ``crime_data``.

    The raw headers are renamed to snake_case and only the columns used by the
    rest of the notebook are kept.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file name the
            notebook originally hard-coded.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
    """
    global crime_data
    raw_data = pd.read_csv(csv_path)

    # Strip stray whitespace from the headers so that both 'AREA ' (with a
    # trailing space) and 'AREA' are mapped to 'area' below.
    raw_data.columns = raw_data.columns.str.strip()

    # Rename the CSV headers to the snake_case names used for data cleaning.
    raw_data = raw_data.rename(columns={
        'DR_NO': 'dr_no',
        'Date Rptd': 'date_rptd',
        'DATE OCC': 'date_occ',
        'TIME OCC': 'time_occ',
        'AREA': 'area',
        'AREA NAME': 'area_name',
        'Rpt Dist No': 'rpt_dist_no',
        'Part 1-2': 'part_1-2',
        'Crm Cd': 'crm_cd',
        'Crm Cd Desc': 'crm_cd_desc',
        'Mocodes': 'mocodes',
        'Vict Age': 'vict_age',
        'Vict Sex': 'vict_sex',
        'Vict Descent': 'vict_descent',
        'Premis Cd': 'premis_cd',
        'Premis Desc': 'premis_desc',
        'Weapon Used Cd': 'weapon_used_cd',
        'Weapon Desc': 'weapon_desc',
        'Status': 'status',
        'Status Desc': 'status_desc',
        'Crm Cd 1': 'crm_cd_1',
        'Crm Cd 2': 'crm_cd_2',
        'Crm Cd 3': 'crm_cd_3',
        'Crm Cd 4': 'crm_cd_4',
        'LOCATION': 'location',
        'Cross Street': 'cross_street',
        'LAT': 'lat',
        'LON': 'lon',
    })

    selected_columns = [
        'date_rptd',
        'date_occ',
        'time_occ',
        'area',
        'area_name',
        'crm_cd',
        'crm_cd_desc',
        'vict_age',
        'vict_sex',
        'vict_descent',
        'premis_cd',
        'premis_desc',
        'weapon_used_cd',
        'weapon_desc',
        'location',
        'lat',
        'lon',
    ]
    # .copy() detaches the selection from raw_data so that later column
    # assignments on crime_data do not trigger SettingWithCopyWarning.
    crime_data = raw_data[selected_columns].copy()

In [None]:
def load_by_db():
    """Load the crime records from the ``raw_data`` table into ``crime_data``.

    Opens a connection with ``db()``, reads the columns used by the rest of the
    notebook into a DataFrame, and always closes the connection afterwards.
    """
    global crime_data
    # Column names are quoted with backticks: in MySQL, single-quoted names are
    # string literals, so SELECT 'lat' would return the text "lat" on every row
    # instead of the column's values.
    sql_query = """SELECT
    `date_rptd`,
    `date_occ`,
    `time_occ`,
    `area`,
    `area_name`,
    `crm_cd`,
    `crm_cd_desc`,
    `vict_age`,
    `vict_sex`,
    `vict_descent`,
    `premis_cd`,
    `premis_desc`,
    `weapon_used_cd`,
    `weapon_desc`,
    `location`,
    `lat`,
    `lon`
    FROM raw_data"""
    connection = db()
    try:
        crime_data = pd.read_sql(sql_query, connection)
    finally:
        # Release the connection even when the query fails.
        connection.close()

In [None]:
# Select where the dataset is loaded from: 'CSV' (local file) or 'DB' (MySQL).
source_type = 'CSV'
if source_type == 'DB':
    print("DB...")
    load_by_db()

elif source_type == 'CSV':
    print("CSV...")
    load_by_csv()
else:
    # ValueError is still caught by `except Exception`, but states the cause.
    raise ValueError(f'Choose a data source ("DB" or "CSV"), got {source_type!r}')

In [None]:
# Preview the first rows of the loaded frame.
crime_data.head()

In [None]:
# Preview the last rows of the loaded frame.
crime_data.tail()

DataFrame column names

In [None]:
# List the column labels kept by the loader.
print(crime_data.columns)

DataFrame column info

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
crime_data.info()

The columns of the dataset are:
1. DR_NO: Report number for the incident (int64)
2. Date Rptd: Date the incident was reported (object, might need to be converted to a datetime format)
3. DATE OCC: Date of occurrence of the incident (object, might need to be converted to a datetime format)
4. TIME OCC: Time of occurrence of the incident (int64)
5. AREA: Area code related to the incident (int64)
6. AREA NAME: Name of the area related to the incident (object)
7. Rpt Dist No: Report district number (int64)
8. Part 1-2: Classification of the incident (int64)
9. Crm Cd: Crime code (int64)
10. Crm Cd Desc: Description of the crime code (object)
11. Mocodes: Modus operandi codes (object)
12. Vict Age: Age of the victim (int64)
13. Vict Sex: Sex of the victim (object)
14. Vict Descent: Descent of the victim (object)
15. Premis Cd: Premises code (float64)
16. Premis Desc: Description of the premises (object)
17. Weapon Used Cd: Weapon code used (float64)
18. Weapon Desc: Description of the weapon used (object)
19. Status: Status of the incident (object)
20. Status Desc: Description of the status (object)
21. Crm Cd 1-4: Additional crime codes (float64)
22. LOCATION: Location of the incident (object)
23. Cross Street: Cross street of the incident (object)
24. LAT: Latitude coordinates (float64)
25. LON: Longitude coordinates (float64)

In [None]:
# (rows, columns) of the frame before cleaning.
crime_data.shape

In [None]:
# Number of missing values in each column.
crime_data.isnull().sum()

### Preprocessing & Visualisation

#### Date of Crime Reported and Crime Occurred Converting

Splitting the Crime Reported timestamp: `date_rptd` is converted to a datetime and the time token is stored in `time_rptd` (the intermediate `date_time_rptd` column is not kept)

In [None]:
# Split each report timestamp on whitespace: the first token is the date
# (parsed as month/day/year) and the second token is kept as `time_rptd`.
# The `.str[i]` accessor yields NaN for missing or too-short values, whereas
# `apply(lambda x: x[i])` raises on them; it also removes the need for a
# temporary helper column.
rptd_parts = crime_data['date_rptd'].str.split()
crime_data['date_rptd'] = pd.to_datetime(rptd_parts.str[0], format="%m/%d/%Y")
crime_data['time_rptd'] = rptd_parts.str[1]

Splitting the Crime Occurred timestamp: `date_occ` is converted to a datetime (the intermediate `date_time_occ` column is not kept)

In [None]:
# Keep only the date token (first whitespace-separated part) of `date_occ` and
# parse it as month/day/year. `.str[0]` yields NaN/NaT for missing values
# instead of raising.
# The time token of this string is deliberately NOT written to `time_occ`:
# that column is loaded from the source as the time of occurrence and is used
# as a model feature later, so overwriting it would destroy the real values.
occ_date_token = crime_data['date_occ'].str.split().str[0]
crime_data['date_occ'] = pd.to_datetime(occ_date_token, format="%m/%d/%Y")

Create year field from date_occ

In [None]:
# Four-digit year of occurrence, stored as a string (strftime returns text),
# so later comparisons against '2010'/'2019' are string comparisons.
crime_data['year'] = crime_data['date_occ'].dt.strftime('%Y')

Filter the data for the years 2010 to 2019  
Group by year and count the total criminal cases for each year

In [None]:
# Keep rows whose year string lies in ['2010', '2019'] (both ends inclusive)
# and count the cases recorded for each year.
in_range = crime_data['year'].between('2010', '2019')
crime_by_year = crime_data[in_range].groupby('year').size()

# Line chart of yearly totals with one tick per year.
years = crime_by_year.index
totals = crime_by_year.values
plt.plot(years, totals, marker='o', linestyle='-', color='cadetblue')
plt.title('Total Criminal Cases from 2010 to 2019')
plt.xlabel('Year')
plt.ylabel('Total Criminal Cases')
plt.grid(True)
plt.xticks(years)
plt.tight_layout()
plt.show()

In [None]:
# Count occurrences per month number and order the result by month.
occ_month = crime_data['date_occ'].dt.month
crimes_per_month = occ_month.value_counts().sort_index()

# One distinct colour per bar.
num_months = crimes_per_month.size
palette = sns.color_palette("husl", num_months)

plt.figure(figsize=(10, 6))
crimes_per_month.plot.bar(color=palette)
plt.title('Number of Crimes per Month')
plt.xlabel('Month')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=0)  # rotation=0 keeps the tick labels horizontal
plt.show()

#### Area

In [None]:
# Number of records per area name, most frequent first.
area_names = crime_data['area_name']
crime_by_area = area_names.value_counts()

crime_by_area.plot.bar(color='yellowgreen')
plt.title('Crime Distribution by Area')
plt.xlabel('Area')
plt.ylabel('Number of Crimes')
# Slanted, right-aligned labels so the area names stay readable.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

#### Crime

In [None]:
# The ten most frequent crime descriptions (value_counts sorts descending).
crime_type_counts = crime_data['crm_cd_desc'].value_counts()
top_crime_types = crime_type_counts.iloc[:10]

plt.figure(figsize=(10, 6))
top_crime_types.plot.barh(color='orange')
plt.title('Top 10 Crimes in LA')
plt.xlabel('Frequency')
plt.ylabel('Crime Type')
# Flip the y-axis so the most frequent crime is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Frequency of each weapon description (missing values are not counted).
crime_data['weapon_desc'].value_counts()

In [None]:
# Concatenate every non-missing weapon description into one text blob and
# render it as a word cloud.
weapon_descriptions = crime_data['weapon_desc'].dropna()
weapons_text = ' '.join(weapon_descriptions)
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(weapons_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Weapons Used')
plt.show()

#### Victim

In [None]:
# Replace non-positive (and missing) victim ages with the mean of the positive
# ages. The mean is truncated to an integer BEFORE assignment: writing a float
# into an integer column is an incompatible-dtype set, which recent pandas
# versions deprecate. The final values match the previous float-then-astype
# result, because astype(int) truncates the same way.
valid_age = crime_data['vict_age'] > 0
mean_age = int(crime_data.loc[valid_age, 'vict_age'].mean())
# ~valid_age also covers NaN ages, which would otherwise break astype(int).
crime_data.loc[~valid_age, 'vict_age'] = mean_age
crime_data['vict_age'] = crime_data['vict_age'].astype(int)

crime_data.head()

In [None]:
# Distinct values present in the victim sex column.
crime_data['vict_sex'].unique()

In [None]:
# Turn the literal text 'nan' into a real missing value so that dropna() and
# value_counts() treat it as missing.
crime_data['vict_sex'] = crime_data['vict_sex'].replace('nan', np.nan)

In [None]:
# Pie chart of the share of records per victim sex code.
sex_counts = crime_data['vict_sex'].value_counts()
slice_colors = ['skyblue', 'red', 'orange', 'purple', 'pink', 'green']

plt.figure(figsize=(8, 6))
sex_counts.plot.pie(autopct='%1.1f%%', colors=slice_colors)
plt.title('Crime Distribution by Sex of Victims')
plt.ylabel('')  # hide the automatic series-name label
plt.legend(title='Victim Sex', loc='upper right')
plt.show()

In [None]:
# Fold the codes 'H', 'N' and '-' into the single code 'X'.
sex_recode = {'H': 'X', 'N': 'X', '-': 'X'}
crime_data['vict_sex'] = crime_data['vict_sex'].replace(sex_recode)
crime_data['vict_sex'].value_counts()

In [None]:
# Restrict the data to the ten areas with the most records.
area_counts = crime_data['area_name'].value_counts()
top_10_areas = area_counts.head(10).index.tolist()
in_top_10 = crime_data['area_name'].isin(top_10_areas)
df_top_10_areas = crime_data[in_top_10]

# Grouped bars: one group per area, one bar per victim sex code.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    data=df_top_10_areas,
    x='area_name',
    hue='vict_sex',
    palette='rocket',
    alpha=0.75,
    ax=ax,
)
ax.set_title('Victims, by Sex, in the Top 10 Highest Crime Neighborhoods of Los Angeles (2010-2019)')
ax.set_xlabel('Area Name', fontsize=15)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Lookup table from single-letter `vict_descent` codes to readable labels.
victims = {
    "A": "Asian",
    "B": "Black",
    "C": "Chinese",
    "D": "Cambodian",
    "F": "Filipino",
    "G": "Guamanian",
    "H": "Hispanic/Latin/Mexican",
    "I": "American Indian/Alaskan Native",
    "J": "Japanese",
    "K": "Korean",
    "L": "Laotian",
    "O": "Other",
    "P": "Pacific Islander",
    "S": "Samoan",
    "U": "Hawaiian",
    "V": "Vietnamese",
    "W": "White",
    "X": "Unknown",
    "Z": "Asian Indian",
}

In [None]:
# Translate the descent codes to readable labels once and reuse the result.
descent_names = crime_data['vict_descent'].map(victims)

plt.figure(figsize=(10, 6))
# Pass the labels as `y=`: countplot needs the categorical variable on x or y,
# and the axis labels below (categories on the y-axis, counts on the x-axis)
# describe a horizontal chart. `order` sorts the bars from most to least frequent.
sns.countplot(y=descent_names, order=descent_names.value_counts().index)
plt.title('Distribution of Victims by Nations')
plt.ylabel('Victim Nations')
plt.xlabel('Number of Victims')
plt.show()

In [None]:
# Discard records that lack the victim's sex or descent.
crime_data = crime_data.dropna(subset=['vict_sex', 'vict_descent'])

In [None]:
# (rows, columns) remaining after dropping incomplete victim records.
crime_data.shape

In [None]:
# Base map centred on the la_latitude / la_longitude coordinates.
la_latitude = 34.052235
la_longitude = -118.243683
la_map = folium.Map(location=[la_latitude, la_longitude], zoom_start=10)

# Number of records at each distinct (lat, lon) pair.
aggregated_data = (
    crime_data.groupby(['lat', 'lon'])
    .size()
    .reset_index(name='crime_count')
)

# Rows of [lat, lon, count] feed the heat-map layer added to the map.
heat_columns = ['lat', 'lon', 'crime_count']
heat_data = aggregated_data[heat_columns].values.tolist()
HeatMap(heat_data, radius=15).add_to(la_map)

# Last expression of the cell: renders the map in the notebook.
la_map

### Crime Prediction

Selecting features and the target variable

In [None]:
# Predictor columns and the label column (crime code) for the classifiers.
target = 'crm_cd'
features = ['lat', 'lon', 'time_occ', 'area', 'year']

X, y = crime_data[features], crime_data[target]

Applying get_dummies to convert categorical features into dummy/indicator variables

In [None]:
# One indicator column is created per distinct value of each listed column;
# 'lat' and 'lon' are left untouched.
# NOTE(review): a column with many distinct values widens X considerably —
# check the resulting shape and memory use.
X = pd.get_dummies(X, columns=['time_occ', 'area', 'year'])

Preprocessing for continuous features

In [None]:
# Columns treated as continuous, and the scaler that will be applied to them.
continuous_features = ['lat', 'lon']
continuous_transformer = StandardScaler()


Creating a ColumnTransformer that will apply the transformations to the respective features

In [None]:
# Scale the continuous columns and pass every other column through unchanged.
# ColumnTransformer's default is remainder='drop', which would silently discard
# all indicator columns created by get_dummies and leave the models with only
# lat/lon as inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continuous_transformer, continuous_features),
    ],
    remainder='passthrough',
)


Defining the Random Forest pipeline (it is trained in a later cell)

In [None]:
# Random forest pipeline: shared preprocessing followed by a shallow forest
# (depth capped at 5, two parallel jobs, verbose progress output).
rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(max_depth=5, n_jobs=2, verbose=2)),
])


Defining the Decision Tree pipeline (it is trained in a later cell)

In [None]:
# Decision tree pipeline: shared preprocessing followed by a tree whose depth
# is capped at 5; the pipeline reports progress while fitting.
dt = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(max_depth=5)),
    ],
    verbose=3,
)


Splitting the dataset into training and testing sets

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Training the Random Forest model and calculating cross-validation scores

In [None]:
# Fit the forest on the training split, then estimate its accuracy with
# 5-fold cross-validation on the same split.
rf.fit(X_train, y_train)
rf_cross_val_scores = cross_val_score(rf, X_train, y_train, cv=5)
rf_mean_cv_score = rf_cross_val_scores.mean()
print(f"Cross Validation Accuracy Rates: {rf_cross_val_scores}")
print(f"Average Cross Validation Accuracy: {rf_mean_cv_score}")


Training the Decision Tree model and calculating cross-validation scores

In [None]:
# Fit the tree on the training split, then estimate its accuracy with
# 5-fold cross-validation on the same split.
dt.fit(X_train, y_train)
dt_cross_val_scores = cross_val_score(dt, X_train, y_train, cv=5)
dt_mean_cv_score = dt_cross_val_scores.mean()
print(f"Cross Validation Accuracy Rates: {dt_cross_val_scores}")
print(f"Average Cross Validation Accuracy: {dt_mean_cv_score}")


Evaluating the Random Forest model

In [None]:
# Score the fitted forest on the held-out test split.
rf_y_pred = rf.predict(X_test)
rf_test_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Model Accuracy: {rf_test_accuracy}")
# Per-class precision, recall and F1.
print(classification_report(y_test, rf_y_pred))


Evaluating the Decision Tree model

In [None]:
# Score the fitted tree on the held-out test split.
dt_y_pred = dt.predict(X_test)
dt_test_accuracy = accuracy_score(y_test, dt_y_pred)
print(f"Decision Tree Model Accuracy: {dt_test_accuracy}")
# Per-class precision, recall and F1.
print(classification_report(y_test, dt_y_pred))
