# Section 1: Preparing Environment

## Installing Requirements

In [None]:
# Install a pip package in the current Jupyter kernel
import sys

def install_packages(*packages):
    """Install every package that cannot be imported yet into the current kernel.

    Args:
        *packages: Import names of the packages to make available.

    A package is skipped when it is already loaded or when Python can locate it
    on the import path (this also covers standard-library modules such as
    ``ast`` or ``collections``, which must never be sent to pip). Installation
    failures are reported but do not raise, so the remaining packages are still
    processed.
    """
    # Local imports keep the cell self-contained
    import importlib.util
    import subprocess

    # Import names whose PyPI distribution is published under a different name
    pip_names = {'sklearn': 'scikit-learn'}

    for package in packages:
        # `sys.modules` only lists modules imported so far; `find_spec` also
        # detects packages that are installed but not imported yet
        try:
            available = (
                package in sys.modules
                or importlib.util.find_spec(package) is not None
            )
        except (ImportError, ValueError):
            available = False
        if available:
            continue

        # Run pip with the interpreter of this kernel (works inside and outside IPython)
        result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', pip_names.get(package, package), '--quiet'],
            check=False,
        )
        if result.returncode == 0:
            print(f"Installed <{package}>")
        else:
            print(f"Failed to install <{package}> (pip exit code {result.returncode})")

# Make sure every third-party package imported below is available.
# NOTE(review): 'ast', 'datetime' and 'collections' are standard-library modules
# and 'sklearn' is distributed on PyPI as 'scikit-learn' — confirm these entries
# are handled as intended by install_packages.
install_packages(
    'ast','nltk','bokeh','spacy','numpy','scipy','kneed','pandas','gensim',
    'folium','plotly','sklearn','mlxtend','xgboost','shapely',
    'seaborn','datetime','requests','lightgbm','nbformat',
    'networkx','hyperopt','geopandas','matplotlib','fuzzywuzzy',
    'mplcursors','collections','geodatasets'
)

## Importing necessary packages

In [None]:
import re
import sys
import math
import requests 
import nbformat
import mplcursors 
import geodatasets
import numpy as np
import spacy as sp
import pandas as pd
import datetime as dt
import seaborn as sns
import xgboost as xgb
import networkx as nx
import lightgbm as lgb
import geopandas as gpd
from fuzzywuzzy import fuzz
import plotly.express as px
from kneed import KneeLocator


import folium
from folium import plugins


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from ast import literal_eval

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

from scipy.stats import zscore

import shapely.geometry as geom


import matplotlib.pyplot as plt
from collections import Counter

from sklearn import metrics
from sklearn.metrics import *
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from mlxtend.preprocessing import TransactionEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, LeaveOneOut

from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.plotting import figure, show, output_notebook

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

# Section 2: Data Preparation

## 2.1 Data Understanding

Obtaining dataset *Chicago Crimes*

In [None]:
# Load the crimes CSV export (path is relative to the notebook's working directory)
chicago_crimes = pd.read_csv('./data/Crimes___2017_to_Present.csv')

Description of the data

In [None]:
# Summary statistics of the dataset (rendered as the cell output)
chicago_crimes.describe()

## 2.2 Data cleaning

In this section we begin by analyzing the missing data. Based on this analysis, we decided to first correct the incorrect data and only then fill the missing data with plausible values. One example of this is the *District* feature: although it has missing values, we first correct the datapoints that contain incorrect values, and only afterwards fill the empty entries with the correct values.

In [None]:
# Number of null entries of each feature, in column order
missing_values = list(chicago_crimes.isna().sum())

# Features without missing values are all relabelled "Others", so they collapse
# into a single (zero) entry once the labels become dictionary keys
cols = [
    "Others" if n_missing == 0 else column
    for column, n_missing in zip(chicago_crimes.columns, missing_values)
]
d = dict(zip(cols, missing_values))  # label -> number of missing values

print("Number of Missing Values per feature >>")
missing_vals = pd.DataFrame(d, index=["Missing Values"])  # one-row summary table
# Printed explicitly: an expression in the middle of a cell is never rendered
print(missing_vals)

# Plotting the missing values in the dataset
x = list(d.keys())
y = list(d.values())
sns.barplot(x=x, y=y, palette="GnBu_d")
plt.xticks(rotation=90)
plt.title("Missing Values in the Dataset", fontdict = {'fontsize': 20})
plt.ylabel("Count of missing values", fontdict={'fontsize': 15})
plt.show()

In [None]:
# Continuous Variables: numeric and boolean columns, selected through the public
# pandas API instead of the private `_get_numeric_data()` helper
cont = chicago_crimes.select_dtypes(include=['number', 'bool']).columns
print("The continuous variables are: ",list(cont))
# Categorical Variables: every remaining column, kept in dataset order so the
# output is reproducible (a set difference has no defined order)
print("The categorical variables are: ",[col for col in chicago_crimes.columns if col not in cont])

### Correcting the districts

Some of the datapoints have the incorrect *District* feature.<br>
To correct this feature for those datapoints, we noticed that each *Beat* lies inside a specific *District*, as seen in the picture below.

<img src="./data/notebook_imgs/Districts_VS_Beats.gif" style="width:300px">
<figcaption style='font-size:10pt'>Fig.1 - <i>District</i> VS <i>Beat</i>. Data sources: <a href='https://data.cityofchicago.org/d/aerh-rz74'>Beat</a> | <a href='https://data.cityofchicago.org/d/fthy-xz3r'>District</a></figcaption>

After noticing this, we decided to use the *Beat* feature to obtain the *District*.

It must be also noted that since some *District* are incorrect, we need to find a way of obtaining the correct *District* each *Beat* is a part of.<br>
Our approach was to count how many crimes happened in each *Beat*/*District* combination and to pick the correct *District* for each *Beat*. This is done by:<br>
* Obtaining the number of **crimes** (count of *ID*) for each *Beat*/*District* combination
* For each *Beat*, comparing the number of **crimes** in all *District*
* We noticed that, in **all** *Beats* with multiple *Districts*, the *District* with the lowest number of crimes never exceeded 22, while the other *District* never had fewer than 1161
* After noticing the above, we excluded all *Beat*/*District* with less than 23 crimes and this gave us our *Beat*/*District* combination.



In [None]:

# Number of crimes (count of "ID") for every Beat/District combination
beats = chicago_crimes.groupby(by=['Beat', 'District'], as_index=False)[['ID']].count()

# All combinations whose Beat is paired with more than one District
dups = beats[beats['Beat'].duplicated(keep=False)]

# For the ambiguous beats we compare, per Beat, the smallest and the largest crime count:
#   * max_min_val -> the largest of the per-Beat minimums (the "wrong" districts)
#   * min_max_val -> the smallest of the per-Beat maximums (the "right" districts)
# A big gap between both values makes it plausible that every combination with at most
# max_min_val crimes is a data-entry error.
if dups.empty:
    max_min_val = 0
    min_max_val = dups['ID'].max()
else:
    per_beat = dups.groupby('Beat')['ID'].agg(['min', 'max'])
    max_min_val = per_beat['min'].max()
    min_max_val = per_beat['max'].min()
print(f'max_min_val VS min_max_val -> {max_min_val} | {min_max_val}') 

# The gap is really big, so only combinations with more crimes than max_min_val are kept;
# this leaves a single District for each Beat
beats = beats[beats['ID'] > max_min_val]

# Validating every beat has only a single district
assert not beats['Beat'].duplicated().any()

# Only the mapping itself is needed from here on (each Beat has a single District)
beats = beats[['Beat', 'District']]

In [None]:
# With the association between Beat and District obtained above, we're gonna correct the districts of some rows
# Join the correct district of each Beat to original dataframe
# The mapped value lands in "District_new" (the original "District" column keeps its name).
# NOTE(review): how='inner' drops every crime whose Beat is absent from `beats` — confirm this is intended
chicago_crimes = chicago_crimes.merge(beats, how='inner', on=['Beat'], suffixes=('', '_new'))

### Correcting coordinates

Some crimes have coordinates that are far too distant from Chicago; we are going to pick those and correct them.

The approach here is to compute the average coordinates of the *Beat* where each of those crimes happened and to update their coordinates accordingly.

In [None]:
# Some points have really weird coordinates (latitude <= 37, far south of Chicago);
# they are replaced with the average coordinates of their "Beat"
wrong_mask = chicago_crimes['Latitude'] <= 37
wrong_coords = chicago_crimes[wrong_mask]

# Average coordinates per Beat, computed only from the plausible crimes of the affected beats
beats = chicago_crimes[chicago_crimes['Beat'].isin(wrong_coords['Beat'].unique())]
beats = beats[beats['Latitude'] > 37]
beats = beats.groupby(by=['Beat'], as_index=False)[['Latitude', 'Longitude']].mean()
beat_means = beats.set_index('Beat')

# Overwrite ONLY the wrong rows. Merging the averages on "Beat" and preferring them
# over the original values would also replace the valid coordinates of every other
# crime that happened in the same beats.
for coord in ['Latitude', 'Longitude']:
    current = chicago_crimes.loc[wrong_mask, coord]
    replacement = chicago_crimes.loc[wrong_mask, 'Beat'].map(beat_means[coord])
    # A beat without any plausible crime has no average: keep the original value there
    chicago_crimes.loc[wrong_mask, coord] = replacement.fillna(current)

### Fixing some of the null values

#### Latitude/Longitude

The specific coordinates (lat/lon) may not give much information, so we've decided to fill the null values with data.<br>

Since when one coordinate is null, the other also is, we're gonna apply the same logic for both of the features.<br>
Using the *Beat* of each datapoint, we compute the average coordinates of said *Beat* and assign that value to the datapoint.

In [None]:
# Calculating average coordinates for each beat
beats = chicago_crimes[['Beat', 'Latitude', 'Longitude']]   # Pick only required columns from original dataset
beats = beats.dropna(how='any')                             # Rows with a null coordinate cannot contribute to the averages
beats = beats.groupby(by=['Beat'], as_index=False).mean()   # Aggregate by beat, and average the coordinates

# Add average Beat latitude/lon to original dataset as "Latitude_new" and "Longitude_new"
chicago_crimes = chicago_crimes.merge(beats, how='left', on=['Beat'], suffixes=('', '_new'))

# The "_new" columns keep the original coordinate when present and fall back to the Beat average otherwise
for coord in ['Latitude', 'Longitude']:
    chicago_crimes[f'{coord}_new'] = chicago_crimes[coord].fillna(chicago_crimes[f'{coord}_new'])

# Validate that there are no NULL Latitude/Longitude (both coordinates are checked,
# since both are used to build geometries later on)
assert not chicago_crimes[['Latitude_new', 'Longitude_new']].isnull().any().any()

#### District

Since *Beat* is the smallest police geographical area, a *Beat* is more granular than any of *District*/*Ward*/*Community Area*.<br>
We have 0 datapoints with a **NaN** *Beat*, so we can use that feature to obtain the missing values. <br>
Whenever a datapoint doesn't have district, lookup other datapoints with the same *Beat* and we have our missing data.


In [None]:
# Beat -> District lookup, built from the corrected districts already present in the dataset
beats = (
    chicago_crimes[['Beat', 'District_new']]
    .drop_duplicates()                              # One row per Beat/District pair
    .rename(columns={'District_new': 'District'})   # Clashes with the existing "District", so the merge suffixes it
)

# The lookup value arrives as "District_new2" because "District" already exists on the left side
chicago_crimes = chicago_crimes.merge(beats, how='left', on=['Beat'], suffixes=('', '_new2'))

# Use the lookup value wherever "District_new" is null, then discard the helper column
lookup_district = chicago_crimes.pop('District_new2')
chicago_crimes['District_new'] = chicago_crimes['District_new'].fillna(lookup_district)

# Validate that there are no NULL District
assert chicago_crimes['District_new'].notnull().all()


### Correcting the Ward/Community Area

As we corrected some of the coordinates and districts, some of *Ward*/*Community Area* are not the most up to date.<br>
To correct this feature, we are going to obtain the data from the source directly and use spatial manipulation to obtain the correct values for each datapoint.<br>
The sources were accessed through the Endpoint API available in the references below.

The respective data sources are: <a href='https://data.cityofchicago.org/d/sp34-6z76'>Ward</a>, <a href='https://data.cityofchicago.org/d/cauq-8yn6'>Community Area</a>

We used the following logic for the correction of the features mentioned:
* Obtain a polygon describing each feature
* Using spatial manipulation, use the coordinates of each datapoint to pick in which area they lie within
* The "left out" datapoints*, pick the nearest region

<div style="font-size:10pt">*Datapoints whose coordinates do not lie within any region. This may be due to some rounding errors, which have a great impact on GPS coordinates, and some different projections</div>

In [None]:
# Defining which features to correct, and where to fetch the data from as well as which column contains the "content" we want
data_sources = {
    'Ward': {
        'url': 'https://data.cityofchicago.org/resource/k9yb-bpqx.json',    # Where to get the data from
        'column_name': 'ward'                                               # Data we want to bring to our original dataset
    },
    'Community Area': {
        'url': 'https://data.cityofchicago.org/resource/igwz-8jzy.json',    # Where to get the data from
        'column_name': 'area_numbe'                                         # Data we want to bring to our original dataset
    }
}

# Creating a GeoDataFrame object from original dataset (to be able to perform geography operations)
df = gpd.GeoDataFrame(chicago_crimes, geometry=gpd.points_from_xy(chicago_crimes['Longitude_new'], chicago_crimes['Latitude_new']))


# Iterating through all sources, since steps will be exactly the same
for col, source in data_sources.items():
    # Fetching data from data_source (url obtained in DatasetDescription.docx document)
    # A timeout prevents the cell from hanging forever, and raise_for_status() turns an
    # HTTP error into an explicit exception instead of a confusing parsing failure below
    response = requests.get(source['url'], timeout=60)
    response.raise_for_status()

    # One record per region: its polygon as a Shape object (easily read by GeoPandas)
    # and the region "ID", stored under the name of the feature being corrected.
    # The ID is kept exactly as returned in the JSON payload (no type conversion).
    raw_data = [
        {
            'geometry': geom.shape(region['the_geom'])
            ,col: region[source['column_name']]
        }
        for region in response.json()
    ]

    # Converting the geometries to a GeoDataFrame (to be able to use geometry join)
    regions = gpd.GeoDataFrame(raw_data)

    # Merge data onto original (with geoDataFrame) dataset
    # The existing column becomes f'{col}_old' and the fetched one f'{col}_new'
    df = df.sjoin(regions, how='left', predicate='within', lsuffix='old', rsuffix='new') # Will merge regions onto DataFrame by checking in which region each datapoint is contained
    df = df.drop(columns=['index_new']) # The merge creates a new column "index_new", we don't care about that

    # Some points don't lie inside any of the regions (this may be because of roundings in the coordinates)
    # to these "outside" points, we're gonna attribute the nearest region to them

    # Datapoints which did not lie inside any region
    left_out = df[df[f'{col}_new'].isnull()].copy()[['ID', 'geometry']]

    # Merge on left_out points
    left_out = left_out.sjoin_nearest(regions, how='inner', lsuffix='old', rsuffix='new')
    left_out = left_out.drop(columns=['index_new', 'geometry'])

    # Merge this new data onto original DF (the nearest region arrives in a column named `col`)
    df = df.merge(left_out, on=['ID'], how='left', suffixes=('', '2'))

    # Fill NULL values of new column with the nearest-region ones
    df[f'{col}_new'] = df[f'{col}_new'].fillna(df[col])
    df = df.drop(columns=[col])   # Drop this temporary nearest-region column


# Keep only wanted columns (just want ID and the respective new Region columns)
# This is so when we merge to original dataset, only the new Region columns are created
df = df[['ID'] + [f'{col}_new' for col in data_sources.keys()]]

# Join the correct Ward/Community of each Beat to original dataframe
chicago_crimes  = chicago_crimes.merge(df, how='left', on=['ID'])

# Replacing "normal" columns with newest ones
# Drop "normal" columns
chicago_crimes = chicago_crimes.drop(columns = [col.replace('_new', '') for col in chicago_crimes.columns if '_new' in col])
# Rename all the "_new" column to their respective name
chicago_crimes = chicago_crimes.rename(columns = {col: col.replace('_new', '') for col in chicago_crimes.columns if '_new' in col})

### Removing crimes which contain NaN in any feature

Removing all rows which contain Null values except in Description features (we won't use description features)

In [None]:
# Every feature except the two free-text description columns must be populated
required_columns = chicago_crimes.columns.difference(['Location Description', 'Description'])

# Keep only the crimes without a null value in any of the required features
chicago_crimes_clean = chicago_crimes.dropna(subset=required_columns, how='any', axis='index')

## 2.3 Creating new features
We create a set of new features that will help us identify patterns in the data, such as months with more crimes, periods of certain crimes, etc.

### Date related features

Features obtained in this section:
* **Date**: Datetime truncated to day
* **Hour**: Hour of the datetime
* **YearMonth**: Datetime formatted as year and month concatenated (`YYYY-MM`)
* **DayOfWeek**: Day of the week of the datetime, as an integer (0 = Monday, ..., 6 = Sunday)
* **periodOfDay**: If datetime occurred in morning/afternoon/etc

In [None]:
# Function to map time to period of the day
def map_time_to_period(hour):
    """Return the period of the day an hour belongs to.

    Args:
        hour: Hour of the day as a number.

    Returns:
        'Morning' for [5, 12), 'Afternoon' for [12, 17), 'Evening' for [17, 21)
        and 'Night' for any other value.
    """
    # (first hour included, first hour excluded, label)
    periods = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in periods:
        if start <= hour < end:
            return label
    return 'Night'

In [None]:

# Parse the raw date strings once: minute precision for the time-based features,
# day precision for the truncated "Date" column
raw_dates = chicago_crimes_clean['Date']
timestamps = pd.to_datetime(raw_dates.str[:16], format='%Y-%m-%dT%H:%M')
days = pd.to_datetime(raw_dates.str[:10], format='%Y-%m-%d')

chicago_crimes_clean['Date'] = days
chicago_crimes_clean['Hour'] = timestamps.dt.hour
chicago_crimes_clean['YearMonth'] = timestamps.dt.strftime('%Y-%m')
chicago_crimes_clean['Month'] = timestamps.dt.month
chicago_crimes_clean['DayOfWeek'] = timestamps.dt.dayofweek

# Period of the day (Morning/Afternoon/Evening/Night) derived from the hour
chicago_crimes_clean['periodOfDay'] = chicago_crimes_clean['Hour'].apply(map_time_to_period)

### Text Processing

Using the python script in the folder "textProcessing", we obtained a mapping between the descriptions and the new descriptions

In [None]:
# Mapping files produced by the text-processing script: reduced description first, original second
desc = pd.read_csv(r'./data/textProcessing/result/description_reduced.csv', header=None)
desc.columns = ['Description_New', 'Description']
loc_desc = pd.read_csv(r'./data/textProcessing/result/location_reduced.csv', header=None)
loc_desc.columns = ['Location Description_New', 'Location Description']

# Bring every reduced description next to its original one
for col, mapping in (('Description', desc), ('Location Description', loc_desc)):
    chicago_crimes_clean = chicago_crimes_clean.merge(mapping, on=[col], how='left')

# Overwrite the original columns with the reduced ones and discard the helper columns
for col in ['Location Description', 'Description']:
    chicago_crimes_clean[col] = chicago_crimes_clean.pop(f'{col}_New')

### External Datasets - Socio Economically Disadvantaged Areas

Areas of Chicago, based on census tracts, that are the most socioeconomically disadvantaged, for the purpose of promoting equitable hiring within areas of economic need. Qualifying areas were identified using three criteria, based on data from the 2014 American Community Survey: household income, poverty rate, and unemployment rate.

Socioeconomically Disadvantaged Areas -> https://data.cityofchicago.org/Community-Economic-Development/Socioeconomically-Disadvantaged-Areas/2ui7-wiq8

Obtaining dataset through the Endpoint API

In [None]:
# Creating a GeoDataFrame object from original dataset (to be able to perform geography operations)
df = gpd.GeoDataFrame(chicago_crimes_clean, geometry=gpd.points_from_xy(chicago_crimes_clean['Longitude'], chicago_crimes_clean['Latitude']))[['ID', 'geometry']]


# Url to get the data from (USING ENDPOINT API); the query selects only the geometry of each area
url = 'https://data.cityofchicago.org/resource/2ui7-wiq8.json?$query=SELECT%20%60the_geom%60'

# Fetching data from data_source (url obtained in DatasetDescription.docx document)
# A timeout prevents the cell from hanging forever, and raise_for_status() turns an
# HTTP error into an explicit exception instead of a confusing parsing failure below
response = requests.get(url, timeout=60)
response.raise_for_status()

# One record per region, holding its polygon as a Shape object (easily read by GeoPandas)
raw_data = [
    {'geometry': geom.shape(region['the_geom'])}
    for region in response.json()
]

# Converting the geometries to a GeoDataFrame (to be able to use geometry join)
unadvantage_zones = gpd.GeoDataFrame(raw_data)
# Creating new column to store the data (which regions are Unadvantaged)
unadvantage_zones['Unadvantage Zone'] = True

Visualizing the regions

In [None]:

def _orange_fill(_feature):
    # Same style for every drawn region
    return {"fillColor": "orange"}


# Base map centred on Chicago
m = folium.Map(location=[41.881832, -87.623177], zoom_start=11, tiles="CartoDB positron")

for zone_shape in unadvantage_zones["geometry"]:
    # Each region is simplified before being drawn: without it,
    # the map might not be displayed
    simplified = gpd.GeoSeries(zone_shape).simplify(tolerance=0.001)
    folium.GeoJson(data=simplified.to_json(), style_function=_orange_fill).add_to(m)

# Last expression of the cell, so the map is rendered
m

Merging regions into dataset

In [None]:
# Merge data onto original (with geoDataFrame) dataset
df = df.sjoin(unadvantage_zones, how='left', predicate='within', lsuffix='old', rsuffix='new') # Will merge regions onto DataFrame by checking in which region each datapoint is contained
df = df.drop(columns=['index_new']) # The merge creates a new column "index_new", we don't care about that

# Crimes outside every region have no value after the left join. Comparing with True
# yields a real boolean column (True inside a region, False otherwise) without relying
# on fillna() silently downcasting an object column.
df['Unadvantage Zone'] = df['Unadvantage Zone'].eq(True)

# A crime contained in several (overlapping) regions appears once per region;
# keep a single row per crime so the merge below cannot duplicate crimes
df = df.drop_duplicates(subset=['ID'])


# Join the data onto original dataset
chicago_crimes_clean  = chicago_crimes_clean.merge(df[['ID', 'Unadvantage Zone']], how='left', on=['ID'])

# Section 3. Data Visualization

After preparing all the data, we're going to visualize all the dataset and everything that it has to offer

## 3.1 Crimes per month + Crimes by types

In [None]:
# One figure with three side-by-side charts
fig, axs = plt.subplots(ncols=3, figsize=(20, 5))
monthly_ax, heatmap_ax, types_ax = axs[0], axs[1], axs[2]

# Number of crimes (count of "ID") per month
chicago_crimes_monthly = chicago_crimes_clean.groupby(by=['YearMonth', 'Year', 'Month'], as_index=False)[['ID']].count()
# Sequence of the months present in the data
month_labels = chicago_crimes_monthly['YearMonth'].unique().tolist()

# Chart 1: evolution of the monthly number of crimes
chicago_crimes_monthly.plot(x='YearMonth', y=['ID'], ax=monthly_ax, title='Monthly Crimes')

# Chart 2: Year x Month heatmap (combinations without crimes are shown as 0)
crimes_by_year_month = chicago_crimes_monthly.set_index(['Year', 'Month'])['ID'].unstack().fillna(0)
sns.heatmap(crimes_by_year_month, cmap='YlOrRd', linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_xlabel('Month')
heatmap_ax.set_ylabel('Year')
heatmap_ax.set_title('Chicago Crimes Heatmap')

# Chart 3: the ten most frequent crime types
crime_by_type = chicago_crimes_clean['Primary Type'].value_counts().head(10)
types_ax.set_title('Top 10 Crime Types')
types_ax.set_xlabel('Crime Type')
types_ax.set_ylabel('Number of Crimes')
sns.barplot(x=crime_by_type.index, y=crime_by_type.values, ax=types_ax)
types_ax.set_xticklabels(crime_by_type.index, rotation=45)

plt.show()

## 3.2 Likelihood of an arrest

In [None]:
# NOTE(review): the figures below use `chicago_crimes`, while the chart uses
# `chicago_crimes_clean` — confirm this difference is intended
l = chicago_crimes["Arrest"].value_counts()
# value_counts() orders its result by frequency, so position 0 is not guaranteed to be
# the "False" count; reindexing by label fixes the order (and yields 0 for an absent value)
# NOTE(review): assumes "Arrest" holds booleans — confirm against the CSV
arrest_counts = l.reindex([False, True], fill_value=0)
false = arrest_counts.iloc[0]
true = arrest_counts.iloc[1]

arrest = pd.DataFrame({'Status':['Not Arrested','Arrested'],'Value':[false, true]})
print("Percentage of no arrests of all reported crimes :",false/(false+true)*100,'!')

# How are arrests spread out across the years
plt.style.use('bmh')

fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.countplot(x="Year",
                   hue='Arrest',
                   data=chicago_crimes_clean[['Year','Arrest']],
                   palette=['Red', 'Green'])

ax.set(title='Arrests Made per Year', xlabel='Year', ylabel='Number of Crimes')
plt.title('Arrests Made per Year', fontdict={'fontsize': 20, 'color': 'black'}, weight="bold")
plt.show()

## 3.3 Number of crimes per hour

Aggregating number of crimes occurred in each hour

In [None]:
# Set plot style
plt.style.use('seaborn-dark')
sns.set_context('paper')

# Write code to plot
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Hour', data=chicago_crimes_clean, palette="viridis")

# Aesthetic appeal
plt.title("Unsafest Hours in Chicago", fontdict={'fontsize': 15, 'color': '#bb0e14','fontname':'Agency FB'}, weight="bold")
plt.xlabel("\nHour in the Day", fontdict={'fontsize': 15}, weight='bold')
plt.ylabel("Number of Crimes\n", fontdict={'fontsize': 15}, weight="bold")

plt.show()

## 3.4 Mapping yearly crimes per type

In [None]:
# A 10% random sample keeps the interactive map responsive
chicago_crimes_sample = chicago_crimes_clean.sample(frac=0.1)

# One point per sampled crime, coloured by primary type, with one animation frame per year
fig = px.scatter_mapbox(
    chicago_crimes_sample,
    animation_frame='Year',
    lat='Latitude',
    lon='Longitude',
    color='Primary Type',
    hover_name='Primary Type',
    hover_data=['Arrest','District','periodOfDay'],
    mapbox_style='open-street-map',
    height=800,
)

# Title and initial viewport of the map
initial_view = dict(zoom=10, center=dict(lat=41.8781, lon=-87.6298))
fig.update_layout(title='Chicago Crimes by Primary Type', mapbox=initial_view)

# Show the map
fig.show()

## 3.5 Correlation between Variables

In [None]:
# Pairwise correlation between the numeric features, drawn as a heatmap
correlations = chicago_crimes_clean.corr(numeric_only=True)
sns.heatmap(correlations, cmap='YlOrRd', linewidths=0.5)

## 3.6 Visualizing social unadvantaged regions

Mapping crimes inside and outside regions

In [None]:
# A 10% random sample, with the flag column renamed to the label shown on the map
chicago_crimes_sample = chicago_crimes_clean.sample(frac=0.1).rename(columns={'Unadvantage Zone': 'Socioeconomically Disadvantaged Areas'})

# Explicit boolean mask: negating with "~" is only a logical NOT on a real bool dtype
in_zone = chicago_crimes_sample['Socioeconomically Disadvantaged Areas'].astype(bool)
crimes_outside_zones = chicago_crimes_sample[~in_zone]
crimes_in_zones = chicago_crimes_sample[in_zone]


# Map of the sampled crimes, coloured by whether they happened inside a disadvantaged area.
# The value -> colour mapping belongs in `color_discrete_map`; `color_discrete_sequence`
# expects a plain list of colours that is assigned in order of appearance.
fig = px.scatter_mapbox(chicago_crimes_sample, animation_frame='Year',lat='Latitude', lon='Longitude',
                        color='Socioeconomically Disadvantaged Areas', color_discrete_map={False:'#8bb4f2', True:'#f28b8b'},
                        hover_name='Socioeconomically Disadvantaged Areas', hover_data=['Arrest','District'],
                        mapbox_style='open-street-map', height=800)

# Customize the appearance and layout of the map
fig.update_layout(
    mapbox=dict(
        zoom=10,
        center=dict(lat=41.8781, lon=-87.6298)
    )
)

# Show the map
fig.show()

Comparing crime types per region type

In [None]:
# Three most frequent crime types outside / inside the socioeconomically disadvantaged areas
crime_counts_outside = crimes_outside_zones['Primary Type'].value_counts().head(3)
crime_counts_in_zones = crimes_in_zones['Primary Type'].value_counts().head(3)

# Both series are drawn on the same axes: (legend label, colour, counts)
bar_series = (
    ('Non-Socioeconomically Disadvantaged Areas', 'skyblue', crime_counts_outside),
    ('Socioeconomically Disadvantaged Areas', 'lightcoral', crime_counts_in_zones),
)

plt.figure(figsize=(8, 6))
for label, colour, counts in bar_series:
    plt.bar(counts.index, counts.values, color=colour, alpha=0.8, label=label)

plt.title('Comparison of Top Crime Types')
plt.xlabel('Crime Type')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=45, ha='right')
plt.legend()

# Horizontal dashed grid lines
plt.grid(axis='y', linestyle='--')

# Fit labels and legend inside the figure
plt.tight_layout()

# Display the combined plot
plt.show()


## 3.7 Clustering with K-means

Defining parameters to use for K-Means, and transforming them into numerical types

In [None]:
# Features used by K-Means; the coordinates are only carried along for plotting
kmeans_cols = ['Domestic', 'periodOfDay', 'Month', 'DayOfWeek', 'Beat', 'Primary Type', 'Arrest']
df_aux = chicago_crimes_clean[kmeans_cols + ['Latitude', 'Longitude']].copy()

text_dtypes = ('object', 'str', 'string')
for col in kmeans_cols:
    dtype_name = str(df_aux[col].dtype)
    if dtype_name in text_dtypes:
        # Text feature -> category -> integer code of each category
        df_aux[col] = df_aux[col].astype('category').cat.codes
    elif dtype_name == 'bool':
        # Boolean feature -> 0/1
        df_aux[col] = df_aux[col].astype('int8')

###  Elbow Method

In order to find the optimal number of clusters, we're going to use the elbow method

In [None]:
# Settings shared by every K-Means run
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# Candidate numbers of clusters
k_values = range(1, 20)
features = df_aux[kmeans_cols]

# Inertia (reported as SSE below) of the model fitted for each candidate k
nrclusters_inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(features)
    nrclusters_inertia.append(kmeans.inertia_)

plt.plot(k_values, nrclusters_inertia)
plt.xticks(k_values)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

# Locate the elbow of the (convex, decreasing) inertia curve
kl = KneeLocator(k_values, nrclusters_inertia, curve="convex", direction="decreasing")
print("The optimal number of clusters is -> " + str(kl.elbow) + ' <- by the Elbow Method.')

### K-Means Clustering Model

In [None]:
# Same settings as the ones used for the elbow method
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# The number of clusters is fixed to 4 here (the value reported as optimal above)
kmeans = KMeans(4, **kmeans_kwargs)

# Fit the model on the encoded features, then label every datapoint with its cluster
features = df_aux[kmeans_cols]
kmeans.fit(features)
df_aux['Cluster'] = kmeans.predict(features)

### Plotting Clusters per beats

In [None]:

# A 5% random sample keeps the interactive map responsive
df_sample = df_aux.sample(frac=0.05)

# One point per sampled crime, coloured by the cluster it was assigned to
fig = px.scatter_mapbox(
    df_sample,
    lat='Latitude',
    lon='Longitude',
    color='Cluster',
    mapbox_style='open-street-map',
    height=800,
)

# Title and initial viewport of the map
initial_view = dict(zoom=10, center=dict(lat=41.8781, lon=-87.6298))
fig.update_layout(title='K-Means clustering', mapbox=initial_view)

# Show the map
fig.show()

# Section 4. Association Rules

In this section we perform the association rules from the dataset with the objective to learn eventual co-occurrence of crimes.

For this we use first the Apriori method and later the FP-Growth.

It is worth noting that we used a sample of the original dataset, because the full dataset was too large to process in a reasonable time.

## Visualizing Item Frequency

In [None]:
# Work on a 4% random sample of the cleaned dataset
df_sample = chicago_crimes_clean.sample(frac=0.04)

# Columns relevant for the association analysis
df = df_sample[['Primary Type', 'Location Description', 'Date', 'Beat']]

# One transaction per day: the list of crime types recorded on that date
crime_types_per_day = df.groupby(['Date'])['Primary Type'].apply(list)
transactions = crime_types_per_day.values.tolist()

# One-hot encode the transactions (one column per crime type)
te = TransactionEncoder()
te_array = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

# Support of each item: share of transactions that contain it
n_transactions = len(df_encoded)
item_support = df_encoded.sum() / n_transactions

# Most frequent items first
sorted_items = item_support.sort_values(ascending=False)

# Bar chart of the item supports
plt.figure(figsize=(10, 6))
plt.bar(sorted_items.index, sorted_items.values)
plt.xlabel('Item')
plt.ylabel('Support')
plt.title('Item Frequency (Support)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Apriori

Association Rules

In [None]:
# Apply Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Generate association rules
df_association_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)

# Filter rules based on desired metrics or thresholds
filtered_rules = df_association_rules[(df_association_rules['confidence'] > 0.7) & (df_association_rules['lift'] > 1.2)]
filtered_rules = filtered_rules.sort_values(by = 'support', ascending = False)

# Print the association rules
for index, rule in filtered_rules.iterrows():
    antecedents = ', '.join(list(rule['antecedents']))
    consequents = ', '.join(list(rule['consequents']))
    print(f"Rule: {antecedents} -> {consequents}")
    print(f"Support: {rule['support']}")
    print(f"Confidence: {rule['confidence']}")
    print(f"Lift: {rule['lift']}")
    print("----------------------------")

Visualizing Association Rules

In [None]:
# Scatter plot of the filtered rules: position shows support and confidence,
# marker size and colour both show lift
plt.figure(figsize=(8, 6))
scatterplot = sns.scatterplot(
    x='support',
    y='confidence',
    hue='lift',
    size='lift',
    palette='viridis',
    data=filtered_rules,
)
scatterplot.set_xlabel('Support')
scatterplot.set_ylabel('Confidence')
scatterplot.set_title('Association Rules')

plt.show()

## FP-Growth

Association Rules

In [None]:
# The import cell at the top of the notebook only brings in `TransactionEncoder`
# from mlxtend, so import the mining helpers used by this cell here.
from mlxtend.frequent_patterns import association_rules, fpgrowth

# Select relevant columns
df = df_sample[['Primary Type', 'Location Description', 'Beat', 'Date']].copy()

# Convert the dataset into a transactional format
# (one list of crime types per distinct 'Date' value)
transactions = df.groupby(['Date'])['Primary Type'].apply(list).values.tolist()

# Encode the transactional data
te = TransactionEncoder()
te_array = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)


# Apply FP-Growth algorithm to find frequent itemsets
frequent_itemsets = fpgrowth(df_encoded, min_support=0.01, use_colnames=True)

# Generate association rules
df_association_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

In [None]:
# Keep only the strong rules: confidence above 0.8 and lift above 1.2
filtered_rules = df_association_rules.loc[
    lambda rules: (rules['confidence'] > 0.8) & (rules['lift'] > 1.2)
]

# Report every retained rule with its metrics
for _, rule in filtered_rules.iterrows():
    left_side = ', '.join(list(rule['antecedents']))
    right_side = ', '.join(list(rule['consequents']))
    print(
        f"Rule: {left_side} -> {right_side}",
        f"Support: {rule['support']}",
        f"Confidence: {rule['confidence']}",
        f"Lift: {rule['lift']}",
        "----------------------------",
        sep="\n",
    )

# Order the retained rules from highest to lowest lift
association_rules_graph = filtered_rules.sort_values(by='lift', ascending=False)

# Bar chart of the ten rules with the highest lift
plt.figure(figsize=(10, 6))
sns.barplot(data=association_rules_graph.head(10), x='lift', y='antecedents')
plt.title('Top 10 Association Rules by Lift')
plt.xlabel('Lift')
plt.ylabel('Antecedents')
plt.show()

Visualizing Association Rules

In [None]:
# Scatter plot of the filtered rules: position shows support and confidence,
# marker size and colour both show lift
plt.figure(figsize=(8, 6))
scatterplot = sns.scatterplot(
    x='support',
    y='confidence',
    hue='lift',
    size='lift',
    palette='viridis',
    data=filtered_rules,
)
scatterplot.set_xlabel('Support')
scatterplot.set_ylabel('Confidence')
scatterplot.set_title('Association Rules')

plt.show()

# Section 5. Link Analysis

## CrimeType -> Location

In [None]:
# Link analysis runs on the first 100 rows of the sample only
df = df_sample[
    ['Primary Type', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'Year', 'District']
].head(100).copy()

# Directed graph with one edge crime type -> location per row
G = nx.DiGraph()
G.add_edges_from(zip(df['Primary Type'], df['Location Description']))

# Score every node with PageRank
page_rank = nx.pagerank(G)

# Node names ordered from highest to lowest PageRank score
sorted_nodes = sorted(page_rank, key=lambda name: page_rank[name], reverse=True)

# Show the ten best-ranked nodes
for node in sorted_nodes[:10]:
    print(f"Node: {node}, PageRank Score: {page_rank[node]}")


# Draw the graph with a spring layout
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, k=0.3)
nx.draw_networkx(
    G,
    pos,
    with_labels=True,
    node_size=1500,
    node_color='lightblue',
    edge_color='gray',
    font_size=10,
    width=1,
    alpha=0.7,
)
plt.title('Crime-related Entities Graph')
plt.show()

## Using some other features

The features used in this link analysis were:
* Primary Type
* Description
* Location
* Arrest
* Domestic
* Year
* Month
* Weekday
* WeekMonth

In [None]:
# Add the link-analysis folder to the import path only for the duration of the
# import (importing `reportFinal` runs its top-level code).
link_analysis_dir = './data/linkAnalysis'
sys.path.insert(0, link_analysis_dir)
try:
    import reportFinal
finally:
    # Undo the path change even when the import fails, and remove our own
    # entry by value instead of assuming it is still at position 0.
    if link_analysis_dir in sys.path:
        sys.path.remove(link_analysis_dir)

# Section 6. Predictive Modeling

### Defining predicted feature

In [None]:
# Name of the target column that the models below are trained to predict
prediction_attr = 'Arrest'

## Feature Selection

In [None]:
# Predictor columns used by every model
features = [
    'Location Description',
    'Description',
    'Unadvantage Zone',
    'DayOfWeek',
    'Month',
    'periodOfDay',
    'Beat',
    'Primary Type',
    'Domestic',
]

# Modelling table: the predictor columns followed by the target column
df_final = chicago_crimes_clean[[*features, prediction_attr]].copy()

### Converting all features to numerical Type

In [None]:
# Converting all columns to a numerical type
for col in df_final.columns:
    dtype_name = str(df_final[col].dtype)
    if dtype_name in ('object', 'str', 'string'):
        # Text column: go through a categorical and keep its integer codes
        df_final[col] = df_final[col].astype('category').cat.codes
    elif dtype_name == 'bool':
        # Boolean column: store as a small integer
        df_final[col] = df_final[col].astype('int8')

## Definition of Models

### Logistic Regression

#### Model Definition

In [None]:
# Create an instance of the logistic regression model, leaving every
# hyperparameter at the library default (no arguments are passed)
logreg = LogisticRegression()

### XGBoost

#### Hyperparameter Search

In this section we look for the best hyperparameters to use in our XGBoost model.

In [None]:
# Hyperparameter search space. 'n_estimators' and 'seed' are constants, the
# other entries are sampled by hyperopt.
space = {
    'max_depth': hp.quniform("max_depth", 3, 35, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 0, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    # Start at 0.05 rather than 0: a learning rate of exactly 0 cannot learn
    'learning_rate': hp.quniform('learning_rate', 0.05, 1, 0.05),
    'n_estimators': 30,
    'seed': 123,
}


# Defining training and testing sets: each row goes to the training set with
# probability 0.7. The draw is vectorised and seeded, so the split is fast and
# identical between runs.
_split_rng = np.random.default_rng(space['seed'])
_in_train = _split_rng.random(len(df_final)) < 0.7
train = df_final[_in_train].copy()
test = df_final[~_in_train].copy()


# search function for hyperparameter
def objective(space):
    """Train one XGBoost candidate and return its loss for hyperopt.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads the keys
        'learning_rate', 'n_estimators', 'max_depth', 'gamma', 'reg_alpha',
        'reg_lambda', 'min_child_weight', 'colsample_bytree' and 'seed'.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``, where accuracy is
        measured on the module-level ``test`` frame. The loss is negated
        because hyperopt minimises it.
    """
    clf = xgb.XGBClassifier(
        objective='reg:logistic',
        booster='gbtree',
        learning_rate=space['learning_rate'],
        n_estimators=space['n_estimators'],
        # quniform samples floats; these parameters are cast to integers
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Previously sampled but never handed to the model
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        # Must stay a float: int() truncated every sampled fraction to 0
        colsample_bytree=space['colsample_bytree'],
        random_state=space['seed'],
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    # NOTE(review): the test frame is also used as an evaluation set for early
    # stopping, so the reported accuracy is optimistic -- consider a separate
    # validation split.
    evaluation = [(train[features], train[prediction_attr]), (test[features], test[prediction_attr])]

    clf.fit(train[features], train[prediction_attr],
            eval_set=evaluation, verbose=0)

    pred = clf.predict(test[features])
    accuracy = accuracy_score(test[prediction_attr], pred>0.5)
    return {'loss': -accuracy, 'status': STATUS_OK }

# Run the TPE search for 100 evaluations, recording each one in `trials`
trials = Trials()
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

# Report the best configuration found
print("The best hyperparameters are : ", best_hyperparams, sep="\n")

#### Defining Model

In this section, we use the `xgboost` Python package, whose `XGBClassifier` class lets us build an *XGBoost* classification model.

In [None]:
# XGBoost classifier with fixed hyperparameters
XGBoostModel = xgb.XGBClassifier(
    booster='gbtree',
    objective='reg:logistic',
    n_estimators=30,
    max_depth=11,
    learning_rate=0.8,
    colsample_bytree=0.6038,
    min_split_loss=5,
)

### Random Forest

#### Hyperparameter Search

After obtaining the features to use in the model, we perform a hyperparameter search to try to find the best arguments to use on the model.

In [None]:
# Define the parameter grid for hyperparameter search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    # Number of features to consider when looking for the best split.
    # 'auto' is no longer accepted by scikit-learn (for classifiers it meant
    # the same as 'sqrt'), so the second candidate is now 'log2'.
    'max_features': ['sqrt', 'log2'],
    'random_state': [42]  # Random state for reproducibility
}

# Initialize a Random Forest classifier
rf_model = RandomForestClassifier()

# Defining new train and test dataset with new selected features
X_train, X_test, y_train, y_test = train_test_split(df_final[features], df_final['Arrest'], test_size=0.3, random_state=42)

# Perform randomized search with cross-validation (5 sampled configurations, 3 folds each)
random_search = RandomizedSearchCV(rf_model, param_distributions=param_grid, n_iter=5, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the testing set using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the best hyperparameters and accuracy score
print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")
print(f"Accuracy: {accuracy:.4f}")

#### Defining Model

We then input all the discovered hyperparameters

In [None]:
# Hyperparameter values for the final random forest
random_state, n_estimators = 42, 100
min_samples_split, min_samples_leaf = 10, 4
max_features, max_depth = 'sqrt', None

rf_classifier = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    random_state=random_state,
)

### LGBM

#### Defining Model

In [None]:
# LightGBM classifier with fixed hyperparameters
LGBMModel = lgb.LGBMClassifier(
    n_estimators=400, random_state=42,
    num_leaves=20, max_depth=5,
    # `silent=True` is no longer a constructor argument in current LightGBM
    # releases; `verbosity=-1` is the supported way to keep training quiet.
    verbosity=-1,
    metric='None',
    n_jobs=6,
    colsample_bytree=0.9,
    subsample=0.9,
    learning_rate=0.3
)

## Evaluation

### Evaluation Class
In order to compare all our different models, we created a class which, given a certain model and dataset, evaluates the model with several different validation methods. Afterwards, we apply this class to all the models defined previously and compare them with each other.

The methods included:
* Holdout Method
* Random SubSampling
* Stratified K-Fold Cross Validation
* Leave One Out Cross Validation
* ROC + AUC score
* Confusion Matrix

**NOTE**: Although `Leave_One_Out_Cross_Validation` is implemented, it is not run by default: it is very expensive (it took roughly 8 hours in our tests). It is therefore excluded from `run_all_evaluations` unless explicitly requested, and this exclusion did not affect the results of our tests.

In [None]:
class EvalModel:
    """Fit one model on a dataset and score it under several validation schemes.

    The reported "error" is the signed difference ``prediction - truth``:
    ``avg_err`` is its mean and ``std_err`` its standard deviation (for the
    cross-validation and bootstrap schemes these are the mean / standard
    deviation of the per-split mean errors).

    Every scheme refits ``self.model`` in place and stores its latest split in
    ``self.train`` / ``self.test`` (lists of frames for the cross-validation
    schemes, single frames otherwise).
    """

    def __init__(self, model=None, model_name='', dataset: pd.DataFrame=None, prediction_attrs=None):
        """Store the model, its display name, a copy of the data and the target column(s).

        ``prediction_attrs`` may be a single column name or a list of names.

        Raises:
            ValueError: if no dataset is given.
        """
        if dataset is None:
            raise ValueError('EvalModel requires a dataset (pandas DataFrame)')

        self.model = model
        self.label = model_name
        self.df = dataset.copy()

        if not isinstance(prediction_attrs, list):
            prediction_attrs = [prediction_attrs]
        self.pred_attrs = prediction_attrs

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _features(self, frame):
        """Return ``frame`` without the target column(s)."""
        return frame.drop(columns=self.pred_attrs)

    def _target(self, frame):
        """Return the target of ``frame``: 1-D for a single target, a frame otherwise."""
        if len(self.pred_attrs) == 1:
            return frame[self.pred_attrs[0]]
        return frame[self.pred_attrs]

    def _report(self):
        """Print the current ``avg_err`` / ``std_err`` for the current method."""
        print(f'<{self.label}> {self.method} Error: {self.avg_err:.3f} +/- {self.std_err:.3f}')

    def _positive_scores(self, features):
        """Return continuous scores for the positive class when the model offers them.

        Uses column 1 of ``predict_proba`` for a two-column output and falls
        back to ``predict`` otherwise.
        """
        predict_proba = getattr(self.model, 'predict_proba', None)
        if callable(predict_proba):
            proba = np.asarray(predict_proba(features))
            if proba.ndim == 2 and proba.shape[1] == 2:
                return proba[:, 1]
        return self.model.predict(features)

    def _cross_validate(self, splitter, print_folds, print_evaluation):
        """Fit and evaluate the model on every fold produced by ``splitter``."""
        folds = splitter.split(self._features(self.df), self._target(self.df))
        scores = []

        self.train = []
        self.test = []
        for k, (train_idx, test_idx) in enumerate(folds):
            # fit() and evaluate() always work on the most recent fold
            self.train.append(self.df.iloc[train_idx])
            self.test.append(self.df.iloc[test_idx])

            self.fit()

            score = self.evaluate(print_evaluation=False)
            scores.append(score)
            if print_folds:
                distribution = np.bincount(self.df[self.pred_attrs].iloc[test_idx].to_numpy().T[0])
                print(f'Fold: {k+1:2d}, Training/Test Split Distribution: {distribution}, Error: {score:.3f}')

        self.avg_err = np.mean(scores)
        self.std_err = np.std(scores)
        if print_evaluation:
            self._report()

    # ------------------------------------------------------------------
    # Evaluation metrics
    # ------------------------------------------------------------------
    def _mean_prediction_error(self, real, pred):
        """Mean of the signed error ``pred - real``."""
        return np.average(pred-real)

    def _standard_error(self, real, pred):
        """Standard deviation of the signed error ``pred - real``."""
        return np.std(pred-real)

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate(self, print_evaluation=True):
        """Predict on the current test split and return the mean signed error.

        Also stores ``real``, ``pred``, ``avg_err`` and ``std_err`` on the instance.
        """
        test = self.test if not isinstance(self.train, list) else self.test[-1]

        self.real = test[self.pred_attrs]
        self.pred = self.model.predict(self._features(test))

        if len(self.pred_attrs) == 1:
            self.real = self.real.to_numpy().T[0]

        self.std_err = self._standard_error(self.real, self.pred)
        self.avg_err = self._mean_prediction_error(self.real, self.pred)

        if print_evaluation:
            self._report()
        return self.avg_err

    # Fitting model
    def fit(self):
        """Fit the model on the current training split and return the fit result."""
        train = self.train[-1] if isinstance(self.train, list) else self.train
        # A single target is passed as 1-D, which is what estimators expect
        return self.model.fit(self._features(train), self._target(train))

    # ------------------------------------------------------------------
    # Evaluation methods
    # ------------------------------------------------------------------
    def Houldout_Method(self, train_ratio=0.7, print_evaluation=True, **kwargs):
        """Train on the first ``train_ratio`` share of the rows, test on the rest."""
        self.method = 'Holdout Method'
        cut = int(self.df.shape[0]*train_ratio)
        self.train = self.df.iloc[:cut].copy()
        self.test = self.df.iloc[cut:].copy()

        self.fit()
        self.evaluate(print_evaluation=print_evaluation)

    # Correctly spelled alias; the original name is kept for existing callers
    Holdout_Method = Houldout_Method

    def Random_Subsampling(self, seed=1, val_ratio=0.7, print_evaluation=True):
        """Assign each row to the training set with probability ``val_ratio``.

        Despite its name, ``val_ratio`` is the expected *training* share.
        The draw uses a private generator seeded with ``seed``, so it is
        reproducible and leaves NumPy's global random state untouched.
        """
        self.method = 'Random SubSampling'

        rng = np.random.default_rng(seed)
        in_train = rng.random(len(self.df)) < val_ratio
        self.train = self.df[in_train].copy()
        self.test = self.df[~in_train].copy()

        self.fit()
        self.evaluate(print_evaluation=print_evaluation)

    def Stratified_K_Fold(self, seed=1, n_splits=10, print_folds=True, print_evaluation=True):
        """Stratified k-fold cross validation with ``n_splits`` folds.

        ``seed`` is accepted for backward compatibility only: the folds are
        built without shuffling, so the split does not depend on it.
        """
        self.method='k-fold Cross Validation'
        self._cross_validate(StratifiedKFold(n_splits=n_splits), print_folds, print_evaluation)

    def Leave_One_Out_Cross_Validation(self, print_folds=True, print_evaluation=True, **kwargs):
        """Leave-one-out cross validation (one model fit per row)."""
        self.method='Leave One Out Cross Validation'
        self._cross_validate(LeaveOneOut(), print_folds, print_evaluation)

    def Bootstrap_Method(self, n_tries=200, print_evaluation=True, **kwargs):
        """Repeat ``Random_Subsampling`` ``n_tries`` times and aggregate the errors.

        Extra keyword arguments are forwarded to ``Random_Subsampling``. A
        ``seed`` keyword (default 1) is used as the base seed; try ``i`` uses
        ``seed + i`` so that every try draws a different split.
        ``avg_err`` / ``std_err`` end up as the mean / standard deviation of
        the per-try mean errors.
        """
        base_seed = kwargs.pop('seed', 1)
        scores = []
        for attempt in range(n_tries):
            seed = None if base_seed is None else base_seed + attempt
            self.Random_Subsampling(seed=seed, print_evaluation=False, **kwargs)
            scores.append(self.avg_err)

        self.method = 'Bootstrap Method'
        self.avg_err = np.mean(scores)
        self.std_err = np.std(scores)
        if print_evaluation:
            self._report()

    def ROC(self, figsize=(8, 6), metric_index_to_compare=0, print_auc_score=True):
        """Refit on a random subsample, then plot the ROC curve and compute the AUC.

        The curve is built from positive-class probabilities when the model
        provides them, so it has more than one threshold point.
        """
        self.Random_Subsampling(print_evaluation=False)

        self.real = self.test[self.pred_attrs[metric_index_to_compare]].to_numpy().T
        self.pred = self._positive_scores(self._features(self.test))

        # ROC Curve
        self.roc_curve = roc_curve(self.real, self.pred)
        self.false_positive_rate = self.roc_curve[0]
        self.true_positive_rate = self.roc_curve[1]
        self.threshholds = self.roc_curve[2]

        self.roc_auc_score = roc_auc_score(self.real, self.pred)
        if print_auc_score:
            print(f'<{self.label}> ROC AUC score: {self.roc_auc_score:.3f}')

        plt.figure(figsize=figsize)
        ax = plt.gca()
        ax.plot(self.false_positive_rate, self.true_positive_rate)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve <{self.label}>')
        plt.show()

    def conf_matrix(self, metric_index_to_compare=0, figsize=(6, 6)):
        """Refit on a random subsample and plot the confusion matrix as a heatmap."""
        class_names = np.array([0, 1])

        # Train and divide into test and train
        self.Random_Subsampling(print_evaluation=False)

        # Real and predicted values of the test set; predictions are rounded
        # and capped at 1
        self.real = self.test[self.pred_attrs[metric_index_to_compare]].to_numpy().T
        self.pred = np.minimum(self.model.predict(self._features(self.test)).round(), 1)

        matrix = confusion_matrix(self.real, self.pred)
        plt.figure(figsize=figsize)
        sns.set(font_scale=1)
        sns.heatmap(matrix, annot=True, annot_kws={'size':10}, cmap=plt.cm.Greens, linewidths=0.2, fmt='g')

        # Add labels to the plot, centred on the heatmap cells
        tick_positions = class_names + 0.5
        plt.xticks(tick_positions, class_names, rotation=0)
        plt.yticks(tick_positions, class_names, rotation=0)
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.show()

    def run_all_evaluations(self, leave_one_out=False, stratified_splits=10):
        """Run every evaluation in turn; leave-one-out only when requested."""
        self.Houldout_Method()
        self.Random_Subsampling()
        self.Stratified_K_Fold(print_folds=False, n_splits=stratified_splits)
        if leave_one_out:
            self.Leave_One_Out_Cross_Validation(print_folds=False)
        self.ROC()
        self.conf_matrix()

### Evaluating models

In [None]:
# Evaluate every model with the same procedure. Each evaluator is kept in its
# own variable so that the results can still be inspected afterwards.
print(f'{"#":#^50s}\n#{"Evaluating <Logistic Regression>":^48s}#\n{"#":#^50s}')
model = 'Logistic Regression'
logreg_eval = EvalModel(logreg, model_name=model, dataset=df_final, prediction_attrs=prediction_attr)
logreg_eval.run_all_evaluations()

print(f'{"#":#^50s}\n#{"Evaluating <XGBoost>":^48s}#\n{"#":#^50s}')
model = 'XGBoost'
xgboost_eval = EvalModel(XGBoostModel, model_name=model, dataset=df_final, prediction_attrs=prediction_attr)
xgboost_eval.run_all_evaluations()

print(f'{"#":#^50s}\n#{"Evaluating <Random Forest>":^48s}#\n{"#":#^50s}')
model = 'Random Forest'
rf_eval = EvalModel(rf_classifier, model_name=model, dataset=df_final, prediction_attrs=prediction_attr)
rf_eval.run_all_evaluations()

print(f'{"#":#^50s}\n#{"Evaluating <LGBM>":^48s}#\n{"#":#^50s}')
model = 'LGBM'
# Stored as `lgbm_eval`: it used to be assigned to `rf_eval`, which discarded
# the Random Forest evaluator created just above.
lgbm_eval = EvalModel(LGBMModel, model_name=model, dataset=df_final, prediction_attrs=prediction_attr)
lgbm_eval.run_all_evaluations()