In [None]:
# Upload the dataset CSV from the local machine into the Colab runtime.
# NOTE(review): `google.colab` is only importable inside Google Colab. The
# reading cell below opens 'fatal-police-shootings-data.csv' by name, so the
# uploaded file is presumably expected to carry exactly that name — confirm.

from google.colab import files

fatal_police_shootings_data = files.upload()



In [None]:
# Imports
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import random

In [None]:
# Reading data from file
data = pd.read_csv('fatal-police-shootings-data.csv')
# Keep the raw 'date' column in its own variable: the cleaning step drops
# 'date' from `data`, but the per-year analyses further down still need it.
date_df = data['date']

In [None]:
data.info()
# Analysis
# In total there are 14 columns and 5338 entries (rows).
# Columns that have null values, and how the cleaning step below treats them:
# "armed"  (240 missing, ~4.5% of rows)  --> filled with 'undetermined'
# "age"    (249 missing, ~4.7% of rows)  --> left as NaN; dropped only where age is analysed
# "gender" (2 missing,   ~0.04% of rows) --> filled with 'unknown'
# "race"   (607 missing, ~11.4% of rows) --> filled with 'unknown', then those rows are removed
# "flee"   (250 missing, ~4.7% of rows)  --> column dropped

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5338 entries, 0 to 5337
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5338 non-null   int64  
 1   name                     5338 non-null   object 
 2   date                     5338 non-null   object 
 3   manner_of_death          5338 non-null   object 
 4   armed                    5098 non-null   object 
 5   age                      5089 non-null   float64
 6   gender                   5336 non-null   object 
 7   race                     4731 non-null   object 
 8   city                     5338 non-null   object 
 9   state                    5338 non-null   object 
 10  signs_of_mental_illness  5338 non-null   bool   
 11  threat_level             5338 non-null   object 
 12  flee                     5088 non-null   object 
 13  body_camera              5338 non-null   bool   
dtypes: bool(2), float64(1), 

In [None]:
# Number of distinct values in each column of the dataset
# (a quick way to tell categorical columns from high-cardinality ones)
data.nunique()

id                         5338
name                       5115
date                       1823
manner_of_death               2
armed                        92
age                          77
gender                        2
race                          6
city                       2447
state                        51
signs_of_mental_illness       2
threat_level                  3
flee                          4
body_camera                   2
dtype: int64

# Clean Data


In [None]:
# Merging & Cleaning Data


# Delete Duplicated Data - rows that are exact copies of an earlier row are removed.
# `duplicated_rows` flags rows whose id repeats an earlier one;
# `duplicated_rows.sum()` gives the number of such rows.
duplicated_rows = data.duplicated(subset=['id'])
data = data.drop_duplicates()

# Delete Useless Data - date & name & flee
data = data.drop(['date', 'name', 'flee'], axis=1)

# Replace Empty Cell - 'armed', 'gender', 'race'
# The fill is applied to the frame itself (one value per column) and the result
# is bound back to `data`. Calling `data[col].fillna(..., inplace=True)` acts on
# a temporary column selection: it warns in pandas 2.x and leaves `data`
# untouched once Copy-on-Write is enabled.
data = data.fillna({'armed': 'undetermined', 'gender': 'unknown', 'race': 'unknown'})

# Remove data values of features that is not useful
# (1) Race : rows whose race is unknown cannot be used in the per-race analyses.
# The original row labels are kept on purpose (no index reset) so that
# `date_df` can still be aligned with `data` later on.
data = data.drop(index=data.index[data['race'] == 'unknown'])

data.head()


Unnamed: 0,id,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,body_camera
0,3,shot,gun,53.0,M,A,Shelton,WA,True,attack,False
1,4,shot,gun,47.0,M,W,Aloha,OR,False,attack,False
2,5,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,False
3,8,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,False
4,9,shot,nail gun,39.0,M,H,Evans,CO,False,attack,False


In [None]:

# Cleaned armed data
def count_each_armed_type(armed_str):
    """Split an 'armed' description into its individual weapon names.

    A description is only broken up when it contains the joiner ' and '.
    In that case every ' and '-separated piece is split again on commas and
    each resulting item is stripped of surrounding whitespace. Any other
    description is returned unchanged as a one-element list.

    Parameters
    ----------
    armed_str : str
        Raw value of the 'armed' column.

    Returns
    -------
    list of str
        The individual weapon names.
    """
    if ' and ' not in armed_str:
        return [armed_str]

    weapons = []
    for chunk in armed_str.split(' and '):
        for piece in chunk.split(','):
            weapons.append(piece.strip())
    return weapons

# One (race, armed type) record per weapon: a victim carrying several weapons
# contributes one record for each of them.
expanded_data = [
    {'race': race, 'armed': armed_type}
    for race, armed in zip(data['race'], data['armed'])
    for armed_type in count_each_armed_type(armed)
]

expanded_df = pd.DataFrame(expanded_data)

print(expanded_df)

     race         armed
0       A           gun
1       W           gun
2       H       unarmed
3       W    toy weapon
4       H      nail gun
...   ...           ...
4779    W           gun
4780    W           gun
4781    W       unarmed
4782    H           gun
4783    W  undetermined

[4784 rows x 2 columns]


# <Victims by age, gender, race, sign_of_mental_illness>

In [None]:
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left")

# One entry per panel: (column, trace name, x-axis title, grid row, grid column)
panels = [
    ('age', "age", "Age", 1, 1),
    ('race', "race", "Race", 1, 2),
    ('gender', "gender", "Gender", 2, 1),
    ('signs_of_mental_illness', "mental illness", "Sign of Mental Illness", 2, 2),
]

# A bar chart of value frequencies for each panel
for column, trace_name, _, grid_row, grid_col in panels:
    tally = data[column].value_counts()
    fig.add_trace(go.Bar(x=tally.index, y=tally.values, name=trace_name),
                  row=grid_row, col=grid_col)

# Centered figure title
fig.update_layout(
    title=dict(text="Number of Victims Based on:", x=0.5, xanchor='center', yanchor='top'))

for _, _, axis_title, grid_row, grid_col in panels:
    fig.update_xaxes(title_text=axis_title, row=grid_row, col=grid_col)


fig.show()





# <Change in the number of "Fatal Police Shootings" over the years (2015-2020)>

In [None]:
# Number of incidents per calendar year (the date strings are parsed here)
date_df = pd.to_datetime(date_df)
no_incidents = date_df.dt.year.value_counts()
year_table = pd.DataFrame({"year": no_incidents.index, "count": no_incidents.values})

# Visualization1 : Number of incidents(count) - Pie Chart, largest year first
incidents_df = year_table.sort_values(by="count", ascending=False)
fig_pie = px.pie(incidents_df, names='year', values='count',
                 title='Number of Fatal Police Shootings per Year')
fig_pie.update_traces(direction="clockwise")
fig_pie.update_layout(title_x=0.5)
fig_pie.show()

### Visualization2: TimeData - Line Graph, in chronological order
incidents_df = year_table.sort_values(by='year')
fig_timeline = px.line(incidents_df, x='year', y="count",
                       title='Change in the Number of Fatal Police Shootings per Year')
fig_timeline.update_layout(title_x=0.5)
fig_timeline.show()



### Why?

In [None]:
# Incidents per month for every year, plus the mean monthly count of each year.
# `date_df` must already hold datetimes (it is converted in the previous cell).
yearly_counts = {
    year: date_df[date_df.dt.year == year].dt.month.value_counts().sort_index()
    for year in range(2015, 2021)
}
montly_mean_per_year = {
    year: month_counts.mean() for year, month_counts in yearly_counts.items()
}


for year, counts in yearly_counts.items():
    print(f"Year: {year}")
    print("Mean of months: ",montly_mean_per_year[year])
    print(counts)
    print()



Year: 2015
Mean of months:  82.83333333333333
date
1      76
2      77
3      92
4      84
5      71
6      65
7     104
8      94
9      82
10     84
11     77
12     88
Name: count, dtype: int64

Year: 2016
Mean of months:  80.16666666666667
date
1     81
2     86
3     92
4     73
5     74
6     92
7     72
8     82
9     78
10    77
11    77
12    78
Name: count, dtype: int64

Year: 2017
Mean of months:  82.16666666666667
date
1      92
2     100
3      76
4      67
5      74
6      84
7      94
8      82
9      70
10     85
11     84
12     78
Name: count, dtype: int64

Year: 2018
Mean of months:  82.66666666666667
date
1      99
2      80
3     110
4      98
5      83
6      81
7      89
8      74
9      55
10     75
11     77
12     71
Name: count, dtype: int64

Year: 2019
Mean of months:  83.66666666666667
date
1      94
2      72
3      88
4      71
5      74
6      86
7      80
8      86
9      78
10     90
11     77
12    108
Name: count, dtype: int64

Year: 2020
Mean of mon

# <State with most Fatal Police Shootings & the Most Dangerous Cities.>




### (1) State counts

In [None]:
# Incidents per state, most frequent first; report the top state and the top five.
state_counts = data['state'].value_counts()
top5_state_counts = state_counts[:5]
top_state_count = state_counts.idxmax()
print("The most number of fatal shooting happened in ",top_state_count,"\n",top5_state_counts,"\n")

# Visualization1: Number of Fatal Police Shootings per State - Bar Chart
num_states = len(state_counts)
# NOTE(review): the slice cannot return more colours than the palette holds, so
# if the palette is shorter than `num_states` the bars are presumably coloured
# by cycling through it rather than with truly unique colours — confirm.
unique_colors = px.colors.qualitative.Plotly[:num_states]
fig = px.bar(x=state_counts.index, y=state_counts.values, labels={"x":"State","y":"Counts"},
             title="Number of Fatal Police Shootings per State", color=state_counts.index, color_discrete_sequence=unique_colors)
fig.update_layout(title_x=0.5)
fig.show()

# Visualization2: Distribution of Fatal Police Shootings per State - Choropleth
# From here on `state_counts` is rebound to a DataFrame with columns
# 'state' and 'id' (the row count per state), no longer the Series used above.
state_counts = data.groupby(by='state').agg({'id' : 'count'}).reset_index()

fig2 = go.Figure(data=go.Choropleth(
    locations=state_counts['state'],
    z = state_counts['id'],
    colorbar_title = "Number of Deaths",
    locationmode = 'USA-states'
))

fig2.update_layout(
    title_x=0.5,
    title_text = 'Distribution of Fatal Police Shootings per State',
    geo_scope='usa'
)


fig2.show()

The most number of fatal shooting happened in  CA 
 state
CA    677
TX    411
FL    315
AZ    218
CO    158
Name: count, dtype: int64 



### (2) City counts


In [None]:
# Incidents per city name, most frequent first.
# NOTE(review): cities are counted by name only, so equally named cities in
# different states would be merged into one entry — confirm this is acceptable.
city_counts = data['city'].value_counts()

city_counts_over10 = city_counts[city_counts >= 10] # keep only cities with at least 10 incidents (author's note: roughly the top 70 cities — TODO confirm on the cleaned data)
last10_city_counts = city_counts[-10:]  # the ten least frequent cities; not used further in this cell
top_city_count = city_counts.idxmax()

print("The most number of fatal shooting happened in ",top_city_count,"\n",city_counts_over10[:5],"\n")
print()

# Bar chart restricted to the cities kept above, one colour per city
# (colours come from a slice of the qualitative Plotly palette).
unique_colors = px.colors.qualitative.Plotly[:len(city_counts_over10)]
fig = px.bar(x=city_counts_over10.index, y=city_counts_over10.values, labels={"x":"City","y":"Counts"},
             title="Number of Fatal Police Shootings per City", color=city_counts_over10.index, color_discrete_sequence=unique_colors)
fig.update_layout(title_x=0.5)
fig.show()


The most number of fatal shooting happened in  Los Angeles 
 city
Los Angeles    76
Phoenix        62
Houston        50
San Antonio    39
Las Vegas      38
Name: count, dtype: int64 




<The most common way of being armed>

# < The Most Common Way of Being Armed >

In [None]:
# Frequencies of the individual weapon types (one record per weapon, taken from
# `expanded_df`): the 15 most frequent for the charts, the top 5 for the printout.
armed_counts = expanded_df['armed'].value_counts().head(15)
top5_armed_counts = expanded_df['armed'].value_counts().head(5)

# NOTE(review): the printed sentence appears to be completed by the first line
# of the Series printout (its index name), which is why the output reads
# "Top 5 ways of being armed" — confirm before changing the message.
print("Top 5 ways of being",top5_armed_counts)
print()

print("The most common way of being armed: ",top5_armed_counts.index[0])

colors = px.colors.qualitative.Plotly

# Visualization1: The Ways of Being Armed - Bar Chart
fig_bar_armed = px.bar(x=armed_counts.index, y=armed_counts.values,
                 labels={"x": "Armed Type", "y": "Counts"},
                 title="The Ways of Being Armed")
fig_bar_armed.update_traces(marker_color=colors)
fig_bar_armed.update_xaxes(categoryorder='total descending')

# Visualization2: The Ways of Being Armed - Pie Chart
fig_pie_armed = px.pie(names=armed_counts.index, values=armed_counts.values,
                 labels={"x": "Armed Type", "y": "Counts"},
                 title="The Ways of Being Armed")

# Combine both charts side by side: the traces of the two standalone figures
# are copied into a 1x2 subplot grid (bar on the left, pie on the right).
fig = make_subplots(rows=1, cols=2,
                    specs=[[{"type": "bar"}, {"type": "pie"}]])

for trace in fig_bar_armed.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig_pie_armed.data:
    fig.add_trace(trace, row=1, col=2)

fig.update_layout(title_text="Comparison of Armed Type Distribution", height=400, showlegend=True)
fig.show()

Top 5 ways of being armed
gun             2677
knife            704
undetermined     404
unarmed          310
toy weapon       165
Name: count, dtype: int64

The most common way of being armed:  gun


# < The Age Distribution of the Victims >

### (1) the age distribution of the victims

In [None]:
# Frequency of each age value (this was previously computed from the 'armed'
# column by mistake, which contradicted the variable name).
age_counts = data['age'].value_counts()
age_df = data['age']
# Rows without a recorded age cannot be placed in the distribution
age_cleaned = age_df.dropna()

# Oldest victims first
age_sorted = age_cleaned.sort_values(ascending=False)

print("The Age Distribution of the Victims: \n", age_sorted)
print()

# Density curve of the ages (histogram bars disabled); only its traces are
# reused in the lower panel of the combined figure below.
fig_distplot = ff.create_distplot([age_sorted], ['Age'], show_hist=False)


# Two stacked panels sharing the x axis: histogram on top, density below
fig_sub_age = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                    subplot_titles=("The Age Distribution of the Victims", ""))

# Visualization1: Age Distribution of the Victims - Histogram
fig_sub_age.add_trace(go.Histogram(x=age_sorted, name="Count"), row=1, col=1)

# Visualization2: Age Distribution of the Victims - Distribution plot
for trace in fig_distplot['data']:
    fig_sub_age.add_trace(trace, row=2, col=1)

fig_sub_age.update_yaxes(title_text="Counts", row=1, col=1)
fig_sub_age.update_xaxes(title_text="Age", row=2, col=1)
fig_sub_age.update_yaxes(title_text="Density", row=2, col=1)

# Legend entries: name traces by their type
fig_sub_age.update_traces(name="Count", selector=dict(type="histogram"))
fig_sub_age.update_traces(name="Density", selector=dict(type="scatter"))

fig_sub_age.show()

The Age Distribution of the Victims: 
 2162    91.0
3075    84.0
3741    84.0
5129    84.0
3147    84.0
        ... 
3986    14.0
1681    13.0
1017    12.0
2908     6.0
833      6.0
Name: age, Length: 4647, dtype: float64



### (2) compare age distribution of different races

In [None]:
# Summary statistics (mean, median, minimum, maximum) of victim age per race
age_by_race = data.groupby('race')['age']
race_age = age_by_race.agg(['mean', 'median', 'min', 'max'])
print(race_age)

# Visualization: Age Distribution by Race - Box Plot, one coloured box per race
fig_age_race = px.box(data, x='race', y='age', color="race",
                      title='Box Plot of Age by Race')
fig_age_race.show()

           mean  median   min   max
race                               
A     36.494382    35.0  15.0  62.0
B     32.320065    30.0  13.0  77.0
H     33.602579    33.0  14.0  80.0
N     31.360000    31.0  14.0  58.0
O     33.217391    30.0  18.0  59.0
W     39.908627    38.0   6.0  91.0


# <Total Number of People Killed per Race (proportion wise)>

In [None]:
# Victims per race and each race's share of the total.
# After cleaning, 4731 rows have a known race (47 of them are 'O', i.e. Other).
race_counts = data["race"].value_counts()
total_race_counts = race_counts.sum()
print(race_counts,total_race_counts)
race_proportions = race_counts/total_race_counts
print(race_proportions)

# Left panel: shares as a pie chart; right panel: absolute counts as bars
pie_trace = go.Pie(
    labels=race_proportions.index,
    values=race_proportions.values,
    legendgroup="group",
    textinfo='percent+label')
bar_trace = go.Bar(
    x=race_counts.index,
    y=race_counts.values,
    legendgroup="group")

panel_titles = ('Number of People Killed per Race - Proportion Wise',  'Number of People Killed per Race - Total Count')
fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.5, 0.5],
    row_heights=[0.5],
    subplot_titles=panel_titles,
    specs=[[ {"type": "pie"}, {"type": "bar"}]])

fig.add_trace(pie_trace, row=1, col=1)
fig.add_trace(bar_trace, row=1, col=2)

fig.show()


race
W    2385
B    1254
H     878
A      91
N      76
O      47
Name: count, dtype: int64 4731
race
W    0.504122
B    0.265060
H    0.185584
A    0.019235
N    0.016064
O    0.009934
Name: count, dtype: float64


# < Gender Ratio of the Victims By Race >

In [None]:
# Victim counts per race (rows) and gender (columns); absent combinations become 0
gender_counts = data.groupby(['race', 'gender']).size().unstack(fill_value=0)

# Row total first, then each gender's share of that total
gender_counts['Total'] = gender_counts.sum(axis=1)
for ratio_column, gender_code in (('Male Ratio', 'M'), ('Female Ratio', 'F')):
    gender_counts[ratio_column] = gender_counts[gender_code] / gender_counts['Total']

# Visualization: Gender Ratio of the Victims By Race - Bar chart
ratio_colors = {'Male Ratio': 'rgb(57,105,172)', 'Female Ratio': 'rgb(204,80,62)'}
fig_race_gender = px.bar(gender_counts, x=['Male Ratio', 'Female Ratio'], y=gender_counts.index,
                         orientation='h', color_discrete_map=ratio_colors,
                         labels={'value': 'Gender Ratio', 'y': 'Race'})

# Kept as the final expression of the cell so the notebook renders the figure
fig_race_gender.update_layout(title="Gender Ratio by Race",
                              xaxis=dict(title="Gender Ratio"),
                              yaxis=dict(title="Race"))

# < The Race of Victims over the years >

### (1) the number of victims by race over the years

In [None]:
# Work on an independent copy. `pd.DataFrame(data)` does not copy a DataFrame
# input by default, so depending on the pandas version the 'date' and 'year'
# columns added below could leak back into `data`.
df_yr = data.copy()

# `date_df` still carries the original row labels, so this assignment aligns on
# the index: rows removed during cleaning are simply not picked up.
df_yr['date'] = pd.to_datetime(date_df)
df_yr['year'] = df_yr['date'].dt.year.astype(str)

# Exclude 2020 data that do not have data for all months
df_yr = df_yr[df_yr['year'] != '2020']

# Victims per year (rows) and race (columns); absent combinations become 0
counts = df_yr.groupby(['year', 'race']).size().unstack(fill_value=0)

print(counts)

# Visualization: The Race of Victims over the years - Stacked Bar Chart
fig_year_race = px.bar(counts, x=counts.index, y=counts.columns,
                          title='The Race of Victims over the Years',
                          labels={'year': 'Year', 'count': 'Number of Victims'},
                          barmode='stack', height=600)

fig_year_race.update_layout(xaxis={'title': 'Year'}, yaxis={'title': 'Number of Victims'})

fig_year_race.show()

race   A    B    H   N   O    W
year                           
2015  14  258  172   9  15  497
2016  15  234  160  16  11  465
2017  16  223  179  22   6  459
2018  21  229  165  15   4  451
2019  18  235  158  12   9  370


### (2) the race of victims over the years (proportions)

In [None]:
# Share of each race within every year: each row of `counts` divided by its row total
yearly_totals = counts.sum(axis=1)
normalized_counts = counts.div(yearly_totals, axis=0)

# Visualization: The Race of Victims over the Years (Proportions) - Stacked Bar Chart
bar_options = dict(
    title='The Race of Victims over the Years (Proportions)',
    labels={'x': 'Year', 'y': 'Proportion of Victims'},
    barmode='stack',
    height=600,
)
fig_year_race = px.bar(normalized_counts, x=normalized_counts.index,
                       y=normalized_counts.columns, **bar_options)

fig_year_race.update_layout(xaxis={'title': 'Year'}, yaxis={'title': 'Proportion of Victims'})

fig_year_race.show()

# **Modeling and Question Answering**

In [None]:
# imports
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import warnings

In [None]:
# Upload the dataset CSV from the local machine (works only inside Google Colab)
from google.colab import files
fatal_police_shootings_data = files.upload()


Saving fatal-police-shootings-data.csv to fatal-police-shootings-data (1).csv


In [None]:
# NOTE(review): the upload output above reports the file was saved as
# 'fatal-police-shootings-data (1).csv', so this line reads the copy that was
# already present under the original name — confirm that is the intended file.
data = pd.read_csv('fatal-police-shootings-data.csv')

## Clean Data

In [None]:
# Clean data

# remove duplicates
# `duplicated_rows` flags rows whose id repeats an earlier one; exact duplicate
# rows are dropped.
duplicated_rows = data.duplicated(subset=['id'])
data = data.drop_duplicates()

# Delete Useless Data - date & name & flee & manner_of_death
data = data.drop(['date', 'name', 'flee', 'manner_of_death'], axis=1)

# Replace Empty Cell - 'armed', 'gender', 'race'
# The fill is applied to the frame itself and bound back to `data`. Calling
# `data[col].fillna(..., inplace=True)` acts on a temporary column selection:
# it warns in pandas 2.x and leaves `data` untouched under Copy-on-Write.
# 'age' is deliberately left as NaN. Writing the string 'unknown' into it would
# turn the numeric column into object dtype, and the correlation cell would
# then category-encode the ages instead of using their numeric value.
data = data.fillna({'armed': 'undetermined', 'gender': 'unknown', 'race': 'unknown'})

# Remove data values of features that is not useful
# (1) Race : Since we have to see specific data, we have to remove when the race data is unknown/null
data = data.drop(index=data.index[data['race'] == 'unknown'])

data.head()

Unnamed: 0,id,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,body_camera
0,3,gun,53.0,M,A,Shelton,WA,True,attack,False
1,4,gun,47.0,M,W,Aloha,OR,False,attack,False
2,5,unarmed,23.0,M,H,Wichita,KS,False,other,False
3,8,toy weapon,32.0,M,W,San Francisco,CA,True,attack,False
4,9,nail gun,39.0,M,H,Evans,CO,False,attack,False


In [None]:

# Cleaned armed data
def count_each_armed_type(armed_str):
    """Return the individual weapon names contained in an 'armed' description.

    Only descriptions containing the joiner ' and ' are split: each piece
    around the joiner is split again on commas and the items are stripped of
    surrounding whitespace. Every other description comes back unchanged,
    wrapped in a one-element list.
    """
    joiner = ' and '
    if joiner in armed_str:
        return [
            token.strip()
            for segment in armed_str.split(joiner)
            for token in segment.split(',')
        ]
    return [armed_str]

# One (race, armed type) record per weapon: a victim carrying several weapons
# contributes one record for each of them.
expanded_data = []
for race, armed in zip(data['race'], data['armed']):
    expanded_data.extend(
        {'race': race, 'armed': armed_type}
        for armed_type in count_each_armed_type(armed)
    )

expanded_df = pd.DataFrame(expanded_data)

print(expanded_df)

     race         armed
0       A           gun
1       W           gun
2       H       unarmed
3       W    toy weapon
4       H      nail gun
...   ...           ...
4779    W           gun
4780    W           gun
4781    W       unarmed
4782    H           gun
4783    W  undetermined

[4784 rows x 2 columns]


## Correlation Matrix for all features in the data set


In [None]:
# Encode the text columns as integer category codes on a COPY of the data.
# Encoding `data` itself would replace its labels (race, state, ...) with codes
# and silently change the inputs of the modelling cells further down.
encoded_data = data.copy()
for col in encoded_data.columns:
  if encoded_data[col].dtype == 'object':
    encoded_data[col] = encoded_data[col].astype('category').cat.codes

# Create the correlation matrix
# Spearman (rank based) is used because most features are encoded categories
correlation_matrix = encoded_data.corr(method='spearman')

# Create a heatmap using plotly
fig = go.Figure(data=go.Heatmap(
  z=correlation_matrix.values,
  x=correlation_matrix.columns,
  y=correlation_matrix.columns,
  colorscale='Viridis'
))

fig.update_layout(
  title='Correlation Matrix Heatmap',
  xaxis_title='Features',
  yaxis_title='Features'
)
# Display Heatmap
fig.show()
# Printing the full matrix takes a lot of space, so it is saved as a CSV file instead
correlation_matrix.to_csv("features_correlation_matrix.csv")


## Training Model 1: Logistic Regression

In [None]:
# 'armed' is a nominal feature: integer label codes would impose an arbitrary
# order on the weapon types that a linear model then treats as a magnitude.
# It is therefore one-hot encoded; LabelEncoder is kept for the target only.
armed_encoder = OneHotEncoder(handle_unknown='ignore')
race_encoder = LabelEncoder()

# The per-weapon records in `expanded_df` are used (not the raw 'armed' column)
X_encoded = armed_encoder.fit_transform(expanded_df[['armed']])
y_encoded = race_encoder.fit_transform(expanded_df['race'])

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# `multi_class` is deprecated in recent scikit-learn releases; with the default
# solver a multiclass target is already fitted as a multinomial model.
logistic = LogisticRegression(max_iter=1000)
logistic.fit(X_train, y_train)

y_pred = logistic.predict(X_test)

# Report per-race metrics under the original race labels
target_names = [str(cls) for cls in race_encoder.classes_]
report = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print()
print(report)

0.49216300940438873

              precision    recall  f1-score   support

           A       0.00      0.00      0.00        22
           B       0.00      0.00      0.00       273
           H       0.00      0.00      0.00       162
           N       0.00      0.00      0.00        19
           O       0.00      0.00      0.00        10
           W       0.49      1.00      0.66       471

    accuracy                           0.49       957
   macro avg       0.08      0.17      0.11       957
weighted avg       0.24      0.49      0.32       957



## Training Model 2: Random Forest

In [None]:
# Encode the state names as integer codes without overwriting `data['state']`,
# so the cell can be re-run and `data` keeps its readable labels.
label_encoders = {}
label_encoders['state'] = LabelEncoder()
state_codes = label_encoders['state'].fit_transform(data['state'])

X = pd.DataFrame({'state': state_codes}, index=data.index) #feature
y = data['race'] # target


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fixed seed so the forest (and therefore the report) is reproducible,
# matching the seeded split and the other models in this notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# zero_division=0 keeps the reported 0.0 for races that are never predicted
# but suppresses the repeated "ill-defined" warnings.
result = classification_report(y_test, y_pred, zero_division=0)
print("Result of Classification:")
print(result)



Result of Classification:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00        27
           B       0.49      0.18      0.27       245
           H       0.49      0.42      0.45       176
           N       0.00      0.00      0.00         8
           O       0.50      0.57      0.53         7
           W       0.57      0.82      0.67       484

    accuracy                           0.55       947
   macro avg       0.34      0.33      0.32       947
weighted avg       0.51      0.55      0.50       947




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



## Training Model 3: Gradient Boosting

In [None]:
# Make 'age' numeric on a new frame: any non-numeric placeholder such as
# 'unknown' becomes NaN. This replaces `data['age'].replace(..., inplace=True)`,
# which acts on a temporary column selection and is unreliable under
# Copy-on-Write. Rows without an age or a race are then dropped.
age_numeric = pd.to_numeric(data['age'], errors='coerce')
data_rm_null = data.assign(age=age_numeric).dropna(subset=['age', 'race'])

label_encoder = LabelEncoder()
X = data_rm_null[['age']]
y_encoded = label_encoder.fit_transform(data_rm_null['race'])


X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
gradient_boosting = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting.fit(X_train, y_train)
y_pred = gradient_boosting.predict(X_test)

# Report per-race metrics under the original race labels
target_names = [str(cls) for cls in label_encoder.classes_]

# Accuracy is computed from the predictions, the same way as for Model 1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)

print(accuracy)
print()
print(report)

0.529039070749736

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        27
           1       0.45      0.22      0.30       245
           2       0.14      0.01      0.01       176
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         7
           5       0.54      0.92      0.68       484

    accuracy                           0.53       947
   macro avg       0.19      0.19      0.17       947
weighted avg       0.42      0.53      0.43       947

