<a href="https://colab.research.google.com/github/katelynnrachel/ckids_fred_morstatter/blob/amykim-analysis/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Get the tweet counts per victim
data = pd.read_csv('./sample_data/df-relatedToHomicide.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)
df = data.groupby(['victim_name', 'victim_age', 'victim_gender', 'victim_race', 'cause_of_death', 'if_officer_involved', 'death_location', 'agency'], sort=False).size().reset_index(name='counts')

df['victim_age'] = df['victim_age'].apply(lambda x: np.nan if x == 'None' else x)
df['victim_age'] = df['victim_age'].astype(float)
df['victim_age'].fillna(df['victim_age'].mean(), inplace=True)
df['victim_age'] = df['victim_age'].round().astype(int)

# Create age ranges
age_bins = np.arange(0, 90, 5)
age_labels = [f'{i}-{i+4}' for i in age_bins[:-1]]
df['age_range'] = pd.cut(df['victim_age'], bins=age_bins, labels=age_labels)
df

Unnamed: 0,victim_name,victim_age,victim_gender,victim_race,cause_of_death,if_officer_involved,death_location,agency,counts,age_range
0,Charles Robert Towns,47,Male,Black,Unknown,Officer-involved,In Front Of 2157 El Serano Ave,LASD,40,45-49
1,Ming Wei Ma,72,Male,Asian,Unknown,Unknown,122 West Garvey Ave #B,LASD,19048,70-74
2,Wen Tau Yu,64,Male,Asian,Unknown,Unknown,122 West Garvey Avenue #B,LASD,208,60-64
3,Valentino Marcos Alvero,68,Male,Asian,Unknown,Unknown,122 West Garvey Avenue #B,LASD,201,65-69
4,Chia Ling Yau,76,Male,Asian,Unknown,Unknown,122 West Garvey Avenue #B,LASD,207,75-79
...,...,...,...,...,...,...,...,...,...,...
469,Michael Di'Shawn Radford,20,Male,Black,Gunshot,Unknown,Torrance,Torrance PD,1,15-19
470,Astin Kyle Edwards,28,Male,Black,Gunshot,Unknown,Torrance,Torrance PD,1,25-29
471,Frank Borsotti,61,Male,White,Blunt force,Unknown,Lancaster,LASD,2,60-64
472,Fael Valente,30,Male,Latino,Gunshot,Unknown,East Los Angeles,LASD,1,25-29


In [60]:
categorical_cols = ['age_range', 'victim_gender', 'victim_race', 'cause_of_death', 'if_officer_involved']
df_encoded = pd.get_dummies(df, columns=categorical_cols)
df_encoded = df_encoded.drop(['cause_of_death_Unknown','victim_race_None', 'victim_race_Unknown'], axis=1)
df_encoded

Unnamed: 0,victim_name,victim_age,death_location,agency,counts,age_range_0-4,age_range_5-9,age_range_10-14,age_range_15-19,age_range_20-24,...,victim_race_Latino,victim_race_Other,victim_race_White,cause_of_death_Blunt force,cause_of_death_Gunshot,cause_of_death_Other,cause_of_death_Pending,cause_of_death_Stabbing,if_officer_involved_Officer-involved,if_officer_involved_Unknown
0,Charles Robert Towns,47,In Front Of 2157 El Serano Ave,LASD,40,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Ming Wei Ma,72,122 West Garvey Ave #B,LASD,19048,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Wen Tau Yu,64,122 West Garvey Avenue #B,LASD,208,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Valentino Marcos Alvero,68,122 West Garvey Avenue #B,LASD,201,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Chia Ling Yau,76,122 West Garvey Avenue #B,LASD,207,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,Michael Di'Shawn Radford,20,Torrance,Torrance PD,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
470,Astin Kyle Edwards,28,Torrance,Torrance PD,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
471,Frank Borsotti,61,Lancaster,LASD,2,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
472,Fael Valente,30,East Los Angeles,LASD,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1


--------------------victim_race------------------

In [61]:
X_race = df_encoded.drop(['victim_name', 'victim_age', 'death_location', 'agency'] + list(df_encoded.filter(regex = 'victim_race')), axis=1)  #features
y_race = df['victim_race']  #target variable

X_train_race, X_test_race, y_train_race, y_test_race = train_test_split(X_race, y_race, test_size=0.2, random_state=42)

treeModel_race = DecisionTreeClassifier(max_depth=5)
treeModel_race.fit(X_train_race, y_train_race)
print(f'Mean accuracy of race with DecisionTreeClassifier: {treeModel_race.score(X_test_race, y_test_race)}')
print(X_race.columns)

from collections import Counter
print(Counter(y_test_race))

Mean accuracy of race with DecisionTreeClassifier: 0.5894736842105263
Index(['counts', 'age_range_0-4', 'age_range_5-9', 'age_range_10-14',
       'age_range_15-19', 'age_range_20-24', 'age_range_25-29',
       'age_range_30-34', 'age_range_35-39', 'age_range_40-44',
       'age_range_45-49', 'age_range_50-54', 'age_range_55-59',
       'age_range_60-64', 'age_range_65-69', 'age_range_70-74',
       'age_range_75-79', 'age_range_80-84', 'victim_gender_Female',
       'victim_gender_Male', 'cause_of_death_Blunt force',
       'cause_of_death_Gunshot', 'cause_of_death_Other',
       'cause_of_death_Pending', 'cause_of_death_Stabbing',
       'if_officer_involved_Officer-involved', 'if_officer_involved_Unknown'],
      dtype='object')
Counter({'Latino': 56, 'Black': 29, 'White': 4, 'None': 3, 'Asian': 2, 'Other': 1})


--------------------victim_gender------------------

In [62]:
X_gender = df_encoded.drop(['victim_name', 'victim_age', 'death_location', 'agency'] + list(df_encoded.filter(regex = 'victim_gender')), axis=1) # features
y_gender = df['victim_gender']  # target variable

X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(X_gender, y_gender, test_size=0.2, random_state=42)

treeModel_gender = DecisionTreeClassifier(max_depth=3)
treeModel_gender.fit(X_train_gender, y_train_gender)
print(f'Mean accuracy of gender with DecisionTreeClassifier: {treeModel_gender.score(X_test_gender, y_test_gender)}')
print(X_gender.columns)
print(sum(y_test_gender == 'Male'))
print(len(y_test_gender))

Mean accuracy of gender with DecisionTreeClassifier: 0.9263157894736842
Index(['counts', 'age_range_0-4', 'age_range_5-9', 'age_range_10-14',
       'age_range_15-19', 'age_range_20-24', 'age_range_25-29',
       'age_range_30-34', 'age_range_35-39', 'age_range_40-44',
       'age_range_45-49', 'age_range_50-54', 'age_range_55-59',
       'age_range_60-64', 'age_range_65-69', 'age_range_70-74',
       'age_range_75-79', 'age_range_80-84', 'victim_race_Asian',
       'victim_race_Black', 'victim_race_Korean', 'victim_race_Latino',
       'victim_race_Other', 'victim_race_White', 'cause_of_death_Blunt force',
       'cause_of_death_Gunshot', 'cause_of_death_Other',
       'cause_of_death_Pending', 'cause_of_death_Stabbing',
       'if_officer_involved_Officer-involved', 'if_officer_involved_Unknown'],
      dtype='object')
87
95


--------------------victim_age------------------

In [63]:
X_age = df_encoded.drop(['victim_name', 'victim_age', 'death_location', 'agency'] + list(df_encoded.filter(regex = 'age_range')), axis=1) # features
y_age = df['victim_age']  # target variable
# TODO: decision tree depth, cross validation, hyperparameter, try another model, put code on github
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(X_age, y_age, test_size=0.2, random_state=42)

treeModel_age = DecisionTreeClassifier()
treeModel_age.fit(X_train_age, y_train_age)
print(f'Mean accuracy of age with DecisionTreeClassifier: {treeModel_age.score(X_test_age, y_test_age)}')
print(X_age.columns)

Mean accuracy of age with DecisionTreeClassifier: 0.05263157894736842
Index(['counts', 'victim_gender_Female', 'victim_gender_Male',
       'victim_race_Asian', 'victim_race_Black', 'victim_race_Korean',
       'victim_race_Latino', 'victim_race_Other', 'victim_race_White',
       'cause_of_death_Blunt force', 'cause_of_death_Gunshot',
       'cause_of_death_Other', 'cause_of_death_Pending',
       'cause_of_death_Stabbing', 'if_officer_involved_Officer-involved',
       'if_officer_involved_Unknown'],
      dtype='object')


---------------Hyperparameter-------------------------

In [68]:
from sklearn.model_selection import cross_val_score
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
max_depths = [2, 3, 4] # default has no max depth
for val in max_depths:
  score = cross_val_score(DecisionTreeClassifier(max_depth=val, random_state=42), X_race, y_race, cv=5)
  print(f'For max depth: {val} and target variable race, score is {score.mean()}')

for val in max_depths:
  score = cross_val_score(DecisionTreeClassifier(max_depth=val, random_state=42), X_gender, y_gender, cv=5)
  print(f'For max depth: {val} and target variable gender, score is {score.mean()}')

for val in max_depths:
  score = cross_val_score(DecisionTreeClassifier(max_depth=val, random_state=42), X_age, y_age, cv=5)
  print(f'For max depth: {val} and target variable age, score is {score.mean()}')

# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
estimators = [100, 150, 200, 250] # 100 is default
for val in estimators:
  score = cross_val_score(RandomForestClassifier(n_estimators=val, random_state=42), X_race, y_race, cv=5)
  print(f'For number of estimators: {val} and target variable race, score is {score.mean()}')

for val in estimators:
  score = cross_val_score(RandomForestClassifier(n_estimators=val, random_state=42), X_gender, y_gender, cv=5)
  print(f'For number of estimators: {val} and target variable gender, score is {score.mean()}')

for val in estimators:
  score = cross_val_score(RandomForestClassifier(n_estimators=val, random_state=42), X_age, y_age, cv=5)
  print(f'For number of estimators: {val} and target variable age, score is {score.mean()}')

# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
neighbors = [5, 3, 4, 6] # 5 is default
for val in neighbors:
  score = cross_val_score(KNeighborsClassifier(n_neighbors=val), X_race, y_race, cv=5)
  print(f'For number of neighbors: {val} and target variable race, score is {score.mean()}')

for val in neighbors:
  score = cross_val_score(KNeighborsClassifier(n_neighbors=val), X_gender, y_gender, cv=5)
  print(f'For number of neighbors: {val} and target variable gender, score is {score.mean()}')

for val in neighbors:
  score = cross_val_score(KNeighborsClassifier(n_neighbors=val), X_age, y_age, cv=5)
  print(f'For number of neighbors: {val} and target variable age, score is {score.mean()}')





For max depth: 2 and target variable race, score is 0.578006718924972
For max depth: 3 and target variable race, score is 0.5716909294512879
For max depth: 4 and target variable race, score is 0.5527435610302353
For max depth: 2 and target variable gender, score is 0.8945128779395297
For max depth: 3 and target variable gender, score is 0.8966405375139977
For max depth: 4 and target variable gender, score is 0.8882194848824188
For max depth: 2 and target variable age, score is 0.029540873460246357
For max depth: 3 and target variable age, score is 0.021075027995520718
For max depth: 4 and target variable age, score is 0.029518477043673014




For number of estimators: 100 and target variable race, score is 0.4810526315789474




For number of estimators: 150 and target variable race, score is 0.47471444568868976




For number of estimators: 200 and target variable race, score is 0.48313549832026875




For number of estimators: 250 and target variable race, score is 0.4852407614781635
For number of estimators: 100 and target variable gender, score is 0.8692049272116462
For number of estimators: 150 and target variable gender, score is 0.8734154535274357
For number of estimators: 200 and target variable gender, score is 0.86494960806271
For number of estimators: 250 and target variable gender, score is 0.8691825307950728




For number of estimators: 100 and target variable age, score is 0.03585666293393057




For number of estimators: 150 and target variable age, score is 0.03798432250839866




For number of estimators: 200 and target variable age, score is 0.04851063829787235




For number of estimators: 250 and target variable age, score is 0.04217245240761478
For number of neighbors: 5 and target variable race, score is 0.5190145576707726
For number of neighbors: 3 and target variable race, score is 0.48313549832026875




For number of neighbors: 4 and target variable race, score is 0.48931690929451294
For number of neighbors: 6 and target variable race, score is 0.506293393057111
For number of neighbors: 5 and target variable gender, score is 0.8861142217245241




For number of neighbors: 3 and target variable gender, score is 0.8755431131019037
For number of neighbors: 4 and target variable gender, score is 0.8691825307950728
For number of neighbors: 6 and target variable gender, score is 0.8861142217245241




For number of neighbors: 5 and target variable age, score is 0.023202687569988802
For number of neighbors: 3 and target variable age, score is 0.025330347144456887
For number of neighbors: 4 and target variable age, score is 0.029540873460246357
For number of neighbors: 6 and target variable age, score is 0.03377379619260919


