# Importing Packages and Data

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn import metrics

In [4]:
# Download the dataset registered under UCI id 183 and unpack its
# feature table and target table in one step.
communities_and_crime = fetch_ucirepo(id=183)

X, y = (
    communities_and_crime.data.features,
    communities_and_crime.data.targets,
)

In [5]:
# Put features and target side by side in a single frame (index-aligned left join),
# then preview the first rows.
replication_df = X.join(other=y, how='left')
replication_df.head(5)

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


# Replicating the Study

In [6]:
# Discretise the violent-crime rate into three ordered classes.
# Intervals are closed on the right: (-0.1, 0.25] -> Low, (0.25, 0.40] -> Medium, (0.40, 1] -> High.
# The first edge sits below zero so that a rate of exactly 0 still falls inside the 'Low' bin.
replication_df['CrimeCategory'] = pd.cut(
    replication_df['ViolentCrimesPerPop'],
    bins=[-0.1, 0.25, 0.40, 1],
    labels=['Low', 'Medium', 'High'],
    right=True,
)
replication_df.head()

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop,CrimeCategory
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2,Low
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.12,0.45,?,?,?,?,0.0,?,0.67,High
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.21,0.02,?,?,?,?,0.0,?,0.43,High
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.39,0.28,?,?,?,?,0.0,?,0.12,Low
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.09,0.02,?,?,?,?,0.0,?,0.03,Low


In [7]:
# Reduce the frame to exactly the columns the study describes for its dataset,
# plus the derived 'CrimeCategory' label (kept last so it can be split off as the target).
replication_df = replication_df.loc[:, [
    'state',
    'population',
    'medIncome',
    'medFamInc',
    'perCapInc',
    'NumUnderPov',
    'PctLess9thGrade',
    'PctNotHSGrad',
    'PctBSorMore',
    'PctUnemployed',
    'PctEmploy',
    'ViolentCrimesPerPop',
    'CrimeCategory',
]]

replication_df.head()


Unnamed: 0,state,population,medIncome,medFamInc,perCapInc,NumUnderPov,PctLess9thGrade,PctNotHSGrad,PctBSorMore,PctUnemployed,PctEmploy,ViolentCrimesPerPop,CrimeCategory
0,8,0.19,0.37,0.39,0.4,0.08,0.1,0.18,0.48,0.27,0.68,0.2,Low
1,53,0.0,0.31,0.29,0.37,0.01,0.14,0.24,0.3,0.27,0.73,0.67,High
2,24,0.0,0.3,0.28,0.27,0.01,0.27,0.43,0.19,0.36,0.58,0.43,High
3,34,0.04,0.58,0.51,0.36,0.01,0.09,0.25,0.31,0.33,0.71,0.12,Low
4,42,0.01,0.5,0.46,0.43,0.0,0.25,0.3,0.33,0.12,0.65,0.03,Low


In [8]:
# Save the replicated subset to disk; index=True writes the row index as the first CSV column.
replication_df.to_csv(path_or_buf='replication_dataset.csv', index=True)

In [9]:
# 'CrimeCategory' is the last column, so selecting by name matches the positional split:
# the target is that column and the features are everything else. As in the study, the
# features still include 'ViolentCrimesPerPop' at this point.
target = replication_df['CrimeCategory']
features = replication_df.drop(columns=['CrimeCategory'])

In [10]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=109,
)

In [11]:
#Fitting the decision tree on the dataset described in the study.
# random_state is fixed because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ from one run to the next even on identical data.
clf = tree.DecisionTreeClassifier(random_state=109)
clf = clf.fit(X_train, y_train)

In [12]:
# The report below reproduces the results published in the study. A perfect score on
# every single metric made me suspicious of the model, though.
y_pred_tree = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_tree))

              precision    recall  f1-score   support

        High       1.00      1.00      1.00        67
         Low       1.00      1.00      1.00       270
      Medium       1.00      1.00      1.00        62

    accuracy                           1.00       399
   macro avg       1.00      1.00      1.00       399
weighted avg       1.00      1.00      1.00       399



In [13]:
# The study kept 'ViolentCrimesPerPop' among the features even though 'CrimeCategory' was
# derived from it, which hands the model perfect information about the label.
# Here the model is tested without that column.
replication_df_crimecat = replication_df.drop(columns=['ViolentCrimesPerPop']).copy()
# 'CrimeCategory' is the last remaining column: it is the target, everything else is a feature.
target = replication_df_crimecat['CrimeCategory']
features = replication_df_crimecat.drop(columns=['CrimeCategory'])

In [14]:
# Sanity check: list the remaining columns to confirm 'ViolentCrimesPerPop' is gone.
replication_df_crimecat.columns

Index(['state', 'population', 'medIncome', 'medFamInc', 'perCapInc',
       'NumUnderPov', 'PctLess9thGrade', 'PctNotHSGrad', 'PctBSorMore',
       'PctUnemployed', 'PctEmploy', 'CrimeCategory'],
      dtype='object')

In [15]:
# Same 80/20 split and seed as before, now on the feature set without 'ViolentCrimesPerPop'.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=109)
# Seed the tree as well: scikit-learn permutes the candidate features at every split, so without
# a fixed random_state the fitted tree (and every score computed from it) can change between runs.
clf = tree.DecisionTreeClassifier(random_state=109)
clf = clf.fit(X_train, y_train)

In [16]:
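#This classification report gives a more realistic picture of the trained model: in the run recorded below, the macro-average f1 score is 0.49 and the MCC score is 0.31
#(these figures can shift slightly between runs when the tree is fit without a fixed random_state). This is the model I will compare my own testing to.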
# Score the held-out rows: print the per-class report, then leave the Matthews
# correlation coefficient as the cell's final expression so it is displayed as the output.
y_pred_tree = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_tree))
matthews_corrcoef(y_true=y_test, y_pred=y_pred_tree)

              precision    recall  f1-score   support

        High       0.41      0.33      0.36        67
         Low       0.82      0.82      0.82       270
      Medium       0.26      0.32      0.29        62

    accuracy                           0.66       399
   macro avg       0.50      0.49      0.49       399
weighted avg       0.67      0.66      0.66       399



0.3090754895119096