In [1]:
import pandas as pd
import numpy as np

### Process *Communities and Crime* dataset

In [23]:
columns = ['state',
           'county',
           'community',
           'communityname',
           'fold',
           'population',
           'householdsize',
           'racepctblack',
           'racePctWhite',
           'racePctAsian',
           'racePctHisp',
           'agePct12t21',
           'agePct12t29',
           'agePct16t24',
           'agePct65up',
           'numbUrban',
           'pctUrban',
           'medIncome',
           'pctWWage',
           'pctWFarmSelf',
           'pctWInvInc',
           'pctWSocSec',
           'pctWPubAsst',
           'pctWRetire',
           'medFamInc',
           'perCapInc',
           'whitePerCap',
           'blackPerCap',
           'indianPerCap',
           'AsianPerCap',
           'OtherPerCap',
           'HispPerCap',
           'NumUnderPov',
           'PctPopUnderPov',
           'PctLess9thGrade',
           'PctNotHSGrad',
           'PctBSorMore',
           'PctUnemployed',
           'PctEmploy',
           'PctEmplManu',
           'PctEmplProfServ',
           'PctOccupManu',
           'PctOccupMgmtProf',
           'MalePctDivorce',
           'MalePctNevMarr',
           'FemalePctDiv',
           'TotalPctDiv',
           'PersPerFam',
           'PctFam2Par',
           'PctKids2Par',
           'PctYoungKids2Par',
           'PctTeen2Par',
           'PctWorkMomYoungKids',
           'PctWorkMom',
           'NumIlleg',
           'PctIlleg',
           'NumImmig',
           'PctImmigRecent',
           'PctImmigRec5',
           'PctImmigRec8',
           'PctImmigRec10',
           'PctRecentImmig',
           'PctRecImmig5',
           'PctRecImmig8',
           'PctRecImmig10',
           'PctSpeakEnglOnly',
           'PctNotSpeakEnglWell',
           'PctLargHouseFam',
           'PctLargHouseOccup',
           'PersPerOccupHous',
           'PersPerOwnOccHous',
           'PersPerRentOccHous',
           'PctPersOwnOccup',
           'PctPersDenseHous',
           'PctHousLess3BR',
           'MedNumBR',
           'HousVacant',
           'PctHousOccup',
           'PctHousOwnOcc',
           'PctVacantBoarded',
           'PctVacMore6Mos',
           'MedYrHousBuilt',
           'PctHousNoPhone',
           'PctWOFullPlumb',
           'OwnOccLowQuart',
           'OwnOccMedVal',
           'OwnOccHiQuart',
           'RentLowQ',
           'RentMedian',
           'RentHighQ',
           'MedRent',
           'MedRentPctHousInc',
           'MedOwnCostPctInc',
           'MedOwnCostPctIncNoMtg',
           'NumInShelters',
           'NumStreet',
           'PctForeignBorn',
           'PctBornSameState',
           'PctSameHouse85',
           'PctSameCity85',
           'PctSameState85',
           'LemasSwornFT',
           'LemasSwFTPerPop',
           'LemasSwFTFieldOps',
           'LemasSwFTFieldPerPop',
           'LemasTotalReq',
           'LemasTotReqPerPop',
           'PolicReqPerOffic',
           'PolicPerPop',
           'RacialMatchCommPol',
           'PctPolicWhite',
           'PctPolicBlack',
           'PctPolicHisp',
           'PctPolicAsian',
           'PctPolicMinor',
           'OfficAssgnDrugUnits',
           'NumKindsDrugsSeiz',
           'PolicAveOTWorked',
           'LandArea',
           'PopDens',
           'PctUsePubTrans',
           'PolicCars',
           'PolicOperBudg',
           'LemasPctPolicOnPatr',
           'LemasGangUnitDeploy',
           'LemasPctOfficDrugUn',
           'PolicBudgPerPop',
           'ViolentCrimesPerPop']

In [24]:
df = pd.read_csv('../data/communities-crime/communities.data', names=columns, sep=',')
df = df.applymap(lambda x: np.nan if x == '?' else x)

In [25]:
columns_with_nan = list(df.columns[df.isna().any()])
columns_to_remove = columns_with_nan + ['fold', 'county', 'community', 'communityname', 'state']
columns_to_remove

['county',
 'community',
 'OtherPerCap',
 'LemasSwornFT',
 'LemasSwFTPerPop',
 'LemasSwFTFieldOps',
 'LemasSwFTFieldPerPop',
 'LemasTotalReq',
 'LemasTotReqPerPop',
 'PolicReqPerOffic',
 'PolicPerPop',
 'RacialMatchCommPol',
 'PctPolicWhite',
 'PctPolicBlack',
 'PctPolicHisp',
 'PctPolicAsian',
 'PctPolicMinor',
 'OfficAssgnDrugUnits',
 'NumKindsDrugsSeiz',
 'PolicAveOTWorked',
 'PolicCars',
 'PolicOperBudg',
 'LemasPctPolicOnPatr',
 'LemasGangUnitDeploy',
 'PolicBudgPerPop',
 'fold',
 'county',
 'community',
 'communityname',
 'state']

In [26]:
df = df.drop(columns=columns_to_remove)

In [27]:
df.head()

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,ViolentCrimesPerPop
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,0.2
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,0.67
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,0.43
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.19,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,0.03


In [87]:
prot_words = ['black', 'white', 'asian', 'hisp', 'age', 'indian', 'other', 'male', 'female', 'minor']

In [88]:
protected_columns = list(filter(lambda x: any([word in x.lower() for word in prot_words]), df.columns))
protected_columns

['racepctblack',
 'racePctWhite',
 'racePctAsian',
 'racePctHisp',
 'agePct12t21',
 'agePct12t29',
 'agePct16t24',
 'agePct65up',
 'pctWWage',
 'whitePerCap',
 'blackPerCap',
 'indianPerCap',
 'AsianPerCap',
 'HispPerCap',
 'MalePctDivorce',
 'MalePctNevMarr',
 'FemalePctDiv',
 'black']

### Explore how to convert target variable

In [32]:
df['ViolentCrimesPerPop'].describe()

count    1994.000000
mean        0.237979
std         0.232985
min         0.000000
25%         0.070000
50%         0.150000
75%         0.330000
max         1.000000
Name: ViolentCrimesPerPop, dtype: float64

In [49]:
THRES = 0.7

In [50]:
df['high-crime'] = (df['ViolentCrimesPerPop'] > THRES).astype(int)

In [51]:
df['black'] = (df['racepctblack'] > 0.06).astype(int)
protected_columns += ['black']

In [69]:
df_not_protected = df.loc[:, list(map(lambda c: c not in protected_columns, df.columns))]

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df_not_protected.iloc[:, :-2].to_numpy(),
                                                    df_not_protected.iloc[:, -1].to_numpy(),
                                                    test_size=0.2)

In [72]:

model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)


LogisticRegression(max_iter=10000)

In [73]:

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

acc

0.9624060150375939

In [74]:
df.loc[df['black'] == 1, 'high-crime'].mean(), df.loc[df['black'] != 1, 'high-crime'].mean()

(0.1134020618556701, 0.005859375)

In [75]:
df.loc[df['high-crime'] == 1, 'black'].mean(), df.loc[df['high-crime'] != 1, 'black'].mean()

(0.9482758620689655, 0.4579339723109691)

#### Export

In [76]:
df.columns

Index(['population', 'householdsize', 'racepctblack', 'racePctWhite',
       'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
       'agePct16t24', 'agePct65up',
       ...
       'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LandArea',
       'PopDens', 'PctUsePubTrans', 'LemasPctOfficDrugUn',
       'ViolentCrimesPerPop', 'high-crime', 'black'],
      dtype='object', length=102)

In [82]:
df_to_export = df[list(df.columns[:-3])+[df.columns[-1]]+[df.columns[-2]]]

In [83]:
df_to_export.head()

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,black,high-crime
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,0,0
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,1,0
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,1,0
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,1,0
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,0,0


In [84]:
df_to_export.to_csv('../data/communities-crime/communities-crime-processed.csv', index=False)

# START HERE IF ALREADY PROCESSED

In [85]:
df = pd.read_csv('../data/communities-crime/communities-crime-processed.csv')

In [86]:
df.head()

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,black,high-crime
0,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,...,0.42,0.5,0.51,0.64,0.12,0.26,0.2,0.32,0,0
1,0.0,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.5,0.34,0.6,0.52,0.02,0.12,0.45,0.0,1,0
2,0.0,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.0,1,0
3,0.04,0.77,1.0,0.08,0.12,0.1,0.51,0.5,0.34,0.21,...,0.3,0.73,0.64,0.65,0.02,0.39,0.28,0.0,1,0
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.0,0,0


In [89]:
df[protected_columns].dtypes

racepctblack      float64
racePctWhite      float64
racePctAsian      float64
racePctHisp       float64
agePct12t21       float64
agePct12t29       float64
agePct16t24       float64
agePct65up        float64
pctWWage          float64
whitePerCap       float64
blackPerCap       float64
indianPerCap      float64
AsianPerCap       float64
HispPerCap        float64
MalePctDivorce    float64
MalePctNevMarr    float64
FemalePctDiv      float64
black               int64
dtype: object