# Simple Analysis of Crime Data
---
Means, Modes, Frequencies, etc.

** Author(s):** Eoin Doherty

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Add other paths here
crimeData = '../Data/Crime_Data_Clean_Consolidated.csv'
weatherData = '../Data/Weather_Final.csv'
unemploymentData = '../Data/UnemploymentRate.csv'

premiseData = '../Data/PremiseDescriptions.csv'
statusData = '../Data/StatusCodeDescriptions.csv'
areaData = '../Data/AreaDescriptions.csv'
weaponData = '../Data/WeaponDescriptions.csv'
crimeCodeData = '../Data/CrimeCodeDescriptions.csv'

Load every dataset.

Each dataframe is in a separate cell if you need to reload it later. 

In [3]:
dfCrime = pd.read_csv(crimeData).drop(["Unnamed: 0"], axis=1)

In [4]:
dfWeath = pd.read_csv(weatherData)

In [5]:
dfUnemploy = pd.read_csv(unemploymentData)

In [6]:
dfPremise = pd.read_csv(premiseData)
dfStatus = pd.read_csv(statusData)
dfArea = pd.read_csv(areaData)
dfWeapon = pd.read_csv(weaponData)
dfCC = pd.read_csv(crimeCodeData)

In [7]:
dfCrime.head()

Unnamed: 0,ID,DateReported,DateOccurred,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,MOCodes,VictimAge,VictimSex,...,DailyAverageDryBulbTemp,DailyAverageRelativeHumidity,DailySunrise,DailySunset,DailyPrecip,DailySnowfall,DailySnowDepth,DailyAverageWindSpeed,DailyAverageHeatIndex,night
0,180204779,2018-01-17,2010-01-01,00:01:00,2,245,813,0510 0522 0558 1258 0602,-1,F,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
1,172020383,2017-11-12,2010-01-01,00:01:00,20,2074,354,0100,51,F,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
2,171913109,2017-06-13,2010-01-01,00:01:00,19,1994,354,0377 1822,42,F,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
3,172013071,2017-06-28,2010-01-01,00:01:00,20,2025,820,1257 0550,-1,F,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
4,171809308,2017-04-05,2010-01-01,00:01:00,18,1891,760,0515 0913 1817 1820 0516 0500 0506,18,F,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1


In [8]:
list(dfCrime)

['ID',
 'DateReported',
 'DateOccurred',
 'TimeOccurred',
 'AreaID',
 'ReportingDistrict',
 'CrimeCode',
 'MOCodes',
 'VictimAge',
 'VictimSex',
 'VictimDescent',
 'PremiseCode',
 'WeaponUsedCode',
 'StatusCode',
 'CrimeCode1',
 'CrimeCode2',
 'CrimeCode3',
 'CrimeCode4',
 'Address',
 'CrossStreet',
 'Location',
 'UnemploymentRate',
 'WeatherDate',
 'ReportType',
 'HourlyVisibility',
 'HourlyDryBulbTempF',
 'HourlyRelativeHumidity',
 'HourlyPrecip',
 'HourlyHeatIndex',
 'DailyMaximumDryBulbTemp',
 'DailyMinimumDryBulbTemp',
 'DailyAverageDryBulbTemp',
 'DailyAverageRelativeHumidity',
 'DailySunrise',
 'DailySunset',
 'DailyPrecip',
 'DailySnowfall',
 'DailySnowDepth',
 'DailyAverageWindSpeed',
 'DailyAverageHeatIndex',
 'night']

### Averages
---
Some averages to establish a sort of baseline.

In [9]:
trueHI = dfCrime.loc[dfCrime["DailyAverageHeatIndex"] < 999]["DailyAverageHeatIndex"]

realTrueHI = dfWeath.loc[dfWeath["DailyAverageHeatIndex"] < 999]["DailyAverageHeatIndex"];

In [10]:
meanVictimAge = np.mean(dfCrime["VictimAge"])
meanUnemploymentRate = np.mean(dfCrime["UnemploymentRate"])
meanHI = np.mean(trueHI)

print("Average victim age:", meanVictimAge)
print("\"Average\" unemployment rate:", meanUnemploymentRate)
print("\"Average\" Heat Index:", meanHI)

Average victim age: 32.97893772232137
"Average" unemployment rate: 8.649744172380776
"Average" Heat Index: 77.65726601980991


Now to compare them to the means of all of the data

In [11]:
realMeanUR = np.mean(dfUnemploy["UnemploymentRate"])
realMeanHI = np.mean(realTrueHI)

print("Average unemployment rate:", realMeanUR)
print("Average Heat Index:", realMeanHI)

Average unemployment rate: 7.652976190476189
Average Heat Index: 77.30146660262312


The averages taken from the crime data are skewed since not every day is represented equally.

The average unemployment rate is about 1 percent higher in the crime data than the average from the unemployment data. The heat index from the crime data and the weather data is about the same. This suggests that crime increases with unemployment rate, and weather does not have mush of an effect on crime.

## Frequencies
---
The frequencies of some nominal data from the crime dataset (using percent for better interpretation at a glance).

In [12]:
crimeLen = len(dfCrime)
crimeLen

1660884

Some information about the area and victims

In [13]:
areaIDFreq = dfCrime["AreaID"].value_counts()/crimeLen * 100

In [14]:
premiseFreq = dfCrime["PremiseCode"].value_counts()/crimeLen * 100

In [15]:
victimDescFreq = dfCrime["VictimDescent"].value_counts()/crimeLen * 100
victimSexFreq = dfCrime["VictimSex"].value_counts()/crimeLen * 100

In [16]:
# Crime code counts
crimeCodeCounts = dfCrime["CrimeCode"].value_counts()
crimeCode1Counts = dfCrime["CrimeCode1"].value_counts()
crimeCode2Counts = dfCrime["CrimeCode2"].value_counts()
crimeCode3Counts = dfCrime["CrimeCode3"].value_counts()
crimeCode4Counts = dfCrime["CrimeCode4"].value_counts()

crimeCodeFreqs = pd.concat([dfCrime["CrimeCode"], dfCrime["CrimeCode1"], dfCrime["CrimeCode2"], dfCrime["CrimeCode3"], dfCrime["CrimeCode4"]]).value_counts()/(crimeLen*5) * 100


In [17]:
# Status code counts
statusCodeFreq = dfCrime["StatusCode"].value_counts()/crimeLen * 100

In [18]:
# Weapon code counts
weaponCodeFreq = dfCrime["WeaponUsedCode"].value_counts()/crimeLen * 100

In [19]:
isNightFreq = dfCrime["night"].value_counts()/crimeLen * 100

### Translating into English

Translate the areaIDs

In [20]:
dfArea.head()

Unnamed: 0,AreaID,AreaName
0,10,West Valley
1,14,Pacific
2,15,N Hollywood
3,18,Southeast
4,5,Harbor


In [21]:
d = dict(areaIDFreq)
readableAreaID = {}

for k in d.keys():
    name = list(dfArea.loc[dfArea["AreaID"] == k]["AreaName"])[0]
    readableAreaID[name] = d[k]
pd.Series(readableAreaID).sort_values(ascending = False)

77th Street    6.967615
Southwest      6.452347
N Hollywood    5.439272
Pacific        5.286643
Southeast      5.265449
Mission        5.044302
Northeast      4.816110
Van Nuys       4.772880
Newton         4.701292
Devonshire     4.667514
Topanga        4.635303
Hollywood      4.582921
Harbor         4.451846
Olympic        4.446247
Central        4.283863
West Valley    4.272845
Rampart        4.223414
West LA        4.179040
Wilshire       4.034237
Foothill       3.834886
Hollenbeck     3.641976
dtype: float64

Translate the premise codes

In [22]:
d = dict(premiseFreq)
readablePremiseFreq = {}

for k in d.keys():
    name = list(dfPremise.loc[dfPremise["PremiseCode"] == k]["PremiseDescription"])[0]
    readablePremiseFreq[name] = d[k]
pd.Series(readablePremiseFreq).sort_values(ascending = False)

STREET                                          22.273922
SINGLE FAMILY DWELLING                          20.633349
MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)    12.869291
PARKING LOT                                      7.117053
SIDEWALK                                         4.992341
OTHER BUSINESS                                   4.480325
VEHICLE, PASSENGER/TRUCK                         3.808695
DRIVEWAY                                         2.050294
GARAGE/CARPORT                                   1.785555
DEPARTMENT STORE                                 1.400820
RESTAURANT/FAST FOOD                             1.339287
MARKET                                           1.034690
OTHER STORE                                      0.793553
PARKING UNDERGROUND/BUILDING                     0.699146
YARD (RESIDENTIAL/BUSINESS)                      0.699146
PARK/PLAYGROUND                                  0.685779
OTHER PREMISE                                    0.668560
HIGH SCHOOL   

Translate the crime codes

In [23]:
d = dict(crimeCodeFreqs)
readableCC = {}
for k in d.keys():
    try:
        name = list(dfCC.loc[dfCC["CrimeCode"] == k]["CrimeCodeDescription"])[0]
    except IndexError:
        if k == -1:
            name = "UNDEFINED"
        else:
            name = str(k)
    readableCC[name] = d[k]
crimes = pd.Series(readableCC).sort_values(ascending = False)

Translate the status codes

In [24]:
d = dict(statusCodeFreq)
readableStatusCodeFreq = {}

for k in d.keys():
    name = list(dfStatus.loc[dfStatus["StatusCode"] == k]["StatusDescription"])[0]
    readableStatusCodeFreq[name] = d[k]
pd.Series(readableStatusCodeFreq).sort_values(ascending = False)

Invest Cont     77.319488
Adult Other     11.359011
Adult Arrest    10.282055
Juv Arrest       0.790182
Juv Other        0.247459
UNK              0.000060
dtype: float64

Translate the weapon codes

In [25]:
dfWeapon.head()

Unnamed: 0,WeaponUsedCode,WeaponDescription
0,-1,
1,400,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)"
2,102,HAND GUN
3,106,UNKNOWN FIREARM
4,500,UNKNOWN WEAPON/OTHER WEAPON


In [26]:
d = dict(weaponCodeFreq)
readableWeaponCodeFreq = {}

for k in d.keys():
    name = list(dfWeapon.loc[dfWeapon["WeaponUsedCode"] == k]["WeaponDescription"])[0]
    readableWeaponCodeFreq[name] = d[k]
pd.Series(readableWeaponCodeFreq).sort_values(ascending = False)

STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)    20.148848
VERBAL THREAT                                      2.750222
UNKNOWN WEAPON/OTHER WEAPON                        2.584226
HAND GUN                                           1.595957
SEMI-AUTOMATIC PISTOL                              0.630869
KNIFE WITH BLADE 6INCHES OR LESS                   0.592516
OTHER KNIFE                                        0.439766
UNKNOWN FIREARM                                    0.378413
VEHICLE                                            0.337230
REVOLVER                                           0.286052
BOTTLE                                             0.239330
BLUNT INSTRUMENT                                   0.210430
ROCK/THROWN OBJECT                                 0.205674
STICK                                              0.204650
CLUB/BAT                                           0.198629
FOLDING KNIFE                                      0.196161
SIMULATED GUN                           