In [29]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Load the consolidated crime CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
dfCrime = pd.read_csv("../Data/Crime_Data_Clean_Consolidated.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
dfCrime.head()

Unnamed: 0.1,Unnamed: 0,ID,DateReported,DateOccurred,TimeOccurred,AreaID,ReportingDistrict,CrimeCode,MOCodes,VictimAge,...,DailyAverageDryBulbTemp,DailyAverageRelativeHumidity,DailySunrise,DailySunset,DailyPrecip,DailySnowfall,DailySnowDepth,DailyAverageWindSpeed,DailyAverageHeatIndex,night
0,0,180204779,2018-01-17,2010-01-01,00:01:00,2,245,813,0510 0522 0558 1258 0602,-1,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
1,1,172020383,2017-11-12,2010-01-01,00:01:00,20,2074,354,0100,51,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
2,2,171913109,2017-06-13,2010-01-01,00:01:00,19,1994,354,0377 1822,42,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
3,3,172013071,2017-06-28,2010-01-01,00:01:00,20,2025,820,1257 0550,-1,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1
4,4,171809308,2017-04-05,2010-01-01,00:01:00,18,1891,760,0515 0913 1817 1820 0516 0500 0506,18,...,60,44,06:59:00,16:56:00,0.0,0.0,0.0,0.9,82.6,1


Isolate the crime codes

In [11]:
# Keep only the crime-code columns: "CrimeCode" plus "CrimeCode1" .. "CrimeCode4".
crimeCodes = dfCrime.loc[:, ["CrimeCode"] + [f"CrimeCode{slot}" for slot in range(1, 5)]]
crimeCodes.head(15)

Unnamed: 0,CrimeCode,CrimeCode1,CrimeCode2,CrimeCode3,CrimeCode4
0,813,813,-1,-1,-1
1,354,354,-1,-1,-1
2,354,354,-1,-1,-1
3,820,812,820,-1,-1
4,760,760,-1,-1,-1
5,121,121,-1,-1,-1
6,354,354,-1,-1,-1
7,354,354,-1,-1,-1
8,860,860,-1,-1,-1
9,121,121,812,-1,-1


This is a little weird. Some crime codes are in both `CrimeCode` and `CrimeCode1` or `CrimeCode` and `CrimeCode2`. There could be some situations where only `CrimeCode` is entered or `CrimeCode` contains a unique crime code, so I won't drop `CrimeCode` entirely.

In [36]:
# Build one transaction per row: collapse the row's code columns into a set
# (which removes repeated codes), discard the -1 values, and keep the rest as a list.
crimeLists = [
    list(set(codes) - {-1})
    for codes in crimeCodes.itertuples(index=False)
]

Any more cleaning?

In [40]:
# Sanity check: after the -1 values are stripped, no row should be left with an
# empty transaction. This is asserted explicitly instead of relying on
# crimeLists.index([]) raising ValueError, so a clean result no longer stops a
# top-to-bottom run of the notebook.
emptyTransactions = sum(1 for codes in crimeLists if not codes)
assert emptyTransactions == 0, f"{emptyTransactions} incidents have no crime code"

Good! Time to transform this list of lists into something mlxtend can use for [apriori](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/).

This might be a little expensive...

In [41]:
# One-hot encode the transactions: fit the encoder on the crime-code lists and
# transform them in the same call, then label each column with its crime code.
te = TransactionEncoder()
teItems = te.fit_transform(crimeLists)
dfTransformed = pd.DataFrame(data=teItems, columns=te.columns_)

In [45]:
# Peek at the encoded transactions: one boolean column per crime code.
dfTransformed.head()

Unnamed: 0,93,99,110,111,113,121,122,210,220,230,...,980,986,990,993,994,995,996,997,998,999
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Now for apriori:

In [46]:
# Minimum support of 1% since the data is pretty sparse.
# use_colnames is not set here, so itemsets are reported as column positions
# of dfTransformed rather than as the crime codes themselves.
apriori(dfTransformed, min_support=0.01)

Unnamed: 0,support,itemsets
0,0.040058,[7]
1,0.042905,[9]
2,0.072255,[17]
3,0.076859,[19]
4,0.014378,[20]
5,0.035456,[21]
6,0.06313,[30]
7,0.040686,[32]
8,0.071616,[44]
9,0.022336,[46]


These are only 1-itemsets (single crime codes) :/

In [47]:
# Halve the threshold to 0.5% to see whether itemsets with more than one code show up.
apriori(dfTransformed, min_support=0.005)

Unnamed: 0,support,itemsets
0,0.040058,[7]
1,0.005695,[8]
2,0.042905,[9]
3,0.005817,[13]
4,0.072255,[17]
5,0.006232,[18]
6,0.076859,[19]
7,0.014378,[20]
8,0.035456,[21]
9,0.006652,[26]


Some 2-itemsets here. That's better. These will still be found if we lower the minimum support further...

In [48]:
# Lower the threshold again, to 0.1%.
apriori(dfTransformed, min_support=0.001)

Unnamed: 0,support,itemsets
0,0.001359,[2]
1,0.004669,[5]
2,0.040058,[7]
3,0.005695,[8]
4,0.042905,[9]
5,0.005817,[13]
6,0.002809,[14]
7,0.001289,[16]
8,0.072255,[17]
9,0.006232,[18]


There are about 1.6 million crimes here, so 0.1% of that is still 1600 crimes with the same crime codes. This minimum support, while small, is still not too specific.

It would be interesting to see if at least 500 crimes shared the same crime code. To capture that, we would need a minimum support of ~0.0003.

This might take a while.

In [57]:
# Support threshold of 0.0003. use_colnames=True makes the itemsets hold the
# crime codes (the column names of dfTransformed) instead of column positions.
# The result is kept in freqCrimes for the rest of the analysis.
freqCrimes = apriori(dfTransformed, min_support=0.0003, use_colnames=True)

In [58]:
# Display the frequent itemsets found at the 0.0003 threshold.
freqCrimes

Unnamed: 0,support,itemsets
0,0.001359,[110]
1,0.004669,[121]
2,0.000552,[122]
3,0.040058,[210]
4,0.005695,[220]
5,0.042905,[230]
6,0.000798,[231]
7,0.000814,[235]
8,0.005817,[236]
9,0.002809,[237]


That's better

In [69]:
# Record how many crime codes each frequent itemset contains.
freqCrimes["length"] = freqCrimes["itemsets"].apply(len)
freqCrimes.head()

Unnamed: 0,support,itemsets,length
0,0.001359,[110],1
1,0.004669,[121],1
2,0.000552,[122],1
3,0.040058,[210],1
4,0.005695,[220],1


Now to import the labels

In [72]:
# Load the lookup table that pairs each CrimeCode with its CrimeCodeDescription.
# The path is relative, so it resolves against the notebook's working directory.
dfLabels = pd.read_csv("../Data/CrimeCodeDescriptions.csv")
dfLabels.head()

Unnamed: 0,CrimeCode,CrimeCodeDescription
0,510,VEHICLE - STOLEN
1,440,THEFT PLAIN - PETTY ($950 & UN
2,626,INTIMATE PARTNER - SIMPLE ASSA
3,900,VIOLATION OF COURT ORDER
4,740,VANDALISM - FELONY ($400 & OVE


In [87]:
def lookupCode(code):
    """Return the descriptions recorded for ``code`` in the global ``dfLabels``.

    Every ``CrimeCodeDescription`` whose ``CrimeCode`` equals ``code`` is
    returned, in table order, as a plain list. The list is empty when no row
    matches.
    """
    matches = dfLabels["CrimeCode"] == code
    return dfLabels.loc[matches, "CrimeCodeDescription"].tolist()

def translate(row):
    """Translate the crime codes in ``row["itemsets"]`` into descriptions.

    Each code is looked up with ``lookupCode`` and the matches are concatenated
    in iteration order. A code without any description contributes nothing, so
    the result can be shorter than the itemset.
    """
    return [
        description
        for code in row["itemsets"]
        for description in lookupCode(code)
    ]

In [88]:
# Row-wise: attach the descriptions of the codes in each itemset as a new column.
freqCrimes["translated"] = freqCrimes.apply(translate, axis=1)

In [90]:
# Keep only the itemsets made up of more than one crime code.
nFreqItems = freqCrimes.loc[freqCrimes["length"].gt(1)]

In [91]:
# Show the itemsets that contain more than one crime code.
nFreqItems

Unnamed: 0,support,itemsets,length,translated
81,0.000597,"[110, 998]",2,[CRIMINAL HOMICIDE]
82,0.001856,"[121, 998]",2,"[RAPE, FORCIBLE]"
83,0.003902,"[210, 998]",2,[ROBBERY]
84,0.000638,"[220, 998]",2,[ATTEMPTED ROBBERY]
85,0.008848,"[230, 998]",2,"[ASSAULT WITH DEADLY WEAPON, AG]"
86,0.000657,"[236, 998]",2,[INTIMATE PARTNER - AGGRAVATED ]
87,0.000532,"[251, 998]",2,[SHOTS FIRED AT INHABITED DWELL]
88,0.00607,"[310, 998]",2,[BURGLARY]
89,0.000495,"[320, 998]",2,"[BURGLARY, ATTEMPTED]"
90,0.00252,"[330, 998]",2,[BURGLARY FROM VEHICLE]


In [95]:
# The itemsets shown above all include 998 but list one code's description only;
# look 998 up directly to see what dfLabels has for it.
lookupCode(998)

[]

Code 998 is actually a filler

In [101]:
# Flag (1/0) every itemset that includes code 998, which has no entry in dfLabels.
freqCrimes["hasFiller"] = freqCrimes["itemsets"].apply(lambda codes: int(998 in codes))

In [105]:
# Drop the itemsets flagged as containing the 998 filler code.
useful = freqCrimes.loc[freqCrimes["hasFiller"].eq(0)]

In [114]:
# Show the 30 filler-free itemsets with the highest support, largest first.
useful.sort_values(by="support", ascending=False).head(30)

Unnamed: 0,support,itemsets,length,translated,hasFiller
33,0.09204,[624],1,[BATTERY - SIMPLE ASSAULT],0
30,0.076893,[510],1,[VEHICLE - STOLEN],0
13,0.076859,[330],1,[BURGLARY FROM VEHICLE],0
11,0.072255,[310],1,[BURGLARY],0
26,0.071616,[440],1,[THEFT PLAIN - PETTY ($950 & UN],0
20,0.06313,[354],1,[THEFT OF IDENTITY],0
35,0.054711,[626],1,[INTIMATE PARTNER - SIMPLE ASSA],0
46,0.051024,[740],1,[VANDALISM - FELONY ($400 & OVE],0
47,0.045322,[745],1,[VANDALISM - MISDEAMEANOR ($399],0
5,0.042905,[230],1,"[ASSAULT WITH DEADLY WEAPON, AG]",0


The most frequent two item sets:

In [110]:
# Rebuild nFreqItems from the filler-free itemsets: keep those with more than one
# crime code and order them from highest to lowest support.
nFreqItems = (
    useful.loc[useful["length"] > 1]
    .sort_values(by="support", ascending=False)
)
# Print each itemset's support next to its translated descriptions.
for support, labels in zip(nFreqItems["support"], nFreqItems["translated"]):
    print(support, labels)

0.0009741800149799746 ['CRM AGNST CHLD (13 OR UNDER) (', 'BATTERY WITH SEXUAL CONTACT']
0.0007899407785251709 ['BRANDISH WEAPON', 'CRIMINAL THREATS - NO WEAPON D']
0.0005262257930114325 ['THEFT PLAIN - PETTY ($950 & UN', 'BIKE - STOLEN']
0.0004021954573588523 ['THEFT OF IDENTITY', 'DOCUMENT FORGERY / STOLEN FELO']


# Results
---
The most frequent individual crimes are 

| Crime |
|-|
|BATTERY - SIMPLE ASSAULT|
|VEHICLE - STOLEN|
|BURGLARY FROM VEHICLE|
|BURGLARY|
|THEFT PLAIN - PETTY ($950 & UN|

The most frequent 2 itemsets for crime codes are 

| Crime 1 | Crime 2 |
|-|-|
|CRM AGNST CHLD (13 OR UNDER) | BATTERY WITH SEXUAL CONTACT|
|BRANDISH WEAPON|CRIMINAL THREATS - NO WEAPON D|
|THEFT PLAIN - PETTY ($950 & UN | BIKE - STOLEN|
|THEFT OF IDENTITY| DOCUMENT FORGERY / STOLEN FELONY|