In [1]:
import requests
import json
import pandas
from tqdm import tqdm

from bs4 import BeautifulSoup

from data.models import AttachmentFile
from data_importer.ipra_portal_crawler.crawler import OpenIpraInvestigationCrawler, HEADERS, ComplaintCrawler

In [2]:
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix (1 -> "1st").

    Args:
        n: An integer.

    Returns:
        A string such as "2nd", "3rd", "11th" or "25th".

    Floor division (``//``) is used to extract the tens digit so the result
    is the same on Python 2 and Python 3; with true division the teens
    check fails and 11/12/13 come out as "11st"/"12nd"/"13rd".
    """
    tens_digit = n // 10 % 10
    last_digit = n % 10
    # Teens (11th, 12th, 13th) and final digits 0 or 4-9 all take "th".
    if tens_digit == 1 or last_digit > 3:
        suffix = "th"
    else:
        suffix = ("th", "st", "nd", "rd")[last_digit]
    return "%d%s" % (n, suffix)

## Crawl complaint data from the IPRA website
Source: http://www.chicagocopa.org/wp-content/themes/copa/DynamicSearch.php

In [3]:
# Ask OpenIpraInvestigationCrawler for the list of links, then run a
# ComplaintCrawler on each one; tqdm wraps the iteration with a progress bar.
links = OpenIpraInvestigationCrawler().crawl()
incidents = [ComplaintCrawler(link).crawl() for link in tqdm(links)]

100%|██████████| 195/195 [02:21<00:00,  1.02it/s]


## Clean up data

In [4]:
# Build one row per crawled incident; the 'attachments' field is left out
# of the frame.
df = pandas.DataFrame.from_records(
    incidents,
    exclude=['attachments'],
)
df

Unnamed: 0,date,district,log_number,subjects,time,type
0,08-24-2014,11,1071166,[Roshad McIntosh],08-24-2014 7:08 pm,Firearm Discharge
1,07-21-2017,09,1086011,"[Donzell Grant, Dante Jeffries, Cortez Harring...",07-21-2017 1:38 pm,Firearm Discharge
2,07-16-2017,25,1085949,[Saul Flores],07-16-2017 3:07 am,Firearm Discharge
3,07-11-2017,15,1085898,[Delrius Wilkins],07-09-2017 1:54 am,Other Use Of Force
4,01-23-2016,03,1078979,[Bryan Nelson],01-22-2016 10:58 pm,Other Use Of Force
5,07-09-2017,25,1085876,[Brayant Alvarez],07-09-2017 11:35 am,Firearm Discharge
6,07-02-2017,10,1085782,[Quentica Locke],07-01-2017 10:54 pm,Firearm Discharge
7,06-30-2017,15,1085761,[Delbert Maddox],06-30-2017 9:37 am,Taser Discharge
8,06-27-2017,11,1085722,[Taylor Clark],06-27-2017 1:13 am,Other Use Of Force
9,06-27-2017,Other,1085672,[Unknown],06-18-2017 10:08 pm,Firearm Discharge


## Map areas to complaints

#### Read police district ids from the database

In [5]:
# Load the (name, id) pairs of every 'police-districts' area from the
# 'import' database alias into a frame.
# NOTE(review): `Area` is not imported in the visible cells -- presumably it
# is already present in the kernel namespace; confirm before a fresh run.
df_police_districts = pandas.DataFrame.from_records(
    Area.objects
    .using('import')
    .filter(area_type='police-districts')
    .values('name', 'id')
)

#### Get the corresponding area ids for each district

In [6]:
def get_area_ids(data):
    """Map a comma-separated district string to police-district Area ids.

    Each comma-separated token is stripped and parsed as an integer, turned
    into its ordinal name with ``ordinal`` (e.g. "11" -> "11th") and looked
    up through ``Area.objects.using('import')`` with
    ``area_type='police-districts'``.

    Args:
        data: A string of district numbers separated by commas.

    Returns:
        A list with the ``id`` of every Area that was found, in token order.
        Tokens that are not integers (such as "Other") and names that do not
        resolve to exactly one Area are skipped, so the list may be empty.

    Only the expected "no usable match" failures are skipped. Any other
    error (for instance ``Area`` missing from the namespace or a database
    failure) propagates instead of silently producing an empty list.

    NOTE(review): ``Area`` is not imported in the visible cells; it is
    assumed to be available in the kernel namespace -- confirm.
    """
    ids = []
    for token in data.split(','):
        try:
            number = int(token.strip())
        except ValueError:
            # Non-numeric label: there is no district number to look up.
            continue

        try:
            area = Area.objects.using('import').get(
                area_type='police-districts',
                name=ordinal(number),
            )
        except (Area.DoesNotExist, Area.MultipleObjectsReturned):
            # No unambiguous match for this district name: skip it.
            continue

        ids.append(area.id)

    return ids
    
# Add an 'areas' column holding, for every row, the list of ids that
# get_area_ids returns for that row's 'district' value.
district_column = df['district']
df['areas'] = district_column.map(get_area_ids)

#### Print out data

In [7]:
# Show the frame again, now including the 'areas' column added above it.
df

Unnamed: 0,date,district,log_number,subjects,time,type,areas
0,08-24-2014,11,1071166,[Roshad McIntosh],08-24-2014 7:08 pm,Firearm Discharge,[1546]
1,07-21-2017,09,1086011,"[Donzell Grant, Dante Jeffries, Cortez Harring...",07-21-2017 1:38 pm,Firearm Discharge,[1550]
2,07-16-2017,25,1085949,[Saul Flores],07-16-2017 3:07 am,Firearm Discharge,[1532]
3,07-11-2017,15,1085898,[Delrius Wilkins],07-09-2017 1:54 am,Other Use Of Force,[1547]
4,01-23-2016,03,1078979,[Bryan Nelson],01-22-2016 10:58 pm,Other Use Of Force,[1535]
5,07-09-2017,25,1085876,[Brayant Alvarez],07-09-2017 11:35 am,Firearm Discharge,[1532]
6,07-02-2017,10,1085782,[Quentica Locke],07-01-2017 10:54 pm,Firearm Discharge,[1548]
7,06-30-2017,15,1085761,[Delbert Maddox],06-30-2017 9:37 am,Taser Discharge,[1547]
8,06-27-2017,11,1085722,[Taylor Clark],06-27-2017 1:13 am,Other Use Of Force,[1546]
9,06-27-2017,Other,1085672,[Unknown],06-18-2017 10:08 pm,Firearm Discharge,[]


#### Export data to csv file

In [8]:
# Write the frame to a CSV file in the current working directory.
# The index is written too (pandas default), and the list-valued 'areas'
# column is stored as its string representation.
df.to_csv(path_or_buf='ipra_crawled_complaint_data.csv')