In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.constants import MEDIA_TYPE_DOCUMENT
from data.models import Allegation, AttachmentFile
from data_importer.base.cleaner import DataCleaner, strip
from data_importer.base.storage import AzureStorage
from document_cloud.document_clould_uploader import DocumentCloudUploader, DOCUMENT_TITLE_KEYWORD_TYPE_MAPPING

In [2]:
# Handle on the storage share that the crawled CSV is read from.
# The account key is taken from the AZURE_STORAGE_ACCOUNT_KEY environment
# variable so the secret never has to be written into the notebook; the
# original placeholder is kept as the fallback, so the cell behaves exactly as
# before when the variable is not set.
storage = AzureStorage(
    account_name='cpdbv2data',
    account_key=os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', 'your-key-here'),
    share='crawled-data'
)

# Database alias passed to every `.using(...)` call in this notebook.
import_db = 'import'

## Read the data

In [3]:
# Load only the four columns used by this notebook, then rename `type` and
# `link` to the `file_type` / `original_url` names the later cells read.
attachments_csv = storage.path_for('/ipra_crawled_complaint_attachments_data.csv')
column_renames = {'type': 'file_type', 'link': 'original_url'}

df = pd.read_csv(
    attachments_csv,
    usecols=['crid', 'type', 'link', 'title']
).rename(columns=column_renames)

## Clean the data

In [4]:
# Apply the `strip` cleaner to the crid column only.
# NOTE(review): `strip` presumably trims surrounding whitespace - confirm in
# data_importer.base.cleaner.
crid_cleaner = DataCleaner(schema={'crid': [strip]})
df = crid_cleaner.perform(df)

100%|██████████| 1/1 [00:00<00:00, 22.65it/s]


#### Exclude attachments whose associated crids are not in the imported complaints data

In [5]:
# Every crid present in the import database, held as a set so the membership
# check in the next cell is a constant-time lookup.
imported_allegations = Allegation.objects.using(import_db).all()
crids = set(imported_allegations.values_list('crid', flat=True))

In [6]:
# Keep only the attachments whose crid belongs to an imported allegation.
# `isin` is the vectorized form of the per-row `x in crids` test and always
# yields a boolean mask, including when the frame is empty.
df = df[df['crid'].isin(crids)]

#### Add url field, tag field and additional_info field

In [7]:
def get_attachment_tag(row):
    """Return the tag for one attachment row.

    For rows whose ``file_type`` equals ``MEDIA_TYPE_DOCUMENT`` the tag is the
    value mapped to the first key of ``DOCUMENT_TITLE_KEYWORD_TYPE_MAPPING``
    (in the mapping's iteration order) that the title starts with, or
    ``'Other'`` when no key matches. For any other file type the tag is the
    capitalized file type.

    :param row: mapping-like row exposing ``'file_type'`` and ``'title'``.
    :return: the tag string.
    """
    kind = row['file_type']
    heading = row['title']

    # Non-document media are tagged by their own type name.
    if not kind == MEDIA_TYPE_DOCUMENT:
        return kind.capitalize()

    matching_tags = (
        DOCUMENT_TITLE_KEYWORD_TYPE_MAPPING[keyword]
        for keyword in DOCUMENT_TITLE_KEYWORD_TYPE_MAPPING
        if heading.startswith(keyword)
    )
    return next(matching_tags, 'Other')

In [8]:
# Add the three extra columns the import step passes on to AttachmentFile.
tqdm.pandas(tqdm_notebook(), desc='Adding url field, tag field and addition_info field')
df = df.assign(
    # The url starts out identical to the crawled link, so reuse the column
    # directly instead of copying it one row at a time.
    url=df['original_url'],
    # The tag depends on both title and file_type, so it stays a row-wise apply.
    tag=df.progress_apply(get_attachment_tag, axis=1),
    # One fresh, empty dict per row, so no two rows share the same object.
    additional_info=[{} for _ in range(len(df))]
)

A Jupyter Widget

Adding url field, tag field and addition_info field: 100%|██████████| 576/576 [00:00<00:00, 50335.84it/s]
Adding url field, tag field and addition_info field: 100%|██████████| 576/576 [00:00<00:00, 36501.97it/s]
Adding url field, tag field and addition_info field: 100%|██████████| 576/576 [00:00<00:00, 141240.52it/s]


#### Get the ids of the allegations that the documents are associated with

In [9]:
# Lookup table of (id, crid) pairs for every allegation in the import database.
allegation_records = Allegation.objects.using(import_db).all().values('id', 'crid')
df_allegations = pd.DataFrame.from_records(allegation_records)

In [10]:
def get_allegation_id(crid, allegations=None):
    """Return the id of the first allegation whose crid equals ``crid``.

    :param crid: the crid value to look up.
    :param allegations: optional DataFrame with ``'id'`` and ``'crid'``
        columns to search; defaults to the notebook-level ``df_allegations``.
    :return: the ``id`` of the first matching row, or ``np.nan`` when no row
        matches.
    """
    if allegations is None:
        # Resolved at call time so the frame built in the previous cell is used.
        allegations = df_allegations

    df_found = allegations[allegations['crid'] == crid]
    if df_found.empty:
        # `np.nan` rather than `np.NaN`: the capitalized alias was removed in NumPy 2.0.
        return np.nan
    return df_found.iloc[0]['id']

In [11]:
# Resolve each attachment's crid to an allegation id, then remove the crid
# column so that only allegation_id remains on the frame.
tqdm.pandas(tqdm_notebook(), desc='Get allegation id')
allegation_ids = df['crid'].progress_apply(get_allegation_id)
df['allegation_id'] = allegation_ids
del df['crid']

A Jupyter Widget

Get allegation id:   1%|          | 7/576 [00:00<00:08, 69.99it/s]




Get allegation id: 100%|██████████| 576/576 [00:09<00:00, 58.99it/s]


## Import complaint attachments data

#### Delete all existing attachments before importing

In [12]:
# Remove every AttachmentFile row from the import database; the delete result
# is the last expression so the notebook displays it.
existing_attachments = AttachmentFile.objects.using(import_db).all()
existing_attachments.delete()

(0, {u'data.AttachmentFile': 0})

#### Import and print out the imported data

In [13]:
# Create one AttachmentFile per row, passing the row's columns as keyword
# arguments, then read the table back so the notebook displays what was stored.
tqdm.pandas(tqdm_notebook(), desc='Importing document')
attachment_files = AttachmentFile.objects.using(import_db)
df.progress_apply(lambda row: attachment_files.create(**row), axis=1)

pd.DataFrame.from_records(attachment_files.all().values())

A Jupyter Widget

Importing document: 100%|██████████| 576/576 [00:00<00:00, 744.69it/s]


Unnamed: 0,additional_info,allegation_id,file_type,id,original_url,tag,title,url
0,{},109233,document,1,http://www.chicagocopa.org/wp-content/uploads/...,AR,Arrest Report (Acuna),http://www.chicagocopa.org/wp-content/uploads/...
1,{},109233,document,2,http://www.chicagocopa.org/wp-content/uploads/...,AR,Arrest Report (Cruz Rodriguez),http://www.chicagocopa.org/wp-content/uploads/...
2,{},109233,document,3,http://www.chicagocopa.org/wp-content/uploads/...,AR,Arrest Report (Louis Rodriguez),http://www.chicagocopa.org/wp-content/uploads/...
3,{},109233,document,4,http://www.chicagocopa.org/wp-content/uploads/...,AR,Arrest Report (Oscar Rodriguez),http://www.chicagocopa.org/wp-content/uploads/...
4,{},109233,document,5,http://www.chicagocopa.org/wp-content/uploads/...,OCIR,Original Case Incident Report,http://www.chicagocopa.org/wp-content/uploads/...
5,{},109233,document,6,http://www.chicagocopa.org/wp-content/uploads/...,OBR,Officer's Battery Report (Fergus),http://www.chicagocopa.org/wp-content/uploads/...
6,{},109233,document,7,http://www.chicagocopa.org/wp-content/uploads/...,OBR,Officer's Battery Report (Konior),http://www.chicagocopa.org/wp-content/uploads/...
7,{},109233,document,8,http://www.chicagocopa.org/wp-content/uploads/...,OBR,Officer's Battery Report (Reyes),http://www.chicagocopa.org/wp-content/uploads/...
8,{},109233,document,9,http://www.chicagocopa.org/wp-content/uploads/...,OBR,Officer's Battery Report (Ridge),http://www.chicagocopa.org/wp-content/uploads/...
9,{},109233,document,10,http://www.chicagocopa.org/wp-content/uploads/...,OBR,Officer's Battery Report (Smith),http://www.chicagocopa.org/wp-content/uploads/...
