In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

from data.models import AttachmentFile, Allegation
from data_importer.base.cleaner import DataCleaner, strip
from data_importer.base.storage import AzureStorage

In [2]:
# Storage handle used below to resolve the path of the source CSV.
# The account key is read from the environment so that no credential is
# committed with the notebook: export AZURE_STORAGE_ACCOUNT_KEY before running
# (a missing variable fails fast with a KeyError).
storage = AzureStorage(account_name='cpdbv2data',
                       account_key=os.environ['AZURE_STORAGE_ACCOUNT_KEY'],
                       share='data-v1')
# Database alias passed to every `.using(...)` call in this notebook.
import_db = 'import'

## Read allegation documents data

In [3]:
# Only these columns of the attachment export are loaded.
attachment_columns = [
    'file_type', 'title', 'url', 'additional_info', 'tag', 'original_url', 'crid',
]
attachment_csv_path = storage.path_for('/allegation_attachment_v1_2.csv')
df = pd.read_csv(attachment_csv_path, usecols=attachment_columns)

## Clean allegation data

In [4]:
# Cleaning schema: the `crid` column is run through `strip`.
# NOTE(review): whether `perform` mutates `df` in place or only returns the
# cleaned frame is not visible from this notebook - confirm in DataCleaner.
crid_schema = {'crid': [strip]}
cleaner = DataCleaner(schema=crid_schema)
cleaner.perform(df)

100%|██████████| 1/1 [00:00<00:00, 30.14it/s]


Unnamed: 0,file_type,title,url,additional_info,tag,original_url,crid
0,document,CPB CR 1014796,http://documentcloud.org/documents/2646172-13S...,"{""normalized_title"": ""13SR2315"", ""documentclou...",CPB,http://documentcloud.org/documents/2646172-13S...,1014796
1,document,CRID 283121 CR,http://documentcloud.org/documents/2082448-cr-...,"{""normalized_title"": ""cr-283121"", ""documentclo...",CR,http://documentcloud.org/documents/2082448-cr-...,283121
2,document,CRID 309153 CR,https://www.documentcloud.org/documents/325241...,"{""normalized_title"": ""CRID-309153-CR"", ""docume...",CR,,309153
3,document,CRID 289138 CR,https://www.documentcloud.org/documents/325244...,"{""normalized_title"": ""CRID-289138-CR"", ""docume...",CR,,289138
4,document,CRID 289965 CR,https://www.documentcloud.org/documents/325245...,"{""normalized_title"": ""CRID-289965-CR"", ""docume...",CR,,289965
5,document,CRID 289177 CR,https://www.documentcloud.org/documents/361773...,"{""normalized_title"": ""CRID-289177-CR"", ""docume...",CR,,289177
6,document,CPB CR 1016176,http://documentcloud.org/documents/2646222-14S...,"{""normalized_title"": ""14SR232423252328"", ""docu...",CPB,http://documentcloud.org/documents/2646222-14S...,1016176
7,document,CRID 289862 CR,https://www.documentcloud.org/documents/325245...,"{""normalized_title"": ""CRID-289862-CR"", ""docume...",CR,,289862
8,document,CPB CR 1009860,http://documentcloud.org/documents/2646214-14S...,"{""normalized_title"": ""14SR2327"", ""documentclou...",CPB,http://documentcloud.org/documents/2646214-14S...,1009860
9,document,CPB CR 1002203,http://documentcloud.org/documents/2646167-13P...,"{""normalized_title"": ""13PB28352836"", ""document...",CPB,http://documentcloud.org/documents/2646167-13P...,1002203


#### Get the ids of the allegations that the documents are associated with

In [5]:
# Load the (id, crid) pairs of every Allegation in the import database into a
# DataFrame that serves as the crid -> id lookup table.
allegation_rows = Allegation.objects.using(import_db).all().values('id', 'crid')
df_allegations = pd.DataFrame.from_records(allegation_rows)

In [6]:
def get_allegation_id(crid):
    """Return the id of the allegation whose `crid` equals the given value.

    The lookup is done against the module-level `df_allegations` frame
    (columns `id` and `crid`).

    :param crid: value compared for equality with `df_allegations['crid']`.
    :return: the `id` of the first matching row, or `np.nan` when no row
        matches.
    """
    # Select only the `id` column of the matching rows instead of building a
    # full row first.
    matching_ids = df_allegations.loc[df_allegations['crid'] == crid, 'id']
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and would raise AttributeError on the not-found path.
    return matching_ids.iloc[0] if not matching_ids.empty else np.nan

In [7]:
# Register `progress_apply` on pandas objects. Only keyword arguments are
# passed: giving `tqdm.pandas` a `tqdm_notebook()` instance builds a second
# bar object that is never driven (the stray "A Jupyter Widget" output).
tqdm.pandas(desc='Get allegation id')
# Map every crid to its allegation id (NaN when no allegation matches).
df['allegation_id'] = df['crid'].progress_apply(get_allegation_id)
# `crid` was only needed for the lookup above; drop it from the frame.
del df['crid']

A Jupyter Widget

Get allegation id: 100%|██████████| 2145/2145 [00:35<00:00, 60.73it/s]


#### Drop all documents which are not associated with any allegation

In [8]:
# Keep only the rows whose crid lookup produced an allegation id.
has_allegation = df['allegation_id'].notnull()
df = df[has_allegation]

#### Replace all `NaN` with `''`

In [9]:
# Every remaining missing cell becomes an empty string; other cells are kept.
df = df.mask(df.isnull(), '')

## Import Allegation Document data

#### Delete all existing documents before importing

In [10]:
# Remove every AttachmentFile row from the import database before importing.
# The result of `delete()` is left as the cell's last expression so that it is
# still shown as the cell output.
existing_attachments = AttachmentFile.objects.using(import_db)
existing_attachments.delete()

(0, {u'data.AttachmentFile': 0})

#### Import allegation documents and print out the imported data

In [11]:
# Register `progress_apply` with keyword arguments only: giving `tqdm.pandas`
# a `tqdm_notebook()` instance builds a second bar object that is never driven
# (the stray "A Jupyter Widget" output).
tqdm.pandas(desc='Importing allegation document')
# One `update_or_create` call per row; every column of the row is passed as a
# keyword argument (no `defaults`), so all columns take part in the lookup.
df.progress_apply(lambda x: AttachmentFile.objects.using(import_db).update_or_create(**x), axis=1)

# Read the table back from the import database to show what was stored.
pd.DataFrame.from_records(AttachmentFile.objects.using(import_db).all().values())

A Jupyter Widget

Importing allegation document: 32it [00:00, 314.95it/s]             




Importing allegation document: 791it [00:02, 353.55it/s]


Unnamed: 0,additional_info,allegation_id,file_type,id,original_url,tag,title,url
0,"{""normalized_title"": ""13SR2315"", ""documentclou...",67017,document,1,http://documentcloud.org/documents/2646172-13S...,CPB,CPB CR 1014796,http://documentcloud.org/documents/2646172-13S...
1,"{""normalized_title"": ""cr-283121"", ""documentclo...",23775,document,2,http://documentcloud.org/documents/2082448-cr-...,CR,CRID 283121 CR,http://documentcloud.org/documents/2082448-cr-...
2,"{""normalized_title"": ""CRID-309153-CR"", ""docume...",49851,document,3,,CR,CRID 309153 CR,https://www.documentcloud.org/documents/325241...
3,"{""normalized_title"": ""CRID-289138-CR"", ""docume...",30033,document,4,,CR,CRID 289138 CR,https://www.documentcloud.org/documents/325244...
4,"{""normalized_title"": ""CRID-289965-CR"", ""docume...",30846,document,5,,CR,CRID 289965 CR,https://www.documentcloud.org/documents/325245...
5,"{""normalized_title"": ""CRID-289177-CR"", ""docume...",30071,document,6,,CR,CRID 289177 CR,https://www.documentcloud.org/documents/361773...
6,"{""normalized_title"": ""14SR232423252328"", ""docu...",68062,document,7,http://documentcloud.org/documents/2646222-14S...,CPB,CPB CR 1016176,http://documentcloud.org/documents/2646222-14S...
7,"{""normalized_title"": ""CRID-289862-CR"", ""docume...",30747,document,8,,CR,CRID 289862 CR,https://www.documentcloud.org/documents/325245...
8,"{""normalized_title"": ""14SR2327"", ""documentclou...",63931,document,9,http://documentcloud.org/documents/2646214-14S...,CPB,CPB CR 1009860,http://documentcloud.org/documents/2646214-14S...
9,"{""normalized_title"": ""13PB28352836"", ""document...",58317,document,10,http://documentcloud.org/documents/2646167-13P...,CPB,CPB CR 1002203,http://documentcloud.org/documents/2646167-13P...
