In [26]:
import collections
import unicodedata
import requests
from datetime import datetime, date, time
import csv
import pandas as pd

# Universal get by query

In [71]:
class ApiException(Exception): pass
class InvalidRequestException(ApiException): pass
class DataNotFoundException(ApiException): pass
class TooManyRecordsException(ApiException): pass

def get_raw_contracts_if_possible(params):
    raw_json = requests.get("http://openapi.clearspending.ru/restapi/v3/contracts/search/", params=params)
    if raw_json.text == "Invalid request.":
        raise InvalidRequestException(params)
    if raw_json.text == "Data not found.":
        raise DataNotFoundException(params)
        
    json = raw_json.json()["contracts"]
    contracts = json["data"]
    
    if (json["page"] == 1):
        total = json["total"]
        print 'total:', total,
        if total == 500:
            raise TooManyRecordsException(params)
        print '\t| 1',
        if total > 50:
            for page_num in range(2, total/50 + 2):
                params_with_page = params.copy()
                params_with_page['page'] = page_num
                print page_num,
                page_contracts = get_raw_contracts_if_possible(params_with_page)
                contracts.extend(page_contracts)
    return contracts

def get_deals_with_restriction(params, start_date, end_date):
    params_with_daterange = params.copy()
    params_with_daterange['daterange'] = start_date.strftime("%d.%m.%Y") + "-" + end_date.strftime("%d.%m.%Y")
    print params_with_daterange['daterange'], 
    try:
        contracts = get_raw_contracts_if_possible(params_with_daterange)
        print ''
    except TooManyRecordsException as e:
        print 'too many!'
        middle_date = start_date + (end_date - start_date)/2
        contracts = get_deals_with_restriction(params, start_date, middle_date)
        contracts2 = get_deals_with_restriction(params, middle_date, end_date)
        contracts.extend(contracts2)
    except Exception as e:
        print ''
        raise
    return contracts

# Preprocessing

In [5]:
def dict_to_plain_dict(branch, prefix):
    d = {}
    for key, value in branch.iteritems():
        if isinstance(value, dict):
            dd = dict_to_plain_dict(value, prefix + "_" + key if prefix != '' else key)
            d.update(dd)
        else:
            d[prefix + "_" + key if prefix != '' else key] = value
    return d

# Main

In [76]:
def get_raw_contracts_by_okdp_list(okdp_list, start_date, end_date):
    contracts = []
    for okdp in okdp_list:
        print '!okdp', okdp,
        params = {
            "okdp_okpd": okdp
        }
        try:
            okdp_contracts = get_deals_with_restriction(params, start_date, end_date)
            contracts.extend(okdp_contracts)
        except ApiException as e:
            print type(e), e
    return contracts

In [77]:
start_date = date(2014, 1, 1)
end_date = date(2015, 12, 31)

with open('okdp_list.txt', 'r') as okdp_list_file:
    okdp_list = okdp_list_file.read().split('\n')

raw_contracts = get_raw_contracts_by_okdp_list(okdp_list, start_date, end_date)

!okdp 92.20.11.110 01.01.2014-31.12.2015 total: 1 	| 1 
!okdp 92.20.11.111 01.01.2014-31.12.2015 total: 20 	| 1 
!okdp 92.20.11.112 01.01.2014-31.12.2015 total: 23 	| 1 
!okdp 92.20.11.120 01.01.2014-31.12.2015 total: 41 	| 1 
!okdp 92.20.11.121 01.01.2014-31.12.2015 total: 407 	| 1 2 3 4 5 6 7 8 9 
!okdp 92.20.11.122 01.01.2014-31.12.2015 total: 457 	| 1 2 3 4 5 6 7 8 9 10 
!okdp 92.20.11.130 01.01.2014-31.12.2015 total: 3 	| 1 
!okdp 92.20.11.131 01.01.2014-31.12.2015 total: 23 	| 1 
!okdp 92.20.11.132 01.01.2014-31.12.2015 total: 24 	| 1 
!okdp 92.20.11.140 01.01.2014-31.12.2015 total: 2 	| 1 
!okdp 92.20.11.141 01.01.2014-31.12.2015 total: 9 	| 1 
!okdp 92.20.11.142 01.01.2014-31.12.2015 total: 12 	| 1 
!okdp 92.20.11.190 01.01.2014-31.12.2015 total: 2 	| 1 
!okdp 92.20.11.191 01.01.2014-31.12.2015 total: 88 	| 1 2 
!okdp 92.20.11.192 01.01.2014-31.12.2015 total: 94 	| 1 2 
!okdp 92.20.12.110 01.01.2014-31.12.2015 total: 4 	| 1 
!okdp 92.20.12.111 01.01.2014-31.12.2015 total: 28 	|

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [78]:
plain_contracts = [dict_to_plain_dict(raw_contract, "") for raw_contract in raw_contracts]
raw_df = pd.DataFrame(plain_contracts)
df = raw_df[[
        'regNum',
        'signDate',
        'regionCode',
        'customer_fullName',
        'customer_inn',
        'customer_kpp',
        'suppliers',
        'economic_sectors',
        'finances_budget_code',
        'finances_budget_name',
        'finances_budgetLevel_code',
        'finances_financeSource',
        'products',
        'price',
        'currency_code',
        'singleCustomerReason_id',
        'singleCustomerReason_name',
        'contractUrl'
    ]]

df['signDate'] = pd.to_datetime(df['signDate'], format='%Y-%m-%dT%H:%M:%S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
df.to_csv("contracts.csv", sep='\t', encoding='utf-8', escapechar=' ')

In [79]:
df.to_excel("contracts.xlsx")