In [1]:
import pandas as pd
from itertools import combinations
from fuzzywuzzy import fuzz
from utils import load_data
import utils
import numpy as np

# Load data

In [2]:
# Load every dataset through the project helper; later cells read
# data["expenses"] and data["revenues"] from the returned mapping.
data = load_data()

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [3]:
# Clean the raw expenses table:
#   * in the three text columns, empty / whitespace-only cells become NaN and
#     the remaining values are stripped of surrounding whitespace;
#   * `value` is coerced to numeric, unparseable entries become NaN.
# The pattern is a raw string: "\s" inside a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
expenses = data["expenses"].pipe(
    lambda df: df.assign(
        **{
            col: df[col].replace(r"^\s*$", np.nan, regex=True).str.strip()
            for col in ("parent_name", "budget_type_name", "organization_name")
        },
        value=pd.to_numeric(df.value, errors="coerce"),
    )
)

# Expenses

## All ministries have top level budget type

For each year, check the number of unique ministries and compare it to the number of ministries that have a top-level budget type.

In [None]:
# Organizations that are deliberately left out of the printed report.
skipped_orgs = ['الدولة', 'الصندوق العام للتعويـــــض']

# For every year, print the year followed by each entry yielded by
# utils.top_level_budget that is not in the skip list.
for year in expenses.year.unique():
    print(year)
    for org in utils.top_level_budget(expenses, year):
        if org in skipped_orgs:
            continue
        print(org)

## Similar but not quite exactly the same organization names for the same year

In [None]:
# Within each year, compare every pair of distinct organization names and
# print the pairs whose token-set similarity is strictly between 80 and 100,
# i.e. close but not identical according to the fuzzy matcher.
for year in expenses.year.unique():
    print(year)
    yearly_orgs = expenses.loc[expenses.year == year, "organization_name"].unique()
    for pair in combinations(yearly_orgs, 2):
        similarity = fuzz.token_set_ratio(pair[0], pair[1])
        if similarity > 80 and similarity < 100:
            print(pair)

## State budget must equal the sum of ministries' budgets and state-level expenses

For each year, the sum of the ministries' budgets, نفقات طارئة و غير موزعة and الدين العمومي must equal ميزانية الدولة.

In [None]:
# Year used by the drill-down filter cells that follow; change it to inspect another year.
year = 2019

In [None]:
# Show the rows of the selected year that belong to the state organization
# and have `extra` equal to 0.
is_state_row = (
    expenses.organization_name.eq("الدولة")
    & expenses.year.eq(year)
    & expenses.extra.eq(0)
)
expenses[is_state_row]

In [5]:
# Compute the budget gap table, then display only the rows with a non-zero
# gap whose `double` column differs from 2, indexed by year and organization
# with the remaining columns in alphabetical order.
# NOTE(review): the meaning of `double` is defined in utils.budget_gap — confirm there.
gap = utils.budget_gap(expenses)
has_gap = gap["gap"].ne(0)
not_double = gap["double"].ne(2)
(
    gap.loc[has_gap & not_double]
    .set_index(["year", "organization_name"])
    .sort_index(axis=1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,budget_type_name,double,extra,gap,parent_name_typed,value_agg,value_typed
year,organization_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015,وزارة التنمية و التعاون الدولي,التمويل العمومي,1.928,0,-386.006,نفقات التنمية,802.012,416.006
2016,وزارة الداخلية,التمويل العمومي,3.383,0,-270.0,نفقات التنمية,383.3,113.3
2019,مجلس نواب الشعب,نفقات التصرف,1.038,0,1.13,,29.793,30.923


In [None]:
# Drill down into one of the rows flagged by the gap table: the entries of the
# selected year for this organization under this parent budget type.
org_matches = expenses.organization_name.str.strip().eq("مجلس نواب الشعب")
parent_matches = expenses.parent_name.str.strip().eq("نفقات التصرف")
expenses.loc[org_matches & expenses.year.eq(year) & parent_matches]

## Compensation

In [None]:
# Column labels for the compensation sheet: two descriptive columns followed
# by one column per year. The years are reversed because the sheet is
# right-to-left.
comp_cols = ["ministry", "budget_type", *reversed(range(2015, 2020))]

# Read the compensation sheet, fill the ministry label down over the rows
# where it is blank, and drop the rows whose ministry is the total line.
comp = (
    pd.read_excel(
        "data/compensation.xlsx",
        sheet_name="الدعم ",
        skiprows=1,
        nrows=20,
        usecols=range(7),
        names=comp_cols,
    )
    # `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    .pipe(lambda df: df.assign(ministry=df.ministry.ffill()))
    .pipe(lambda df: df.loc[df.ministry.str.strip() != "الجملة"])
)

In [None]:
# Distinct names to screen for near-duplicates with the fuzzy-matching loop.
# NOTE(review): `rt` (and `bt` in the commented alternative) is not defined in
# any visible cell — presumably a table loaded elsewhere; confirm, otherwise
# this cell fails on a fresh kernel.
names = rt.name.unique()
#names = bt.name.unique()

In [None]:
# Print every pair of names whose token-sort similarity is strictly between
# 90 and 100 (near-duplicates that are not identical to the matcher).
for left, right in combinations(names, 2):
    score = fuzz.token_sort_ratio(left, right)
    if score > 90 and score < 100:
        print((left, right))

In [6]:
# Revenue table from the loaded datasets, used as-is by the cells below.
revenues = data["revenues"]

In [7]:
# `summed`: for every (year, parent) pair, the total value of its children,
# relabelled so the parent's name sits in a `name` column.
# Only `value` is aggregated, and with the pandas `.sum()` method rather than
# the Python builtin: aggregating every column would also "sum" the text
# columns on recent pandas versions and leave a second `name` column after
# the rename.
summed = (
    revenues.groupby(["year", "parent_name"], as_index=False)["value"]
    .sum()
    .rename(columns={"parent_name": "name"})
)

# `typed`: the value recorded directly on each row whose name is used as a
# parent by some other row.
typed = revenues.loc[
    revenues["name"].isin(revenues["parent_name"]), ["name", "year", "value"]
]

In [8]:
# Pair each parent's recorded value with the sum of its children for the same
# year and compute the difference, rounded to 3 decimals.
merged_revenues = summed.merge(
    typed, on=["name", "year"], suffixes=("_summed", "_typed")
)
revenues_gap = merged_revenues.assign(
    gap=np.round(merged_revenues.value_typed - merged_revenues.value_summed, 3)
)

In [9]:
# Children recorded in 2015 under the given parent category.
# The label is matched as a literal substring (regex=False), and rows with a
# missing parent_name are treated as non-matching (na=False) so the mask stays
# strictly boolean.
revenues.loc[
    revenues.parent_name.str.contains(
        "مداخيل غير اعتيادية اخرى", regex=False, na=False
    )
    & (revenues.year == 2015)
]

Unnamed: 0,name,parent_name,resource_type_description,type,value,year
422,مداخيل التخصيص,مداخيل غير اعتيادية اخرى,مداخيل التخصيص,95,0.0,2015


In [10]:
# Display the parents whose recorded value differs from the sum of their
# children, ordered by year, name and recorded value, indexed by year.
nonzero_gaps = revenues_gap.loc[revenues_gap.gap != 0]
nonzero_gaps.sort_values(["year", "name", "value_typed"]).set_index("year")

Unnamed: 0_level_0,name,value_summed,value_typed,gap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015,مداخيل أملاك الدولة الاعتيادية,301.5,501.5,200.0
2015,مداخيل غير اعتيادية اخرى,0.0,211.0,211.0
2016,الضريبة على دخل الأشخاص الطبيعين و الضريبة على...,1094.0,1103.0,9.0
2016,الضريبة على دخل الأشخاص الطبيعيين و الضريبة عل...,1195.0,1530.0,335.0
2016,المداخيل الجبائية الاعتيادية,11209.2,19987.2,8778.0
2016,مداخيل غير اعتيادية اخرى,0.0,350.0,350.0
2017,مداخيل غير اعتيادية اخرى,200.0,450.0,250.0
2018,المداخيل الجبائية الاعتيادية,14462.2,22847.2,8385.0
2018,المداخيل غير الجبائية الاعتيادية,946.0,1756.0,810.0
2018,مداخيل غير اعتيادية اخرى,500.0,865.0,365.0


## Generate Open data files

In [None]:
# Write one CSV and one JSON file per (year, organization) pair into data/open/.
# A single groupby replaces the nested loops that re-filtered the whole frame
# for every pair. It also skips rows whose organization name is missing;
# those would otherwise reach `.strip()` as NaN and raise.
for (year, organization), res_df in expenses.groupby(
    ["year", "organization_name"], sort=False
):
    # File stem shared by both formats: organization name with spaces
    # replaced by underscores, followed by the year.
    stem = "data/open/{}_{}".format(organization.strip().replace(" ", "_"), year)
    res_df.to_csv(stem + ".csv", index=False)
    res_df.to_json(stem + ".json", index=False, orient="table")

## Pseudo names generation
For each one of these:

* Duplicate them to match the number of their possible parents
* Add a suffix with their parent name for each entry
* Use them in budget_by_type

In [None]:
# NOTE: this rebinds `expenses` to the raw table, with only empty-string
# parent names turned into NaN; the cleaning applied earlier in the notebook
# is not repeated here.
expenses = data["expenses"].assign(
    parent_name=lambda df: df.parent_name.replace("", np.nan)
)

# Distinct budget type names among the rows that have no parent.
no_parent = expenses.parent_name.isnull()
expenses.loc[no_parent, "budget_type_name"].unique()