In [None]:
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from region_merge import merge_region
from data_cleanup import return_rows_where_all_corruption_data_is_available

In [None]:
# Location of the QoG standard time-series CSV, relative to the notebook.
data_dir = 'data'
qog_dataset_filename = 'qog_std_ts_jan22.csv'

# Read the CSV in one pass (low_memory=False avoids chunked dtype inference)
# and hand it straight to merge_region.
# NOTE(review): merge_region presumably adds the 'region' / 'sub-region'
# columns listed later in the notebook - confirm in region_merge.
df = merge_region(
    pd.read_csv(join(data_dir, qog_dataset_filename), low_memory=False)
)

# Subset produced by the project helper; see data_cleanup for its exact rules.
df_reduced = return_rows_where_all_corruption_data_is_available(df)


In [None]:
# Print the per-column summary for every column (verbose), including non-null
# counts and the deep (object-aware) memory footprint.
df.info(verbose=True, memory_usage='deep', show_counts=True)
# Last expression of the cell: descriptive statistics rendered as the cell output.
df.describe()

In [None]:
# Corruption indicator columns analysed throughout the notebook.
corruption_col = [
    'bci_bci', 'ti_cpi', 'vdem_corr', 'vdem_execorr',
    'vdem_jucorrdc', 'vdem_pubcorr', 'wbgi_cce', 'ti_cpi_om',
]
# Presumably the identifier / metadata columns of the dataset; not used in this cell.
identication_col = [
    'ccode', 'ccode_qog', 'ccodealp', 'ccodealp_year', 'ccodecow', 'cname',
    'cname_qog', 'cname_year', 'version', 'year', 'region', 'sub-region',
]

# Pairwise correlation between the corruption indicators.
corruption_corr = df[corruption_col].corr()

# Boolean mask that is True on the upper triangle (diagonal included), so the
# heatmap only draws each pair once.
mask = np.triu(np.ones_like(corruption_corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw onto the axes created above (the same axes seaborn would pick by default).
sns.heatmap(
    corruption_corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    annot=True,
    ax=ax,
)


In [None]:
# Seaborn pair plot over the corruption indicator columns (one panel per column pair).
sns.pairplot(df[corruption_col])



In [None]:
# missingno bar chart of the corruption indicator columns, to compare how complete each one is.
msno.bar(df[corruption_col])

CPI has a new and an old methodology; therefore, none of their values exist at the same time.

In [None]:
corruption_col = [
    'bci_bci', 'ti_cpi', 'vdem_corr', 'vdem_execorr',
    'vdem_jucorrdc', 'vdem_pubcorr', 'wbgi_cce', 'ti_cpi_om',
]

# Keep every row that carries at least one corruption indicator; rows in which
# all of them are missing are discarded.
# (Original author's note: roughly 15k rows shrink to 11k.)
df_any_corruption_info_available = df.dropna(subset=corruption_col, how="all")

# drop every row where not every corruption data is available
# CPI dofferent methodology must first be merged
def test(df):
    corruption_col = ['bci_bci', 'ti_cpi', 'vdem_corr', 'vdem_execorr', 'vdem_jucorrdc', 'vdem_pubcorr', 'wbgi_cce']

    df_cpi_combined = df.copy()
    df_cpi_combined['ti_cpi']=df['ti_cpi'].combine_first(df['ti_cpi_om'])
    df_all_corruption_info_available = df_cpi_combined.dropna(subset=corruption_col, axis=0, how='any')
    return df_all_corruption_info_available

# Rows for which every corruption indicator is available (CPI columns merged).
df_all_corruption_info_available = test(df)

# Show the full frame next to the fully-populated subset.
# (Original author's note: 11k to 1.7k rows.)
display(df, df_all_corruption_info_available)

corruption_col_with_year = [
    'year',
    'bci_bci', 'ti_cpi', 'vdem_corr', 'vdem_execorr',
    'vdem_jucorrdc', 'vdem_pubcorr', 'wbgi_cce',
]

# One nullity matrix per frame, rows ordered by year so gaps line up in time.
for frame in (
    df,
    df_any_corruption_info_available,
    df_all_corruption_info_available,
):
    msno.matrix(frame[corruption_col_with_year].sort_values(by='year'))

# Nullity correlation between the corruption indicators.
msno.heatmap(df_any_corruption_info_available[corruption_col])


In [None]:
# Descriptive statistics restricted to the corruption indicator columns.
df[corruption_col].describe()


In [None]:

def plot_not_nan_threshold(df, plotname):
    """Plot how many columns exceed each NaN-percentage cutoff from 0 to 99.

    For every integer cutoff ``c`` in ``range(100)`` the number of columns
    whose share of NaN values is strictly greater than ``c`` percent is
    counted. The curve is drawn twice on a two-row figure: the full range on
    top, and a view limited to x in [85, 100] and y in [0, 1000] below.

    Args:
        df: DataFrame whose columns are inspected for missing values.
        plotname: Text used as the figure title.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    n_rows = df.shape[0]
    nan_share_percent = (df.isna().sum().values / n_rows) * 100
    columns_above_cutoff = [
        (nan_share_percent > cutoff).sum() for cutoff in range(100)
    ]

    fig, (overview_ax, zoom_ax) = plt.subplots(2)
    fig.suptitle(f"{plotname}")

    # Top panel: the whole 0-99 range.
    overview_ax.plot(columns_above_cutoff)
    overview_ax.grid()
    overview_ax.set_ylabel('number of columns')

    # Bottom panel: same curve with fixed axis limits.
    # NOTE(review): the x axis is the NaN-percentage cutoff even though the
    # label reads 'not nan threshold'.
    zoom_ax.plot(columns_above_cutoff)
    zoom_ax.set_xlim([85, 100])
    zoom_ax.set_ylim([0, 1000])
    zoom_ax.grid()
    zoom_ax.set_xlabel('not nan threshold')

# Draw the threshold curves for both row subsets, "all" first, then "any".
for frame, title in (
    (df_all_corruption_info_available, "Non nan threshold for all corruption data available"),
    (df_any_corruption_info_available, "Non nan threshold for any corruption data available"),
):
    plot_not_nan_threshold(frame, title)


In [None]:
def clean_data_for_corruption(df, target_column, threshold, corruption_columns=None):
    """Return ``df`` with every corruption column removed except ``target_column``.

    Args:
        df: DataFrame that contains the corruption columns.
        target_column: Name of the corruption column to keep.
        threshold: Accepted for interface compatibility; this function does
            not apply it yet (TODO: decide how it should filter columns).
        corruption_columns: Optional list of corruption column names. When
            omitted, the notebook-level ``corruption_col`` list is used.

    Returns:
        A new DataFrame without the other corruption columns, or ``None``
        (after printing a message) when ``target_column`` is not one of the
        corruption columns.

    Raises:
        KeyError: If one of the corruption columns to drop is not in ``df``.
    """
    if corruption_columns is None:
        corruption_columns = corruption_col

    if target_column not in corruption_columns:
        print('Target column not in corruption columns')
        return None

    # Compare against the variable, not the literal string 'target_column',
    # so the target itself survives.
    columns_to_drop = [col for col in corruption_columns if col != target_column]
    # `columns=` is required: a bare `labels=` drops rows (axis 0).
    return df.drop(columns=columns_to_drop)

#
display(df_all_corruption_info_available)
# Remove the corruption indicators so only the remaining columns are filtered.
df_without_corruption = df_all_corruption_info_available.drop(columns=corruption_col)

# Minimum number of non-NaN rows a column needs in order to be kept, expressed
# as a percentage of the row count. Computed with integer arithmetic (rounded
# up): the float form (n / 100) * 100 can land slightly above n - for example
# (7 / 100) * 100 == 7.000000000000001 - which no column can satisfy, so every
# column would be dropped.
non_nan_percent = 100
treshhold = (df_without_corruption.shape[0] * non_nan_percent + 99) // 100
df_columns_reduced = df_without_corruption.dropna(axis='columns', thresh=treshhold)
display(df_columns_reduced)

In [None]:
def drop_values(df, min_percentage_non_nan, exclude_columns):
    """Drop the columns of ``df`` that are too sparse.

    A column is dropped when its share of non-NaN values is below
    ``min_percentage_non_nan``. Columns listed in ``exclude_columns`` are never
    evaluated and are always kept. With ``min_percentage_non_nan=100`` every
    column containing at least one NaN is dropped.

    Args:
        df: DataFrame to filter.
        min_percentage_non_nan: Required percentage (0-100) of non-NaN values
            per column.
        exclude_columns: Column names exempt from the check.

    Returns:
        A new DataFrame without the columns that fall below the threshold.
    """
    candidates = df.drop(columns=exclude_columns)
    n_rows = len(candidates)
    non_nan_counts = candidates.notna().sum()

    # Compare counts rather than float percentages so boundary values
    # (for example exactly 99 %) are not lost to rounding.
    too_sparse = non_nan_counts * 100 < min_percentage_non_nan * n_rows
    columns_to_drop = too_sparse.index[too_sparse].tolist()

    return df.drop(columns=columns_to_drop)

# Use the frames and the column list that this notebook actually defines; the
# previous names (df_cpi_combined_all_corruption_indices,
# df_any_corruption_index_available, corruption_columns) are not assigned
# anywhere in the notebook and raise NameError on a fresh kernel.
df_reduced = drop_values(df_all_corruption_info_available, 99, corruption_col)
print(df_reduced.columns.values.tolist())
df_reduced = drop_values(df_any_corruption_info_available, 99, corruption_col)
print(df_reduced.columns.values.tolist())
