In [14]:
import io
import pandas as pd

In [15]:
# Load the combined DataFrame from its pickle; the path is relative to the
# notebook's working directory (presumably written by an earlier step — confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that this project produced itself.
df_combined = pd.read_pickle('../data/df_combined.pkl')

In [16]:
# Collect every column whose name contains 'country', leaving the
# 'countrycode' key column out of the merge.
country_columns = [
    name
    for name in df_combined.columns
    if name != 'countrycode' and 'country' in name
]

# Merge only when at least one matching column was found.
if len(country_columns) > 0:
    # Back-filling along each row moves the first non-null value into the
    # left-most selected column, which then becomes the single 'country' column.
    df_combined['country'] = (
        df_combined[country_columns]
        .bfill(axis=1)
        .iloc[:, 0]
    )

    # Remove the source columns but keep 'countrycode' and the merged 'country'.
    df_combined = df_combined.drop(
        columns=[name for name in country_columns if name != 'country']
    )

# Preview the result.
print("\nFinal DataFrame with Separate 'countrycode' and 'country' Columns:")
print(df_combined.head())



Final DataFrame with Separate 'countrycode' and 'country' Columns:
  countrycode currency_unit_2  year        rgdpna currency_unit_3  \
0         AGO          Kwanza  1970  54237.054688          Kwanza   
1         AGO          Kwanza  1971  57491.277344          Kwanza   
2         AGO          Kwanza  1972  57606.261719          Kwanza   
3         AGO          Kwanza  1973  62272.367188          Kwanza   
4         AGO          Kwanza  1974  64202.808594          Kwanza   

           rnna currency_unit_4       emp currency_unit_5        hc  \
0  295517.62500          Kwanza  3.666207          Kwanza  1.015686   
1  314195.09375          Kwanza  3.742484          Kwanza  1.018196   
2  332435.84375          Kwanza  3.853271          Kwanza  1.020712   
3  352647.90625          Kwanza  3.987807          Kwanza  1.023234   
4  373267.71875          Kwanza  4.130696          Kwanza  1.025762   

  currency_unit_6       pop currency_unit_7     labsh country  
0          Kwanza  5.89036

In [17]:
# Combine all 'currency_unit*' columns into a single 'currency_unit' column.
currency_columns = [col for col in df_combined.columns if 'currency_unit' in col]

# Guard against an empty selection: `.iloc[:, 0]` on a frame with no columns
# raises IndexError (e.g. when the columns have already been removed).
if currency_columns:
    # Row-wise back-fill pulls the first non-null value into the left-most
    # selected column; that column becomes the merged 'currency_unit'.
    df_combined['currency_unit'] = df_combined[currency_columns].bfill(axis=1).iloc[:, 0]

    # Drop only the source columns. The merged 'currency_unit' must be excluded:
    # if a column with that exact name is among the matches (always the case when
    # this cell is re-run), dropping every match would delete the merged result.
    df_combined = df_combined.drop(
        columns=[col for col in currency_columns if col != 'currency_unit']
    )

# Display final DataFrame with a single 'currency_unit' column.
print("\nCombined and Cleaned DataFrame with Single 'currency_unit' Column:")
print(df_combined.head())



Combined and Cleaned DataFrame with Single 'currency_unit' Column:
  countrycode  year        rgdpna          rnna       emp        hc       pop  \
0         AGO  1970  54237.054688  295517.62500  3.666207  1.015686  5.890365   
1         AGO  1971  57491.277344  314195.09375  3.742484  1.018196  6.040777   
2         AGO  1972  57606.261719  332435.84375  3.853271  1.020712  6.248552   
3         AGO  1973  62272.367188  352647.90625  3.987807  1.023234  6.496962   
4         AGO  1974  64202.808594  373267.71875  4.130696  1.025762  6.761380   

      labsh country currency_unit  
0  0.284385  Angola        Kwanza  
1  0.284385  Angola        Kwanza  
2  0.284385  Angola        Kwanza  
3  0.284385  Angola        Kwanza  
4  0.284385  Angola        Kwanza  


In [18]:
# Put the two descriptive columns first, then every remaining column in its
# existing relative order.
columns_order = ['country', 'currency_unit']
columns_order += [
    name
    for name in df_combined.columns
    if name not in ('country', 'currency_unit')
]
df_combined = df_combined[columns_order]

# Preview the reordered frame.
print("\nReordered DataFrame:")
print(df_combined.head())


Reordered DataFrame:
  country currency_unit countrycode  year        rgdpna          rnna  \
0  Angola        Kwanza         AGO  1970  54237.054688  295517.62500   
1  Angola        Kwanza         AGO  1971  57491.277344  314195.09375   
2  Angola        Kwanza         AGO  1972  57606.261719  332435.84375   
3  Angola        Kwanza         AGO  1973  62272.367188  352647.90625   
4  Angola        Kwanza         AGO  1974  64202.808594  373267.71875   

        emp        hc       pop     labsh  
0  3.666207  1.015686  5.890365  0.284385  
1  3.742484  1.018196  6.040777  0.284385  
2  3.853271  1.020712  6.248552  0.284385  
3  3.987807  1.023234  6.496962  0.284385  
4  4.130696  1.025762  6.761380  0.284385  


In [19]:
# Discard the 'pop' and 'labsh' columns; the later cells do not reference them.
df_combined = df_combined.drop(['pop', 'labsh'], axis=1)

In [20]:
# Give the remaining measure columns readable names, then remove the
# 'countrycode' column so that 'country' is the only country identifier left.
df_combined = (
    df_combined
    .rename(
        columns={
            'rgdpna': 'Real GDP',
            'rnna': 'Capital',
            'emp': 'Labour',
            'hc': 'Human Capital',
        }
    )
    .drop(columns='countrycode')
)

In [21]:
# Bare last expression: the notebook renders the frame as this cell's output
# (long frames are shown truncated, with the middle rows elided).
df_combined

Unnamed: 0,country,currency_unit,year,Real GDP,Capital,Labour,Human Capital
0,Angola,Kwanza,1970,54237.054688,295517.625000,3.666207,1.015686
1,Angola,Kwanza,1971,57491.277344,314195.093750,3.742484,1.018196
2,Angola,Kwanza,1972,57606.261719,332435.843750,3.853271,1.020712
3,Angola,Kwanza,1973,62272.367188,352647.906250,3.987807,1.023234
4,Angola,Kwanza,1974,64202.808594,373267.718750,4.130696,1.025762
...,...,...,...,...,...,...,...
6359,Zimbabwe,US Dollar,2015,42008.199219,64916.476562,6.393752,2.584653
6360,Zimbabwe,US Dollar,2016,42325.726562,66257.859375,6.504374,2.616257
6361,Zimbabwe,US Dollar,2017,44316.742188,67627.562500,6.611773,2.648248
6362,Zimbabwe,US Dollar,2018,46457.097656,69059.625000,6.714952,2.680630


In [22]:
# Take an independent (deep) copy under the name `df_cleaned` and persist it
# as a pickle alongside the input data.
df_cleaned = df_combined.copy(deep=True)
df_cleaned.to_pickle('../data/df_cleaned.pkl')
