# Pivoting / Reshaping - Exercises

# Preparations

In [1]:
import pandas as pd

# Allow up to 500 columns in rendered frames so wide tables are not truncated.
pd.options.display.max_columns = 500

# Exercise

1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)
2. Select only the columns *countrycode*, *year*, and *NY_GDP_MKTP_CD*.
3. Select only the years after 2010.
4. Reshape to wide format. (One row per *countrycode*)
5. Generate a new column "large_value_in_2011" indicating whether a country had a *NY_GDP_MKTP_CD* above the 2011-median.
6. Reshape to long format. (One row per *countrycode* and *year*).

# 1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)

In [2]:
# Relative path to the workbook; the data is read from the sheet named "wdi".
wdi_path = "../../data/raw/wdi_reduced.xlsx"
df = pd.read_excel(wdi_path, sheet_name="wdi")
df.head(3)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
0,ABW,Aruba,Latin America & Caribbean,2007,2623726000.0,-3.654626,101220.0
1,ABW,Aruba,Latin America & Caribbean,2011,2584464000.0,,102053.0
2,ABW,Aruba,Latin America & Caribbean,1992,,,68235.0


# 2. Select only the columns *countrycode*, *year*, and *NY_GDP_MKTP_CD*.

In [3]:
# Keep only the country key, the year and the GDP series.
keep_cols = ["countrycode", "year", "NY_GDP_MKTP_CD"]
df = df.loc[:, keep_cols]
df.head(3)

Unnamed: 0,countrycode,year,NY_GDP_MKTP_CD
0,ABW,2007,2623726000.0
1,ABW,2011,2584464000.0
2,ABW,1992,


# 3. Select only the years after 2010.

In [4]:
# Strictly after 2010, i.e. 2011 onwards.
is_after_2010 = df["year"].gt(2010)
df = df.loc[is_after_2010]
df.head(3)

Unnamed: 0,countrycode,year,NY_GDP_MKTP_CD
1,ABW,2011,2584464000.0
27,ABW,2012,
32,ABW,2016,


# 4. Reshape to wide format. (One row per *countrycode*)

In [5]:
# One row per country, one column per (value column, year) pair.
# pivot_table aggregates with the mean by default, so duplicate
# (countrycode, year) rows would be averaged rather than rejected.
df_wide = pd.pivot_table(df, index="countrycode", columns="year")
df_wide.head(3)

Unnamed: 0_level_0,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD,NY_GDP_MKTP_CD
year,2011,2012,2013,2014,2015,2016
countrycode,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ABW,2584464000.0,,,,,
AFG,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19702990000.0,19469020000.0
AGO,104115900000.0,115398400000.0,124912100000.0,126776900000.0,102962200000.0,89633160000.0


In [6]:
def simplify_df(df, flatten_columns=True, reset_index=True):
    """Return a copy of ``df`` with flat column names and a fresh index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to simplify. It is copied first and never modified.
    flatten_columns : bool, default True
        If the columns are a ``pd.MultiIndex``, collapse each column tuple
        into a single string: every level value is converted with ``str``,
        empty strings are dropped, and the remaining parts are joined
        with ``"_"``.
    reset_index : bool, default True
        If true, move the index into regular columns via ``reset_index()``.

    Returns
    -------
    pandas.DataFrame
        The simplified copy.
    """
    df = df.copy()

    if flatten_columns and isinstance(df.columns, pd.MultiIndex):
        # "_".join handles zero, one or many parts uniformly, so a tuple made
        # only of empty labels yields "" instead of raising IndexError.
        df.columns = [
            "_".join(part for part in map(str, labels) if part)
            for labels in df.columns.to_flat_index()
        ]
    if reset_index:
        df = df.reset_index()
    return df

In [7]:
# Flatten the (value, year) column MultiIndex and turn countrycode back into a column.
df_wide = df_wide.pipe(simplify_df)
df_wide

Unnamed: 0,countrycode,NY_GDP_MKTP_CD_2011,NY_GDP_MKTP_CD_2012,NY_GDP_MKTP_CD_2013,NY_GDP_MKTP_CD_2014,NY_GDP_MKTP_CD_2015,NY_GDP_MKTP_CD_2016
0,ABW,2.584464e+09,,,,,
1,AFG,1.793024e+10,2.053654e+10,2.004633e+10,2.005019e+10,1.970299e+10,1.946902e+10
2,AGO,1.041159e+11,1.153984e+11,1.249121e+11,1.267769e+11,1.029622e+11,8.963316e+10
3,ALB,1.289087e+10,1.231978e+10,1.278103e+10,1.321986e+10,1.139037e+10,1.192689e+10
4,AND,3.427023e+09,3.146152e+09,3.248925e+09,,,
...,...,...,...,...,...,...,...
199,XKX,6.649291e+09,6.473725e+09,7.072092e+09,7.386891e+09,6.440501e+09,6.649889e+09
200,YEM,3.272642e+10,3.540134e+10,4.041523e+10,4.322858e+10,3.773392e+10,2.731761e+10
201,ZAF,4.164189e+11,3.963278e+11,3.666239e+11,3.508506e+11,3.174066e+11,2.948406e+11
202,ZMB,2.346010e+10,2.550337e+10,2.804546e+10,2.715063e+10,2.115439e+10,1.955109e+10


# 5. Generate a new column "large_value_in_2011" indicating whether a country had a *NY_GDP_MKTP_CD* above the 2011-median.

In [8]:
# Flag countries whose 2011 GDP lies strictly above the 2011 median.
# A missing 2011 value compares as False, so it is flagged "not large".
gdp_2011 = df_wide["NY_GDP_MKTP_CD_2011"]
df_wide["large_value_in_2011"] = gdp_2011.gt(gdp_2011.median())
df_wide.head(3)

Unnamed: 0,countrycode,NY_GDP_MKTP_CD_2011,NY_GDP_MKTP_CD_2012,NY_GDP_MKTP_CD_2013,NY_GDP_MKTP_CD_2014,NY_GDP_MKTP_CD_2015,NY_GDP_MKTP_CD_2016,large_value_in_2011
0,ABW,2584464000.0,,,,,,False
1,AFG,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19702990000.0,19469020000.0,False
2,AGO,104115900000.0,115398400000.0,124912100000.0,126776900000.0,102962200000.0,89633160000.0,True


# 6. Reshape to long format. (One row per *countrycode* and *year*).

In [9]:
# Every column except the identifiers is stacked into (year, value) pairs;
# "year" still holds the full wide column label at this point.
id_columns = ["countrycode", "large_value_in_2011"]
df_long = pd.melt(df_wide, id_vars=id_columns, var_name="year")
df_long.head(3)

Unnamed: 0,countrycode,large_value_in_2011,year,value
0,ABW,False,NY_GDP_MKTP_CD_2011,2584464000.0
1,AFG,False,NY_GDP_MKTP_CD_2011,17930240000.0
2,AGO,True,NY_GDP_MKTP_CD_2011,104115900000.0


In [10]:
# Labels look like "NY_GDP_MKTP_CD_2011"; the last four characters are the year.
# Casting to str first keeps this cell re-runnable: once "year" is already
# an integer column, the bare .str accessor would raise AttributeError.
df_long["year"] = df_long["year"].astype(str).str[-4:].astype("int")
df_long.head(3)

Unnamed: 0,countrycode,large_value_in_2011,year,value
0,ABW,False,2011,2584464000.0
1,AFG,False,2011,17930240000.0
2,AGO,True,2011,104115900000.0
