In [97]:
import pandas as pd
import seaborn as sns

In [122]:
# Designate the file path, and various sets of features of interest.
filepath = "../data/elephant.csv"
features = ["country", "bin_year", "pop", "RRinc"]
features_country = ["country", "bin_year", "pop", "RRinc"]
features_year_quintile = ["bin_year", "quintile", "RRinc"]
features_year_region = ["bin_year", "region", "quintile", "RRinc"]

# Read the data and reset the index.
elephant = pd.read_csv(filepath, header=0)
df = elephant.reset_index(drop=True)
df = df[df.mysample == 1]

# Extract features and observations for 1988 and 2008
df = df[df["bin_year"].isin([1988,2008])][features]
print("\nRaw Data")
print(df.head(20))

# Group by year, remove superfluous year column name, sort by income, and create cumulative population by year.
dfg = df.groupby(by=["bin_year"]).apply(pd.DataFrame.sort_values, "RRinc")\
                                            .drop(columns=["bin_year"])
print("\nData Grouped by Year and sorted by income")
print(dfg.head(100))

# Add cumulative population by year
dfg["cumpop"] = dfg.groupby(by="bin_year")["pop"].cumsum()
print("\n1988 Data")
print(dfg.loc[1988].head(20))
print("\n2008 Data")
print(dfg.loc[2008].head(20))

# Reset the index, groupby year again, create population quintiles
dfg = dfg.reset_index()
dfg["quintile"] = dfg.groupby(by="bin_year")["cumpop"].transform(lambda x: pd.qcut(x, 20, labels=range(1,21)))
print("\nData with Quintiles")
print(dfg.head(20))

# Extract relevant features, group by year and quintile, and average the income
df1 = dfg[features_year_quintile].groupby(by=["bin_year", "quintile"]).mean()
print("\nAggregated Data")
print(df1)

# Pivot the data by year
df1 = df1.reset_index()
df1 = df1.pivot(index="quintile", columns="bin_year", values="RRinc")
df1 = df1.reset_index(drop=True)
df1["Growth"] = (df1.loc[:,2008] - df1.loc[:,1988]) / df1.loc[:,1988]* 100
print("\nFinal Data")
df1


Raw Data
      country  bin_year       pop    RRinc
40    Albania      2008  0.318140    736.0
41    Albania      2008  0.318140    962.0
42    Albania      2008  0.318140   1162.0
43    Albania      2008  0.318140   1352.0
44    Albania      2008  0.318140   1548.0
45    Albania      2008  0.318140   1759.0
46    Albania      2008  0.318140   2013.0
47    Albania      2008  0.318140   2355.0
48    Albania      2008  0.318140   2927.0
49    Albania      2008  0.318140   6048.0
50  Argentina      1988  3.172913   1107.0
51  Argentina      1988  3.172913   1842.0
52  Argentina      1988  3.172913   2565.0
53  Argentina      1988  3.172913   3307.0
54  Argentina      1988  3.172913   4094.0
55  Argentina      1988  3.172913   4996.0
56  Argentina      1988  3.172913   6116.0
57  Argentina      1988  3.172913   7645.0
58  Argentina      1988  3.172913  10280.0
59  Argentina      1988  3.172913  22020.0

Data Grouped by Year and sorted by income
                   country        pop  RRinc

KeyError: 'quintile'

In [73]:
# Explore the data

# Reset the index, extract features of interest, and add cumulative sum of population, sorted by income.
dfg1 = dfg.reset_index()
dfg1 = dfg1[features_country]
dfg1 = dfg1.
dfg1 = dfg1.groupby(by=["bin_year","quintile"])
df1.info()

# sns.lineplot(x="quintile", y="RRinc", hue="bin_year",data=dfp)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   bin_year  2020 non-null   int64   
 1   quintile  2020 non-null   category
 2   RRinc     2019 non-null   float64 
dtypes: category(1), float64(1), int64(1)
memory usage: 33.8 KB


In [None]:
# Group by year, sort by income, compute cumulative sum of and quintile population
dfg = df.groupby(by="bin_year").apply(pd.DataFrame.sort_values, "Income").drop(columns="bin_year")
dfg["popsum"] = dfg["pop"].cumsum()
dfg["quintile"] = pd.qcut(dfg["popsum"], 20, labels=range(20))

# Extract year, country, quintile, and Income
dfg.reset_index(inplace=True)
df = dfg[["bin_year","country", "quintile", "Income"]].groupby(by=["bin_year", "quintile"]).agg({"Income": "mean"}).reset_index()
df

# ## Pivot on quintile
# df = df.pivot(index="quintile", columns="bin_year", values="Income")
# df.head()
