# Getting started with Pandas - Exercises

## Preparations

In [1]:
import pandas as pd

# Let pandas render up to 500 columns before it starts truncating wide frames.
pd.options.display.max_columns = 500

# Exercise

*Preparations*
* Create a new Jupyter notebook.
* Store it under `notebooks/own_exercises`.

*Tasks*
1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`). *Hint*: you need to save the notebook in a suitable directory before you can use relative file paths! Also, if you use a different folder from the one specified in *preparations*, you may have to adjust the relative path, e.g. the number of times you use the "../" prefix.
2. What are the data types of the columns?
3. Display the descriptive statistics of the numeric columns.
4. Select the columns *countryname* and *year* from the DataFrame.
5. Select the rows from Germany and France.
6. Bonus: Select the rows from Germany and France in the years 2008 to 2010 (inclusive) and all columns starting with 'country' as well as *year*.
7. Bonus: Select only numeric columns.
8. Bonus: Select the numeric columns except those starting with "NY_".

# 1. Load the first sheet of the Excel file "wdi_reduced.xlsx" into a pandas DataFrame (see [here](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html) for help with `pandas.read_excel()`)

In [2]:
# Read the worksheet named "wdi" from the workbook via a relative path.
# NOTE(review): the task asks for the *first* sheet; presumably "wdi" is that
# sheet -- confirm against the workbook.
df = pd.read_excel(
    io="../../data/raw/wdi_reduced.xlsx",
    sheet_name="wdi",
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
0,ABW,Aruba,Latin America & Caribbean,2007,2623726000.0,-3.654626,101220.0
1,ABW,Aruba,Latin America & Caribbean,2011,2584464000.0,,102053.0
2,ABW,Aruba,Latin America & Caribbean,1992,,,68235.0
3,ABW,Aruba,Latin America & Caribbean,1989,,,61032.0
4,ABW,Aruba,Latin America & Caribbean,1975,,,60657.0


# 2. What are the data types of the columns?

In [3]:
# .info() lists every column with its non-null count and its dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12586 entries, 0 to 12585
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   countrycode        12586 non-null  object 
 1   countryname        12586 non-null  object 
 2   region             12586 non-null  object 
 3   year               12586 non-null  int64  
 4   NY_GDP_MKTP_CD     9217 non-null   float64
 5   NY_GDP_MKTP_KD_ZG  8854 non-null   float64
 6   SP_POP_TOTL        12263 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 688.4+ KB


# 3. Display the descriptive statistics of the numeric columns.

In [4]:
# Called without arguments, .describe() summarises only the numeric columns here.
df.describe()

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
count,12586.0,9217.0,8854.0,12263.0
mean,1988.5,158368100000.0,3.882721,23792960.0
std,16.741334,836635800000.0,6.388015,99857930.0
min,1960.0,8824448.0,-64.047107,4279.0
25%,1974.0,1243469000.0,1.416333,449123.5
50%,1988.5,6624068000.0,3.889729,4046901.0
75%,2003.0,44010160000.0,6.416794,12923220.0
max,2017.0,18569100000000.0,149.972963,1378665000.0


# 4. Select the columns *countryname* and *year* from the DataFrame.

In [5]:
# Passing a list of labels to the indexing operator returns those columns as a DataFrame.
df[["countryname", "year"]]

Unnamed: 0,countryname,year
0,Aruba,2007
1,Aruba,2011
2,Aruba,1992
3,Aruba,1989
4,Aruba,1975
...,...,...
12581,Zimbabwe,1962
12582,Zimbabwe,1993
12583,Zimbabwe,1988
12584,Zimbabwe,1986


# 5. Select the rows from Germany and France.

In [6]:
# Step one: build a boolean Series ("mask") that is True for every French or German row.
mask = df["countrycode"].isin(["FRA", "DEU"])

In [7]:
# The mask is a boolean pandas Series and therefore offers many useful methods.
# Summing it counts the True entries, i.e. the number of rows that match.
mask.sum()

np.int64(116)

In [8]:
# Step two: hand the mask to .loc to keep only the matching rows (all columns are retained).
df.loc[mask]

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
2900,DEU,Germany,Europe & Central Asia,2017,,,
2901,DEU,Germany,Europe & Central Asia,1997,2.218689e+12,1.849201,82034771.0
2902,DEU,Germany,Europe & Central Asia,1999,2.199957e+12,1.987135,82100243.0
2903,DEU,Germany,Europe & Central Asia,1986,1.042301e+12,2.287339,77720436.0
2904,DEU,Germany,Europe & Central Asia,1975,4.887802e+11,-0.866739,78673554.0
...,...,...,...,...,...,...,...
3765,FRA,France,Europe & Central Asia,1975,3.620009e+11,-0.980162,54252574.0
3766,FRA,France,Europe & Central Asia,1993,1.330095e+12,-0.612653,59106766.0
3767,FRA,France,Europe & Central Asia,1965,1.021606e+11,4.778035,50023774.0
3768,FRA,France,Europe & Central Asia,2009,2.693827e+12,-2.941341,64707044.0


In [9]:
# The condition can also be written inline, collapsing both steps into one expression.
df.loc[df["countrycode"].isin(["DEU", "FRA"])]

Unnamed: 0,countrycode,countryname,region,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
2900,DEU,Germany,Europe & Central Asia,2017,,,
2901,DEU,Germany,Europe & Central Asia,1997,2.218689e+12,1.849201,82034771.0
2902,DEU,Germany,Europe & Central Asia,1999,2.199957e+12,1.987135,82100243.0
2903,DEU,Germany,Europe & Central Asia,1986,1.042301e+12,2.287339,77720436.0
2904,DEU,Germany,Europe & Central Asia,1975,4.887802e+11,-0.866739,78673554.0
...,...,...,...,...,...,...,...
3765,FRA,France,Europe & Central Asia,1975,3.620009e+11,-0.980162,54252574.0
3766,FRA,France,Europe & Central Asia,1993,1.330095e+12,-0.612653,59106766.0
3767,FRA,France,Europe & Central Asia,1965,1.021606e+11,4.778035,50023774.0
3768,FRA,France,Europe & Central Asia,2009,2.693827e+12,-2.941341,64707044.0


# 6. Bonus: Select the rows from Germany and France in the years 2008 to 2010 and all columns starting with 'country' as well as *year*.

In [10]:
# Build the row selector from two named conditions and combine them afterwards;
# naming each part avoids having to get nested parentheses right.
is_deu_or_fra = df["countrycode"].isin(["DEU", "FRA"])
in_2008_to_2010 = df["year"].between(2008, 2010)  # both bounds are inclusive
row_mask = is_deu_or_fra & in_2008_to_2010
# Number of rows that satisfy both conditions.
row_mask.sum()

np.int64(6)

In [11]:
# Assemble the column selection in three visible steps.
# 1) Pick the column labels that begin with "country" -- the result is a pandas Index.
country_columns = df.columns[df.columns.str.startswith("country")]
print(country_columns)
# 2) Convert the Index to a plain Python list so that further labels can be added.
country_column_list = country_columns.to_list()
print(country_column_list)
# 3) Append "year" to obtain the final list of column labels.
column_mask = [*country_column_list, "year"]
print(column_mask)

Index(['countrycode', 'countryname'], dtype='object')
['countrycode', 'countryname']
['countrycode', 'countryname', 'year']


In [12]:
# Finally, apply both selectors at once: the boolean row mask picks the rows,
# the list of column labels picks the columns.
df.loc[row_mask, column_mask]

Unnamed: 0,countrycode,countryname,year
2913,DEU,Germany,2008
2932,DEU,Germany,2009
2937,DEU,Germany,2010
3715,FRA,France,2010
3726,FRA,France,2008
3768,FRA,France,2009


In [13]:
# The same selection as a single expression: the row condition and the column
# list go straight into .loc without intermediate variables.
# Note that "year" is listed first here, so it becomes the leading column.
df.loc[
    df["countrycode"].isin(["DEU", "FRA"]) & df["year"].between(2008, 2010),
    ["year", *df.columns[df.columns.str.startswith("country")]],
]

Unnamed: 0,year,countrycode,countryname
2913,2008,DEU,Germany
2932,2009,DEU,Germany
2937,2010,DEU,Germany
3715,2010,FRA,France
3726,2008,FRA,France
3768,2009,FRA,France


# 7. Bonus: Select only numeric columns.

In [14]:
# "number" selects every numeric dtype. The aliases "int" and "float" only match
# the default-sized integer and float types, so columns stored with another
# width (e.g. int32 or float32) would silently be left out.
df.select_dtypes(include="number")

Unnamed: 0,year,NY_GDP_MKTP_CD,NY_GDP_MKTP_KD_ZG,SP_POP_TOTL
0,2007,2.623726e+09,-3.654626,101220.0
1,2011,2.584464e+09,,102053.0
2,1992,,,68235.0
3,1989,,,61032.0
4,1975,,,60657.0
...,...,...,...,...
12581,1962,1.117602e+09,1.434471,3999419.0
12582,1993,6.563813e+09,1.051459,10905756.0
12583,1988,7.814784e+09,7.552375,9604302.0
12584,1986,6.217524e+09,2.099029,8976205.0


# 8. Bonus: Select the numeric columns except those starting with "NY_".

In [15]:
# Variant 1 (indexing operator): drop the columns whose label starts with "NY_",
# then keep the numeric ones. "number" covers all numeric dtypes, whereas
# "int"/"float" would only match the default-sized integer and float types.
df[df.columns[~df.columns.str.startswith("NY_")]].select_dtypes(include="number")

Unnamed: 0,year,SP_POP_TOTL
0,2007,101220.0
1,2011,102053.0
2,1992,68235.0
3,1989,61032.0
4,1975,60657.0
...,...,...
12581,1962,3999419.0
12582,1993,10905756.0
12583,1988,9604302.0
12584,1986,8976205.0


In [16]:
# Variant 2 (.loc): the boolean array over the column labels can be passed to
# .loc directly as the column selector. "number" keeps every numeric dtype,
# whereas "int"/"float" would only match the default-sized integer and float types.
df.loc[:, ~df.columns.str.startswith("NY_")].select_dtypes(include="number")

Unnamed: 0,year,SP_POP_TOTL
0,2007,101220.0
1,2011,102053.0
2,1992,68235.0
3,1989,61032.0
4,1975,60657.0
...,...,...
12581,1962,3999419.0
12582,1993,10905756.0
12583,1988,9604302.0
12584,1986,8976205.0
