In [None]:
import pandas as pd

# Exploratory Data Analysis with Pandas

Exploratory Data Analysis (EDA) is a crucial step in the data analysis process. The main purpose of EDA is to summarize the data into a few key statistical measures and visualizations that help us understand the underlying patterns in the data. During EDA, we are still in the discovery phase of the overall analysis process, which means we benefit from a tool that offers these three main features:

1. Accommodate rapid experimentation with data analysis
2. Give immediate results in the form of tables/numbers/visualizations
3. Integrate text-based explanations for future use

Notebooks are excellent tools for this, as we can run multiple "experimental" analyses on our data in different cells or sections, get immediate results, and also document the thought process behind each "experiment".

In this session, we learn how to use `Pandas`, a popular Python library for analyzing tabular data.

## Introduction to Pandas

In [None]:
# Load the Gapminder five-year CSV directly from its raw GitHub URL.
# pd.read_csv accepts a URL as well as a local path; this cell needs network access.

url = 'https://raw.githubusercontent.com/resbaz/r-novice-gapminder-files/master/data/gapminder-FiveYearData.csv'
df = pd.read_csv(url)

In [None]:
# Preview the first rows of the DataFrame (head() defaults to 5 rows).
df.head()

In [None]:
# Preview the first 10 rows by passing the row count explicitly.
df.head(10)

In [None]:
# Show the last rows of the DataFrame (tail() defaults to 5 rows).
df.tail()

In [None]:
# Show the last 7 rows.
df.tail(7)

In [None]:
# Print a concise summary: index range, column names, non-null counts, dtypes, memory usage.
df.info()

In [None]:
# Column labels of the DataFrame, returned as a pandas Index.
df.columns

In [None]:
# Number of rows in the DataFrame.
len(df)

In [None]:
# Number of distinct values in the continent column (NaN is excluded by default).
df['continent'].nunique()

In [None]:
# The distinct continent values themselves, in order of first appearance.
df['continent'].unique()

In [None]:
# Number of rows per continent, sorted from most to least frequent.
df['continent'].value_counts()

In [None]:
# Number of rows for each (continent, year) combination.
# DataFrame.value_counts() requires pandas >= 1.1.
df[['continent', 'year']].value_counts()

## Column Operations

In [None]:
# Select a single column by label; the result is a Series.
df['country']

In [None]:
# Same single-column selection, this time for the continent column.
df['continent']

In [None]:
# Boolean-mask filtering: keep only the rows whose continent equals 'Asia'.
df[df['continent'] == 'Asia']

In [None]:
# Combine two boolean masks with | (element-wise OR) to keep rows from either continent.
# Each comparison must be parenthesized because | binds more tightly than ==.
# An equivalent spelling is df[df['continent'].isin(['Asia', 'Europe'])].
df[(df['continent'] == 'Asia') | (df['continent'] == 'Europe')]

In [None]:
# Combine masks on different columns with & (element-wise AND): rows must satisfy both conditions.
# As with |, each comparison needs its own parentheses.
df[(df['continent'] == 'Europe') & (df['country'] == 'Germany')]

In [None]:
# Add a derived column: assume a global increase in life expectancy of 5 years.
# The scalar 5 is broadcast to every row; assigning to a new label adds the column to df.
df['lifeexp_Future'] = df['lifeExp'] + 5
df.head()

In [None]:
# Subtract a constant value of 500 from every gdpPercap value and store the result
# as a new column, leaving the original gdpPercap column untouched.
df['gdpPercap_inflation'] = df['gdpPercap'] - 500
df.head()

In [None]:
# Future population projection at 10 years, computed with a multiplier of 1.02 (+2%).
# NOTE(review): the earlier comment said "growth factor of 1.2", but the code multiplies
# by 1.02 — confirm which value is intended.
df['pop_10yrs'] = df['pop'] * 1.02
df.head()

In [None]:
# Convert population to millions by dividing every value by 1e6.
# The result is stored in a new column that later statistics cells rely on.
df['pop_million'] = df['pop'] / 1e6
df.head()

In [None]:
# Multiply two columns element-wise: gdpPercap * pop gives a total GDP figure per row,
# which is then divided by 1e9 to express it in billions.
df['gdp_billion'] = (df['gdpPercap'] * df['pop'])/1e9
df.head()

In [None]:
# drop() returns a new DataFrame without the named column. The result is only displayed
# here, not assigned, so df itself still contains 'lifeexp_Future' afterwards.
df.drop(columns=['lifeexp_Future'])

In [None]:
# Several columns can be dropped at once by passing a list of labels.
# Again the result is not assigned back, so df keeps both columns.
df.drop(columns=['pop_10yrs', 'gdpPercap_inflation'])

## Basic Statistics of Continuous Data

In [None]:
# Smallest value in the pop_million column.
df['pop_million'].min()

In [None]:
# Largest value in the pop_million column.
df['pop_million'].max()

In [None]:
# Arithmetic mean of pop_million across all rows.
df['pop_million'].mean()

In [None]:
# Median of pop_million; compare with the mean to gauge how skewed the column is.
df['pop_million'].median()

In [None]:
# Standard deviation of pop_million (pandas uses the sample formula, ddof=1, by default).
df['pop_million'].std()

In [None]:
# Column-wise mean of several columns at once; the result is a Series indexed by column name.
df[['pop_million', 'gdp_billion']].mean()

In [None]:
# agg() with a list of statistic names computes each one per column;
# the result is a DataFrame with one row per statistic.
df[['pop_million', 'gdp_billion']].agg(['mean', 'std'])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# This includes the derived columns added earlier, since the drop() results were never assigned.
df.describe()

In [None]:
# Restrict describe() to the columns of interest by selecting them first.
df[['pop_million', 'gdp_billion', 'lifeExp']].describe()

## Group-by Statistics

In [None]:
# Split the rows by continent, then take the mean of pop_million within each group.
df.groupby('continent')['pop_million'].mean()

In [None]:
# Same grouping, but reporting the minimum pop_million within each continent.
df.groupby('continent')['pop_million'].min()

In [None]:
# Selecting a list of columns after groupby yields a DataFrame: one row per continent,
# one column per selected measure.
df.groupby('continent')[['pop_million', 'gdp_billion']].mean()

In [None]:
# Group by two keys at once; the result is a Series with a (continent, year) MultiIndex.
df.groupby(['continent', 'year'])['pop_million'].mean()

In [None]:
# Two grouping keys and two measures: a DataFrame indexed by (continent, year)
# holding the per-group mean of each selected column.
df.groupby(['continent', 'year'])[['pop_million', 'gdp_billion']].mean()

In [None]:
# Apply several statistics per group with agg(); the resulting columns form a
# two-level MultiIndex of (measure, statistic).
df.groupby(['continent', 'year'])[['pop_million', 'gdp_billion']].agg(['mean', 'median', 'std'])