# 1 Pandas DataFrame Basics

Pandas is an open-source Python library for data analysis. It offers a spreadsheet-like data structure called a ```DataFrame``` and a separate structure for a single column of a ```DataFrame``` called a ```Series```. A ```DataFrame``` can be thought of as a dictionary (```dict```) of ```Series``` objects.

In [None]:
import pandas as pd # notice the alias, by convention pandas is aliased with pd

In [None]:
# by default the read_csv function reads a comma-separated file
# our gapminder data are separated by tabs,
# so we pass the sep parameter and indicate a tab with \t

df = pd.read_csv('gapminder.tsv', sep='\t')

# we use the head method so Python shows us only the first 5 rows

print(df.head())

In [None]:
# without print(), the notebook renders the value of the cell's last
# expression itself
df.head()

In [None]:
# check what kind of object read_csv gave us
print(type(df))

In [None]:
# read a parquet file into a DataFrame
# pyarrow is needed for this one.
# NOTE(review): with engine='auto' pandas tries pyarrow first and may fall
# back to fastparquet if that is installed instead — confirm which one is
# available in your environment
parquetDf = pd.read_parquet('userdata.parquet', engine='auto')
parquetDf.head()

In [None]:
# read an XML file into a DataFrame
xmlDf = pd.read_xml('nutrition.xml')
# show up to the first 100 rows instead of the default 5
xmlDf.head(100)

In [None]:
# read a JSON file into a DataFrame
jsonDf = pd.read_json('sampleusers.json')
jsonDf.head()

In [None]:
# read a second JSON file the same way
# NOTE(review): the file name suggests nested records; presumably the nested
# parts end up as dict/list values inside single columns — confirm by
# inspecting the output
nestedDf = pd.read_json('samplenested.json')
nestedDf.head()

In [None]:
# back to the gapminder data to do some exploratory data analysis (EDA)
df.head()

In [None]:
# get the number of rows and columns
print(df.shape) # shape is an attribute (no parentheses) holding a tuple

In [None]:
# get the column names
print(df.columns) # columns is also an attribute of the DataFrame, not a method

In [None]:
# A DataFrame row can hold values of different data types, but within a
# column every value shares the same dtype.

# get the dtype of each column

print(df.dtypes)

In [None]:
# in pandas, a column with dtype `object` usually holds strings

In [None]:
# get even more info about our data set
# info() is a method, not an attribute: it prints its summary itself and
# returns None, so wrapping it in print() would add a stray "None" line
df.info()

## Looking at columns, rows and cells

In [None]:
# just get the country column and save it to its own variable
# (a single column of a DataFrame is a Series, despite the _df suffix)
country_df = df['country']

In [None]:
# show the first rows of the country column
print(country_df.head())

In [None]:
# and the last rows of the same column with .tail()
print(country_df.tail())

In [None]:
# to select multiple columns, pass a list of column names
# (hence the double square brackets)
subset = df[['country', 'continent', 'year']]
print(subset.head())

In [None]:
# select the first row using the loc attribute: loc looks rows up by
# their index label, and the first row here has label 0
print(df.loc[0])

In [None]:
# get the 100th row (labels start at 0, so its label is 99)
print(df.loc[99])

In [None]:
# try to get the last row
# NOTE: this is expected to fail — loc looks up the label -1, and no row
# has that label, so pandas raises a KeyError
print(df.loc[-1])

In [None]:
# -1 is not one of the row labels, so loc cannot find it;
# derive the label of the last row from the number of rows instead
number_of_rows = len(df)
last_row_index = number_of_rows - 1
print(df.loc[last_row_index])

In [None]:
# alternatively we can use the tail() method and ask for just one row
print(df.tail(n=1))

In [None]:
# loc with a single label and tail() hand back different kinds of objects;
# keep one result of each so their types can be compared
subset_tail = df.tail(n=1)
subset_loc = df.loc[0]
print(type(subset_loc))

In [None]:
# and the type of the tail() result, for comparison
print(type(subset_tail))

In [None]:
# subsetting multiple rows
# select the first, 100th, and 1000th rows
# note the double square brackets: a list of labels goes inside loc[],
# similar to the syntax used to subset multiple columns

print(df.loc[[0, 99, 999]])

In [None]:
# we can also use the iloc attribute, which selects by integer position
# (starting at 0), while loc selects by index label
# get the 2nd row
print(df.iloc[1])

In [None]:
# build a small DataFrame whose row labels are letters rather than
# integers, so that label-based and position-based lookups differ
dftst = pd.DataFrame(
    {
        'team': ['A'] * 4 + ['B'] * 4,
        'points': [5, 7, 7, 9, 12, 9, 9, 4],
        'assists': [11, 8, 10, 6, 6, 5, 9, 12],
    },
    index=list('ABCDEFGH'),
)
# loc looks a row up by its label
dftst.loc['A']

In [None]:
# iloc looks a row up by position: position 0 is the row labelled 'A'
dftst.iloc[0]

In [None]:
# get the 100th row, this time by position with iloc
print(df.iloc[99])

In [None]:
# using -1 to get the last row: iloc works with positions, so negative
# values count from the end
print(df.iloc[-1])

In [None]:
# get the first, 100th, and 1000th rows by position
print(df.iloc[[0, 99, 999]])

In [None]:
# we can also select columns with loc and iloc
# The basic syntax is loc[[rows], [columns]]
# subset columns with loc
# note the position of the colon:
# it stands in the rows slot and selects all rows

subset = df.loc[:, ['year', 'pop']]
print(subset.head())

In [None]:
# subset columns with iloc
# iloc will allow us to use integer positions
# -1 will select the last column
subset = df.iloc[:, [2, 4, -1]]
print(subset.head())

In [None]:
# subset columns with loc
# but pass in integer values
# this will cause an error: loc matches column labels, and the
# integers 2, 4 and -1 are not column names in this DataFrame
subset = df.loc[:, [2, 4, -1]]
print(subset.head())

In [None]:
# subset columns with iloc
# but pass in column names
# this will cause an error: iloc only accepts integer positions,
# not labels
subset = df.iloc[:, ['year', 'pop']]
print(subset.head())

In [None]:
# build the list of integers 0, 1, 2, 3, 4
# (range starts at 0 and stops just before 5)
small_range = list(range(0, 5))
print(small_range)

In [None]:
# subset the dataframe with the range: the list of integers is used
# as column positions
subset = df.iloc[:, small_range]
print(subset.head())

In [None]:
# build the list of integers 3, 4, 5
# (the stop value of range is exclusive, hence 5 + 1)
small_range = list(range(3, 5 + 1))
print(small_range)

In [None]:
# use the current small_range as column positions
subset = df.iloc[:, small_range]
print(subset.head())

In [None]:
# create a range from 0 to 5 inclusive, taking every other integer
# (the third argument of range is the step), i.e. 0, 2, 4
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]
print(subset.head())

In [None]:
# the range() function and the slicing operator : do basically the same
# first with a list built from range()
small_range = list(range(3))
subset = df.iloc[:, small_range]
print(subset.head())
# then slice the first 3 columns directly
subset = df.iloc[:, :3]
print(subset.head())

In [None]:
# columns at positions 3 to 5 via a list built from range()
small_range = list(range(3, 6))
subset = df.iloc[:, small_range]
print(subset.head())
# slice columns 3 to 5 inclusive (the stop value 6 is excluded)
subset = df.iloc[:, 3:6]
print(subset.head())

In [None]:
# every other column via a list built from range() with a step of 2
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]
print(subset.head())

# the same with a slice: start 0, stop 6 (excluded), step 2,
# i.e. the columns at positions 0, 2, and 4
subset = df.iloc[:, 0:6:2]
print(subset.head())

In [None]:
# subsetting rows and columns at the same time
# using loc: row label first, then column name
print(df.loc[42, 'country'])

In [None]:
# using iloc: row position first, then column position
print(df.iloc[42, 0])

In [None]:
# get the 1st, 100th, and 1000th rows
# from the 1st, 4th, and 6th columns
# (positions are 0-based, hence 0/99/999 and 0/3/5)
# the columns we are hoping to get are
# country, lifeExp, and gdpPercap
print(df.iloc[[0, 99, 999], [0, 3, 5]])

## 1.4 Grouped and Aggregated Calculations

In [None]:
# look at the first 10 rows before grouping
print(df.head(n=10))

In [None]:
# Grouped Means

# For each year in our data, what was the average life expectancy?
# To answer this question,
# we need to split our data into parts by year;
# then we get the 'lifeExp' column and calculate the mean of each part
print(df.groupby('year')['lifeExp'].mean())

In [None]:
# let's unpack that statement step by step
# step 1: groupby on its own returns a grouped object, not a result yet
grouped_year_df = df.groupby('year')
print(type(grouped_year_df))

In [None]:
# step 2: we can subset the grouped dataframe to a single column
grouped_year_df_lifeExp = grouped_year_df['lifeExp']
print(type(grouped_year_df_lifeExp))

In [None]:
# step 3: calling mean() on the grouped column computes one mean per year
mean_lifeExp_by_year = grouped_year_df_lifeExp.mean()
print(mean_lifeExp_by_year)

In [None]:
# Without grouping, the mean of the column is a single value computed
# over all rows...
print(df['lifeExp'].mean())

In [None]:
# stratified analysis: group on several columns at once and average
# several value columns within each (year, continent) group
multi_group_var = (
    df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']]
    .mean()
)
print(multi_group_var)

In [None]:
# if you actually need a flat table, you can use the
# reset_index() method.
flat = multi_group_var.reset_index()
print(flat.head(15))

In [None]:
# use the nunique (number unique) method
# to calculate the number of unique values in a series
# Let's calculate the number of unique countries per continent in the data set
print(df.groupby('continent')['country'].nunique())

In [None]:
# average life expectancy per year, kept in a variable for later use
global_yearly_life_expectancy = (
    df.groupby('year')['lifeExp']
    .mean()
)
print(global_yearly_life_expectancy)

In [None]:
# plot the yearly means
# NOTE(review): plotting through pandas presumably needs matplotlib to be
# installed — confirm in your environment
global_yearly_life_expectancy.plot()