# Exercises on pandas Basics

## 1. Getting Started
We first import `pandas` and load a table into a DataFrame.

In [61]:
import pandas as pd

# index_col=0 turns the first CSV column (the country names) into the row index.
countries = pd.read_csv(
    'large_countries_2015.csv',
    index_col=0,
)

In [62]:
%matplotlib inline

## 2. Working with DataFrames
To view the contents of a DataFrame, type its name:

In [63]:
countries

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America
Nigeria,182202000.0,5.89,Africa
Pakistan,188924900.0,3.04,Asia
Philippines,100699400.0,2.98,Asia


**Columns --> Series**

**Index --> Row labels (like a special column, often without a name); they don't have to be unique**

## 3. Examining DataFrames
Match the Python commands with the descriptions below. 

*In Jupyter, you can move the descriptions up/down with the arrow buttons.*

#### Show the first 3 lines

In [64]:
countries.head(3)

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia


#### Show the last 3 lines

In [65]:
countries.tail(3)

Unnamed: 0,population,fertility,continent
Philippines,100699395.0,2.98,Asia
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


In [66]:
# countries = countries.sort_index(ascending=False)

#### Summarize all numerical columns

In [67]:
countries.describe()

Unnamed: 0,population,fertility
count,12.0,12.0
mean,375346200.0,2.4375
std,456519400.0,1.200781
min,100699400.0,1.45
25%,139347000.0,1.7375
50%,185563400.0,2.125
75%,273616300.0,2.5675
max,1376049000.0,5.89


#### Mean of a column

In [68]:
countries['population'].mean()

375346161.6666667

#### Summarize categorical data

In [69]:
countries['continent'].value_counts()

Asia             7
North America    2
Europe           1
Africa           1
South America    1
Name: continent, dtype: int64

#### Number of rows and columns

In [70]:
countries.shape

(12, 3)

#### Extract distinct values

In [71]:
countries['continent'].unique()

array(['Asia', 'South America', 'North America', 'Africa', 'Europe'],
      dtype=object)

In [72]:
# Floor division (//) drops the remainder: population in whole millions.
countries['population'] // 1_000_000

Bangladesh        160.0
Brazil            207.0
China            1376.0
India            1311.0
Indonesia         257.0
Japan             126.0
Mexico            127.0
Nigeria           182.0
Pakistan          188.0
Philippines       100.0
Russia            143.0
United States     321.0
Name: population, dtype: float64

#### Apply a calculation to each value in a column

## 4. Selecting rows and columns
Match the Python commands with the descriptions below. 

#### Display column labels

In [73]:
countries.columns

Index(['population', 'fertility', 'continent'], dtype='object')

#### Display row index

In [74]:
countries.index

Index(['Bangladesh', 'Brazil', 'China', 'India', 'Indonesia', 'Japan',
       'Mexico', 'Nigeria', 'Pakistan', 'Philippines', 'Russia',
       'United States'],
      dtype='object')

#### Select one column

In [80]:
countries['continent']
# Selecting a single column returns a Series

Bangladesh                Asia
Brazil           South America
China                     Asia
India                     Asia
Indonesia                 Asia
Japan                     Asia
Mexico           North America
Nigeria                 Africa
Pakistan                  Asia
Philippines               Asia
Russia                  Europe
United States    North America
Name: continent, dtype: object

#### Select multiple columns

In [85]:
countries[['population', 'continent']]

Unnamed: 0,population,continent
Bangladesh,160995600.0,Asia
Brazil,207847500.0,South America
China,1376049000.0,Asia
India,1311051000.0,Asia
Indonesia,257563800.0,Asia
Japan,126573500.0,Asia
Mexico,127017200.0,North America
Nigeria,182202000.0,Africa
Pakistan,188924900.0,Asia
Philippines,100699400.0,Asia


#### Select row by an index value

In [97]:
countries.loc['India']

population    1.31105e+09
fertility            2.43
continent            Asia
Name: India, dtype: object

In [91]:
# countries.set_index('continent').loc['Asia']

In [101]:
countries.iloc[3:7] # rows at positions 3 to 6; the slice end (7) is excluded

Unnamed: 0,population,fertility,continent
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
Japan,126573500.0,1.45,Asia
Mexico,127017200.0,2.13,North America


#### Select rows by slicing the index

In [103]:
countries.iloc[5,1]

1.45

In [120]:
# countries.loc['Japan', 'fertility']

In [112]:
# (countries['fertility'] >= 1.45) & (countries['fertility'] < 1.46)

#### Filter rows by a condition

In [119]:
# Boolean Series: True for every country with more than 200 million inhabitants
mask = countries['population'] > 200_000_000
# Indexing with the boolean mask keeps only the rows marked True
countries[mask]

Unnamed: 0,population,fertility,continent
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia
United States,321773600.0,1.97,North America


#### Extract raw data as a NumPy array

In [126]:
countries.values

array([[160995642.0, 2.12, 'Asia'],
       [207847528.0, 1.78, 'South America'],
       [1376048943.0, 1.57, 'Asia'],
       [1311050527.0, 2.43, 'Asia'],
       [257563815.0, 2.28, 'Asia'],
       [126573481.0, 1.45, 'Asia'],
       [127017224.0, 2.13, 'North America'],
       [182201962.0, 5.89, 'Africa'],
       [188924874.0, 3.04, 'Asia'],
       [100699395.0, 2.98, 'Asia'],
       [143456918.0, 1.61, 'Europe'],
       [321773631.0, 1.97, 'North America']], dtype=object)

## 5. Summarizing Data
Match the Python commands with the descriptions below. 

In [None]:
countries['fertility'].cumsum()

In [None]:
countries.groupby('continent')['population'].sum()

In [None]:
countries.sort_values(by=['continent', 'fertility'])

In [None]:
def get_initial(s):
    """Return the first element of ``s`` (the initial letter of a string).

    Args:
        s: A string or other indexable sequence, e.g. a continent name.

    Returns:
        ``s[0]``; if ``s`` is empty, ``s`` itself is returned unchanged so
        that one blank value does not abort a whole ``Series.apply`` call.
    """
    try:
        return s[0]
    except IndexError:
        # Empty input has no first character; hand it back as-is instead
        # of raising.
        return s

countries['initial'] = countries['continent'].apply(get_initial)
countries

In [None]:
countries.stack()

In [None]:
countries.transpose()

In [None]:
countries['fertility'].hist()

In [None]:
# style='ro' is a matplotlib format string: red ('r') circle markers ('o'), no connecting line.
countries.plot(x='population', y='fertility', style='ro')

#### Draw a scatterplot

#### Move columns to a new index level

#### Create a new column using a function

#### Draw a histogram

#### Cumulatively apply a sum over a column

#### Swap rows and columns

#### Calculate sum of one column grouped by a second one

#### Sort values

## License
(c) 2017 Kristian Rother
Distributed under the conditions of the MIT License.