# 6. Reading Tabular Data into DataFrames

* Import the Pandas library.
* Use Pandas to load a simple CSV data set.
* Get some basic information about a Pandas DataFrame.

In [8]:
# Imports
import pandas as pd

In [9]:
# Load the Oceania GDP table; no index_col is given, so rows get default integer labels
data = pd.read_csv(filepath_or_buffer='./data/gapminder_gdp_oceania.csv')

In [10]:
# Display the DataFrame: the bare expression on the last line is rendered as the cell's output
data

Unnamed: 0,country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
0,Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744
1,New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911


## Using `index_col` to make a column's values the row headings

In [11]:
# Reload the same file, this time using the 'country' column as the row labels
data = pd.read_csv(
    filepath_or_buffer='./data/gapminder_gdp_oceania.csv',
    index_col='country',
)

In [12]:
# Display the re-read DataFrame: rows are now labelled by country rather than 0, 1
data

Unnamed: 0_level_0,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744
New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911


## Using `DataFrame.info` to get info about a dataframe

In [13]:
# Summarise the frame: index entries, per-column non-null counts and dtypes, memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, Australia to New Zealand
Data columns (total 12 columns):
gdpPercap_1952    2 non-null float64
gdpPercap_1957    2 non-null float64
gdpPercap_1962    2 non-null float64
gdpPercap_1967    2 non-null float64
gdpPercap_1972    2 non-null float64
gdpPercap_1977    2 non-null float64
gdpPercap_1982    2 non-null float64
gdpPercap_1987    2 non-null float64
gdpPercap_1992    2 non-null float64
gdpPercap_1997    2 non-null float64
gdpPercap_2002    2 non-null float64
gdpPercap_2007    2 non-null float64
dtypes: float64(12)
memory usage: 208.0+ bytes


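This provides us with the following info:
* datatype: the object is a pandas `DataFrame`
* rows: 2 rows, labelled `Australia` and `New Zealand`
* columns: 12 columns, each holding 2 non-null 64-bit floats (`float64`)
* memory: at least 208 bytes (the `+` in `208.0+ bytes` means the true footprint may be larger, because by default pandas does not measure object-typed data, such as the string row labels, in depth)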

## Using `DataFrame.columns` to get info about a dataframe's columns

In [14]:
# Column labels as a pandas Index (an attribute, so no parentheses)
data.columns

Index(['gdpPercap_1952', 'gdpPercap_1957', 'gdpPercap_1962', 'gdpPercap_1967',
       'gdpPercap_1972', 'gdpPercap_1977', 'gdpPercap_1982', 'gdpPercap_1987',
       'gdpPercap_1992', 'gdpPercap_1997', 'gdpPercap_2002', 'gdpPercap_2007'],
      dtype='object')

## Using `DataFrame.T` to transpose a dataframe

In [15]:
# Transpose the DataFrame: countries become columns and the gdpPercap_* labels become rows.
# `.T` is an attribute (no parentheses); this cell only displays the result, `data` is not reassigned.
data.T

country,Australia,New Zealand
gdpPercap_1952,10039.59564,10556.57566
gdpPercap_1957,10949.64959,12247.39532
gdpPercap_1962,12217.22686,13175.678
gdpPercap_1967,14526.12465,14463.91893
gdpPercap_1972,16788.62948,16046.03728
gdpPercap_1977,18334.19751,16233.7177
gdpPercap_1982,19477.00928,17632.4104
gdpPercap_1987,21888.88903,19007.19129
gdpPercap_1992,23424.76683,18363.32494
gdpPercap_1997,26997.93657,21050.41377


## Using `DataFrame.describe` to get summary statistics

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,10298.08565,11598.522455,12696.45243,14495.02179,16417.33338,17283.957605,18554.70984,20448.04016,20894.045885,24024.17517,26938.77804,29810.188275
std,365.560078,917.644806,677.727301,43.986086,525.09198,1485.263517,1304.328377,2037.668013,3578.979883,4205.533703,5301.85368,6540.991104
min,10039.59564,10949.64959,12217.22686,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911
25%,10168.840645,11274.086022,12456.839645,14479.47036,16231.68533,16758.837652,18093.56012,19727.615725,19628.685413,22537.29447,25064.289695,27497.598692
50%,10298.08565,11598.522455,12696.45243,14495.02179,16417.33338,17283.957605,18554.70984,20448.04016,20894.045885,24024.17517,26938.77804,29810.188275
75%,10427.330655,11922.958888,12936.065215,14510.57322,16602.98143,17809.077557,19015.85956,21168.464595,22159.406358,25511.05587,28813.266385,32122.777857
max,10556.57566,12247.39532,13175.678,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744


In [17]:
# Same summary statistics, transposed so each year column becomes a row (easier to read for a wide frame)
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gdpPercap_1952,2.0,10298.08565,365.560078,10039.59564,10168.840645,10298.08565,10427.330655,10556.57566
gdpPercap_1957,2.0,11598.522455,917.644806,10949.64959,11274.086022,11598.522455,11922.958888,12247.39532
gdpPercap_1962,2.0,12696.45243,677.727301,12217.22686,12456.839645,12696.45243,12936.065215,13175.678
gdpPercap_1967,2.0,14495.02179,43.986086,14463.91893,14479.47036,14495.02179,14510.57322,14526.12465
gdpPercap_1972,2.0,16417.33338,525.09198,16046.03728,16231.68533,16417.33338,16602.98143,16788.62948
gdpPercap_1977,2.0,17283.957605,1485.263517,16233.7177,16758.837652,17283.957605,17809.077557,18334.19751
gdpPercap_1982,2.0,18554.70984,1304.328377,17632.4104,18093.56012,18554.70984,19015.85956,19477.00928
gdpPercap_1987,2.0,20448.04016,2037.668013,19007.19129,19727.615725,20448.04016,21168.464595,21888.88903
gdpPercap_1992,2.0,20894.045885,3578.979883,18363.32494,19628.685413,20894.045885,22159.406358,23424.76683
gdpPercap_1997,2.0,24024.17517,4205.533703,21050.41377,22537.29447,24024.17517,25511.05587,26997.93657


## Working with other data

In [19]:
# Load the Americas GDP table, labelling rows by the 'country' column
americas = pd.read_csv(
    filepath_or_buffer='./data/gapminder_gdp_americas.csv',
    index_col='country',
)

In [20]:
# Show the documentation for DataFrame.head (the bound method is passed to help(), not called)
help(americas.head)

Help on method head in module pandas.core.generic:

head(n=5) method of pandas.core.frame.DataFrame instance
    Return the first `n` rows.
    
    This function returns the first `n` rows for the object based
    on position. It is useful for quickly testing if your object
    has the right type of data in it.
    
    Parameters
    ----------
    n : int, default 5
        Number of rows to select.
    
    Returns
    -------
    obj_head : type of caller
        The first `n` rows of the caller object.
    
    See Also
    --------
    pandas.DataFrame.tail: Returns the last `n` rows.
    
    Examples
    --------
    >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
    ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
    >>> df
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey
    5     parrot
    6      shark
    7      whale
    8      zebra
    
    Viewing the first 5 lines
   

In [24]:
# First three countries, transposed so each country is a column and the wide set of fields runs down the page
americas.head(n=3).T

country,Argentina,Bolivia,Brazil
continent,Americas,Americas,Americas
gdpPercap_1952,5911.32,2677.33,2108.94
gdpPercap_1957,6856.86,2127.69,2487.37
gdpPercap_1962,7133.17,2180.97,3336.59
gdpPercap_1967,8052.95,2586.89,3429.86
gdpPercap_1972,9443.04,2980.33,4985.71
gdpPercap_1977,10079,3548.1,6660.12
gdpPercap_1982,8997.9,3156.51,7030.84
gdpPercap_1987,9139.67,2753.69,7807.1
gdpPercap_1992,9308.42,2961.7,6950.28


In [22]:
# Show the documentation for DataFrame.tail (the bound method is passed to help(), not called)
help(americas.tail)

Help on method tail in module pandas.core.generic:

tail(n=5) method of pandas.core.frame.DataFrame instance
    Return the last `n` rows.
    
    This function returns last `n` rows from the object based on
    position. It is useful for quickly verifying data, for example,
    after sorting or appending rows.
    
    Parameters
    ----------
    n : int, default 5
        Number of rows to select.
    
    Returns
    -------
    type of caller
        The last `n` rows of the caller object.
    
    See Also
    --------
    pandas.DataFrame.head : The first `n` rows of the caller object.
    
    Examples
    --------
    >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',
    ...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
    >>> df
          animal
    0  alligator
    1        bee
    2     falcon
    3       lion
    4     monkey
    5     parrot
    6      shark
    7      whale
    8      zebra
    
    Viewing the last 5 lines
 

In [25]:
# Last three countries, transposed so each country is a column and the wide set of fields runs down the page
americas.tail(n=3).T

country,United States,Uruguay,Venezuela
continent,Americas,Americas,Americas
gdpPercap_1952,13990.5,5716.77,7689.8
gdpPercap_1957,14847.1,6150.77,9802.47
gdpPercap_1962,16173.1,5603.36,8422.97
gdpPercap_1967,19530.4,5444.62,9541.47
gdpPercap_1972,21806,5703.41,10505.3
gdpPercap_1977,24072.6,6504.34,13144
gdpPercap_1982,25009.6,6920.22,11152.4
gdpPercap_1987,29884.4,7452.4,9883.58
gdpPercap_1992,32003.9,8137,10733.9


In [26]:
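# NOTE(review): left commented out; presumably ../field_data/microbes.csv is not available
# alongside this notebook. Confirm, then either provide the file or remove this cell.
# data_microbes = pd.read_csv('../field_data/microbes.csv')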

## Writing out data

In [27]:
# Write the country-indexed Oceania frame to CSV; by default the index is written as the first column
data.to_csv(path_or_buf='./data/processed.csv')