In [None]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

# Exercise 1
### Preparation for the exercise


In [None]:
# Toy table: one row per (patient, phylum) pair together with its value.
data = pd.DataFrame({
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
    'patient': [1] * 4 + [2] * 4,
    'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'] * 2,
})
data

### Exercise

Try out these commands to see what they return:

- `data.head()` returns the **first n rows** of the DataFrame (5 by default)
- `data.tail(3)` returns the **last n rows** of the DataFrame (5 by default; here n = 3)
- `data.shape` returns the **(#rows, #columns)** of the DataFrame, i.e. the **(cardinality, degree)**

In [None]:
# First rows of the table (head() shows 5 rows when no argument is given)
data.head()

In [None]:
# Last three rows of the table
data.tail(3)

In [None]:
# Dimensions of the table as a (rows, columns) tuple
data.shape

# Exercise 2
### Preparation for the exercise

In [None]:
# Rebuild the patient/phylum table from one record per row.
data = pd.DataFrame(
    [
        (1, 'Firmicutes', 632),
        (1, 'Proteobacteria', 1638),
        (1, 'Actinobacteria', 569),
        (1, 'Bacteroidetes', 115),
        (2, 'Firmicutes', 433),
        (2, 'Proteobacteria', 1130),
        (2, 'Actinobacteria', 754),
        (2, 'Bacteroidetes', 555),
    ],
    columns=['patient', 'phylum', 'value'],
)

### Exercise

From the `data` table above, create an index to return all rows for which the phylum name ends in "bacteria" and the value is greater than 1000.

In [None]:
# Write your answer here
# Print the whole table first, for reference.
print(data)

# A comparison on a column is evaluated for every row at once and yields a
# boolean Series, so no explicit loop over the rows is needed.
print(data['value'] > 1000)
print(data['phylum'].str.endswith('bacteria'))

# Combine both masks element-wise and keep the rows where both are True.
data[data['phylum'].str.endswith('bacteria') & (data['value'] > 1000)]

# Exercise 3
### Preparation for the exercise

In [None]:
# Load the baseball table, using its 'id' column as the row index
baseball = pd.read_csv("Data/baseball.csv", index_col='id')
# Preview the first rows to see which columns are available
baseball.head()

### Exercise

You can use the `isin` method to query a DataFrame based upon a list of values as follows:

    data['phylum'].isin(['Firmicutes', 'Bacteroidetes'])

Use `isin` to find all players that played for the Los Angeles Dodgers (LAN) or the San Francisco Giants (SFN). How many records contain these values?

In [None]:
# Write your answer here
# Keep only the rows whose team code is LAN (Dodgers) or SFN (Giants).
LAN_and_SFD = baseball[baseball['team'].isin(['LAN', 'SFN'])]
# The exercise asks for the number of matching records, i.e. the number of
# rows; a row count is not necessarily a count of distinct players.
print("There are", len(LAN_and_SFD), "records for players who played for LAN or SFN")
LAN_and_SFD

# Exercise 4

### Exercise

Calculate **on base percentage** for each player, and return the ordered series of estimates.

$$OBP = \frac{H + BB + HBP}{AB + BB + HBP + SF}$$



In [None]:
# Look at the available columns before computing the on-base percentage
baseball.head()

In [None]:
# Write your answer here
def calculate_OBP(x):
    """Return the on-base percentage (OBP) of a single record.

    OBP = (H + BB + HBP) / (AB + BB + HBP + SF)

    Parameters
    ----------
    x : mapping-like
        Any object indexable by the keys 'h', 'bb', 'hbp', 'ab' and 'sf'
        (for example one row of a DataFrame).

    Returns
    -------
    float
        The on-base percentage, or NaN when the denominator is zero.
    """
    num = x['h'] + x['bb'] + x['hbp']
    denom = x['ab'] + x['bb'] + x['hbp'] + x['sf']
    if denom == 0:
        # Avoid a division by zero. Use np.nan: the np.NaN alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan
    return num / denom

# Evaluate the OBP once per row and store it as a new column.
baseball['OBP'] = baseball.apply(calculate_OBP, axis='columns')
# Display the records ordered from the highest to the lowest OBP.
baseball.sort_values(by='OBP', ascending=False)

# Exercise 4.2

### Preparation for the exercise

In [None]:
# 4x3 table holding the integers 0..11, with two-level labels on both axes.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]),
    columns=pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                                       ['Green', 'Red', 'Green']]),
)

frame

### Exercise

Try retrieving the value corresponding to `b2` in `Colorado`:


In [None]:
# Write your answer here
# Select the 'b' rows and 'Colorado' columns first, then pick row 2 of the
# 'Green' column by label. The second lookup returns a scalar, which avoids
# calling int() on a one-element Series (deprecated since pandas 2.1).
int(frame.loc['b', 'Colorado'].loc[2, 'Green'])

# Exercise 5

### Preparation for the exercise

In [None]:
# Six treatment values only: when assigned as a column they are aligned on the
# row index, so rows without a matching entry receive a missing value.
treatment = pd.Series([0, 0, 0, 0, 1, 1])
data['treatment'] = treatment

# A scalar is broadcast to every row of the table.
data['month'] = 'Jan'
data

### Exercise

Try using the axis argument to drop columns with missing values:

In [None]:
# Write your answer here
# axis=0 works on rows: every row containing at least one missing value is
# dropped. The result is displayed only; it is not assigned back to `data`.
data.dropna(axis=0)



In [None]:
# Write your answer here
# axis=1 works on columns: every column containing at least one missing value
# is dropped. The result is displayed only; it is not assigned back to `data`.
data.dropna(axis=1) # or data.dropna(axis='columns')

# Advanced Exercise

The Data/ebola folder contains summarized reports of Ebola cases from three countries during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

From these data files, use pandas to import them and create a single data frame that includes the daily totals of new cases and deaths for each country.

## For this exercise, we first process the data separately for each of the three countries

## We first start with Guinea

In [None]:
# Quick check of the extraction logic on a single Guinea report
file = 'Data/ebola/guinea_data/2014-08-04.csv'

# Read the data; missing entries are replaced by 0 so they can be cast to int
data = pd.read_csv(file, parse_dates=True, dayfirst=True).fillna(0)
# Take the date from the first row
date = data['Date'].iloc[0]
print(date)
# Index the rows by the 'Description' column and drop the 'Date' column
data = data.set_index('Description').drop(columns='Date')

# The deaths row is labelled differently depending on the report
if 'New deaths registered today' in data.index:
    death_label = 'New deaths registered today'
else:
    death_label = 'New deaths registered'

# A scalar .loc lookup returns the value itself; calling int() on a
# one-element Series is deprecated since pandas 2.1
death = int(data['Totals'].loc[death_label])
case = int(data['Totals'].loc['Total new cases registered so far'])

print(death)
print(case)

### Code to deal with all the files in the guinea_data folder

In [None]:
import os

# Folder containing the Guinea report files
basepath = 'Data/ebola/guinea_data/'

# Empty dic (not used below; kept so the cell still defines the same names)
guinea = {}
# Parallel lists with one entry per report: date, new cases and new deaths
date = []
new_cases = []
new_deaths = []

# os.listdir() returns the names in arbitrary order, so they are sorted to
# make the row order (and the .head() preview) reproducible
for fname in sorted(os.listdir(basepath)):
    # Skip any entry whose name contains 'lock' (presumably editor lock files)
    if 'lock' in fname:
        continue
    path = os.path.join(basepath, fname)
    # Load the data; missing entries are replaced by 0
    data = pd.read_csv(path, parse_dates=True, dayfirst=True).fillna(0)
    # Get the date from the first row
    date.append(data['Date'].iloc[0])
    # Index the rows by the 'Description' column and drop the 'Date' column
    data = data.set_index('Description').drop(columns='Date')
    totals = data['Totals']
    # Get the new cases. A scalar .loc lookup returns the value itself;
    # int() on a one-element Series is deprecated since pandas 2.1
    new_cases.append(int(totals.loc['Total new cases registered so far']))
    # Get the new deaths; the row label differs between reports
    if 'New deaths registered today' in totals.index:
        new_deaths.append(int(totals.loc['New deaths registered today']))
    else:
        new_deaths.append(int(totals.loc['New deaths registered']))

# One row per report, two columns under the top-level label "Guinea"
mat = np.column_stack((new_cases, new_deaths))
guinea_frame = pd.DataFrame(mat, columns=[["Guinea", "Guinea"], ["New cases", "New deaths"]])
guinea_frame.index = date
guinea_frame.index.name = "Date"
guinea_frame.head()

## Now, we take care of the data from Sierra Leone


In [None]:
# Quick check of the extraction logic on a single Sierra Leone report
file = 'Data/ebola/sl_data/2014-08-23-v88.csv'

# Load the report, replacing missing entries by 0
data = pd.read_csv(file, parse_dates=True, dayfirst=True).fillna(0)
# Keep the date found in the first row before the column is removed
date = data['date'].iloc[0]
print(date)
# Index the rows by the 'variable' column and drop the 'date' column
data = data.set_index('variable').drop(columns='date')

# NOTE(review): 'new_noncase' is summed together with the suspected, probable
# and confirmed rows; confirm that it really belongs in the new-case total.
case = data['National'].loc[
    ['new_noncase', 'new_suspected', 'new_probable', 'new_confirmed']
]
death = data['National'].loc[
    ['death_suspected', 'death_probable', 'death_confirmed']
]

print(case.astype(int).sum())
print(death.astype(int).sum())

### Code to deal with all the files in the sl_data folder

In [None]:
# Folder containing the Sierra Leone report files
basepath = 'Data/ebola/sl_data/'

# Empty dic (not used below; kept so the cell still defines the same names)
sierra_leone = {}
# Parallel lists with one entry per report: date, new cases and new deaths
date = []
new_cases = []
new_deaths = []

# os.listdir() returns the names in arbitrary order, so they are sorted to
# make the row order (and the .head() preview) reproducible
for fname in sorted(os.listdir(basepath)):
    # Skip any entry whose name contains 'lock' (presumably editor lock files)
    if "lock" in fname:
        continue
    path = os.path.join(basepath, fname)
    # Load the data; missing entries are replaced by 0
    data = pd.read_csv(path, parse_dates=True, dayfirst=True).fillna(0)
    # Get the date from the first row
    date.append(data['date'].iloc[0])
    # Index the rows by the 'variable' column and drop the 'date' column
    data = data.set_index('variable').drop(columns='date')
    national = data['National']
    # Get the new cases: sum of the selected national rows
    # NOTE(review): 'new_noncase' is part of this sum; confirm that it really
    # belongs in the new-case total.
    new_cases.append(
        national.loc[['new_noncase', 'new_suspected',
                      'new_probable', 'new_confirmed']].astype(int).sum()
    )
    # Get the new deaths: sum of the suspected, probable and confirmed rows
    new_deaths.append(
        national.loc[['death_suspected', 'death_probable',
                      'death_confirmed']].astype(int).sum()
    )

# One row per report, two columns under the top-level label "Sierra Leone"
mat = np.column_stack((new_cases, new_deaths))
sierra_leone_frame = pd.DataFrame(mat, columns=[["Sierra Leone", "Sierra Leone"], ["New cases", "New deaths"]])
sierra_leone_frame.index = date
sierra_leone_frame.index.name = "Date"
sierra_leone_frame.head()

## Now, we take care of the data from Liberia


In [None]:
# Quick check of the extraction logic on a single Liberia report
file = 'Data/ebola/liberia_data/2014-06-16.csv'

# Read the data; missing entries are replaced by 0
data = pd.read_csv(file, parse_dates=True, dayfirst=True).fillna(0)
# Take the date: the parsed timestamp prints with a trailing ' 00:00:00',
# which is stripped so that only the date part remains
date = str(pd.to_datetime(data['Date'].iloc[0])).replace(' 00:00:00', '')
print(date)
# Index the rows by the 'Variable' column and drop the 'Date' column
data = data.set_index('Variable').drop(columns='Date')

case = data['National'].loc[
    ['New Case/s (Suspected)', 'New Case/s (Probable)', 'New case/s (confirmed)']
]
death = data['National'].loc[['Newly reported deaths']]

print(case.astype(int).sum())
# `death` is a one-element Series: extract its single value explicitly, since
# calling int() directly on a Series is deprecated since pandas 2.1
print(int(death.item()))

### Code to deal with all the files in the liberia_data folder

In [None]:
# Folder containing the Liberia report files
basepath = 'Data/ebola/liberia_data/'

# Empty dic (not used below; kept so the cell still defines the same names)
liberia = {}
# Parallel lists with one entry per report: date, new cases and new deaths
date = []
new_cases = []
new_deaths = []

# os.listdir() returns the names in arbitrary order, so they are sorted to
# make the row order (and the .head() preview) reproducible
for fname in sorted(os.listdir(basepath)):
    # Skip any entry whose name contains 'lock' (presumably editor lock files)
    if "lock" in fname:
        continue
    path = os.path.join(basepath, fname)
    # Load the data; missing entries are replaced by 0
    data = pd.read_csv(path, parse_dates=True, dayfirst=True).fillna(0)
    # Get the date.
    # The default format is different from Guinea and Sierra Leone, so the
    # value is parsed and printed back in the usual format; the trailing
    # ' 00:00:00' produced by that conversion is stripped.
    date.append(str(pd.to_datetime(data['Date'].iloc[0])).replace(' 00:00:00', ''))
    # Index the rows by the 'Variable' column and drop the 'Date' column
    data = data.set_index('Variable').drop(columns='Date')
    national = data['National']
    # Get the new cases: sum of the suspected, probable and confirmed rows
    new_cases.append(
        national.loc[['New Case/s (Suspected)',
                      'New Case/s (Probable)',
                      'New case/s (confirmed)']].astype(int).sum()
    )
    # Get the new deaths. A scalar .loc lookup returns the value itself;
    # int() on a one-element Series is deprecated since pandas 2.1
    new_deaths.append(int(national.loc['Newly reported deaths']))

# One row per report, two columns under the top-level label "Liberia"
mat = np.column_stack((new_cases, new_deaths))
liberia_frame = pd.DataFrame(mat, columns=[["Liberia", "Liberia"], ["New cases", "New deaths"]])
liberia_frame.index = date
liberia_frame.index.name = "Date"
liberia_frame.head()

### Now, we merge the 3 frames together and we fill the NA with 0

In [None]:
# Combine the three country frames with successive additions, passing
# fill_value=0 to each one.
ebola = (
    liberia_frame
    .add(guinea_frame, fill_value=0)
    .add(sierra_leone_frame, fill_value=0)
)
ebola