In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
sns.set_context('notebook')

# Table of Contents
 <p><div class="lev1"><a href="#Task-1.-Compiling-Ebola-Data"><span class="toc-item-num">Task 1.&nbsp;&nbsp;</span>Compiling Ebola Data</a></div>
 <div class="lev1"><a href="#Task-2.-RNA-Sequences"><span class="toc-item-num">Task 2.&nbsp;&nbsp;</span>RNA Sequences</a></div>
 <div class="lev1"><a href="#Task-3.-Class-War-in-Titanic"><span class="toc-item-num">Task 3.&nbsp;&nbsp;</span>Class War in Titanic</a></div></p>

In [2]:
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

In [3]:
DATA_FOLDER = './Data' 
EBOLA_FOLDER = DATA_FOLDER + '/ebola'

GUINEA_FOLDER = EBOLA_FOLDER + '/guinea_data'
LIBERIA_FOLDER = EBOLA_FOLDER + '/liberia_data'
SIERRA_LEONE_FOLDER = EBOLA_FOLDER + '/sl_data'

## Task 1. Compiling Ebola Data

The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single `Dataframe`.
Using this `DataFrame`, calculate for *each country*, the *daily average per month* of *new cases* and *deaths*.
Make sure you handle all the different expressions for *new cases* and *deaths* that are used in the reports.

### 1 - Cleaning Guinea

We want to identify the models. We will call model the different kind of ```Description``` values that are present in files. To be able to clean them as a whole. We will drop the columns of each region as we are only interested in national numbers. 

We use ```glob``` in order to get all UNIX style pathname pattern expansion to be able to import all files.

In [63]:
def find_model(distinct_model_variables,current_model):
    for i,d in enumerate(distinct_model_variables):
        if d == current_model:
            return i
    return -1

def print_distinct_model_repartition(n_distinct_models, belongs_to):
    for model_num in range(0,n_distinct_models):
        n_model = len([x for x in belongs_to if x == model_num])
        print("{} file(s) in model {}".format(n_model,model_num))

def get_distinct_model(path_to_folder,desc_col_name,date_col_name,total_col_name):
    country_df = pd.DataFrame()
    list_ = []
    distinct_model_variables = []
    belongs_to_model = []
    
    allFiles = glob.glob(path_to_folder + "/*.csv")
    cols_to_use = [desc_col_name,date_col_name,total_col_name]
    

    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0,usecols=cols_to_use,parse_dates=[date_col_name],thousands=',')
        list_.append(df)
    
    for df in list_:
        current_model = set(df[desc_col_name].values)
        index = find_model(distinct_model_variables,current_model)
        if(index == -1):
            distinct_model_variables.append(current_model)
            belongs_to_model.append(len(distinct_model_variables)-1)
        else:
            belongs_to_model.append(index)
    
    n_distinct_models = len(distinct_model_variables)
    print('We have found {} different model(s).'.format(n_distinct_models))
    print_distinct_model_repartition(n_distinct_models,belongs_to_model)
    
    return (list_,distinct_model_variables,belongs_to_model)

def get_grouped_dataframes(path_to_folder,desc_col_name,date_col_name,total_col_name):
    (dfs,distinct_model_variables,belongs_to_model) = get_distinct_model(path_to_folder,desc_col_name,date_col_name,total_col_name)
    
    n_distinct_models = len(distinct_model_variables)
    model_dataframes = []
    for i in range(0,n_distinct_models):
        list_ = []

        for j,m in enumerate(belongs_to_model):
            if(m == i):
                list_.append(dfs[j])
        
        # We also parse the date and get rid of the 'Date' column
        df = pd.concat(list_)
        df['Month'] = df['Date'].map(lambda x:x.month)
        df['Day'] = df['Date'].map(lambda x:x.day)
        df = df.drop('Date',axis=1)
        
        df.set_index(['Month','Day',desc_col_name],inplace=True)
        #df = df.dropna()
        
        if df.index.is_unique:
            df = df[[total_col_name]].unstack(desc_col_name)
            
            # To avoid hierarchical columns, we drop the first level
            df.columns = df.columns.droplevel(0)
            df.columns.name = ''

        model_dataframes.append(df)
    return model_dataframes

In [5]:
path = GUINEA_FOLDER
model_dataframes = get_grouped_dataframes(GUINEA_FOLDER,'Description','Date','Totals')
df_model_0 = model_dataframes[0]

We have found 2 different model(s).
1 file(s) in model 0
21 file(s) in model 1


We have found 2 different 'models' of description values. Therefore we select the columns of interest for each of them. To do so we list the columns and look at those that talk about cases and deaths.

In [7]:
# We check for deaths
cols = df_model_0.columns.values.tolist()
rel_cols = [c for c in cols if 'deaths' in c]
print(rel_cols)

['New deaths registered today', 'New deaths registered today (confirmed)', 'New deaths registered today (probables)', 'New deaths registered today (suspects)', 'Number of deaths of confirmed cases among health workers', 'Number of deaths of probables cases among health workers', 'Total deaths (confirmed + probables + suspects)', 'Total deaths of confirmed', 'Total deaths of probables', 'Total deaths of suspects', 'Total of deaths in confirmed cases in CTE']


In [8]:
df_model_0.loc[:,rel_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,New deaths registered today,New deaths registered today (confirmed),New deaths registered today (probables),New deaths registered today (suspects),Number of deaths of confirmed cases among health workers,Number of deaths of probables cases among health workers,Total deaths (confirmed + probables + suspects),Total deaths of confirmed,Total deaths of probables,Total deaths of suspects,Total of deaths in confirmed cases in CTE
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
8,4,2,2,0,0,12,8,363,228,133,2,138


We will only consider the column containing new, as the ```Total``` and ```Number`` columns seem to be cumulative. Seeing, that ```New deaths registered today = 2``` and that it equals the sum of :
* New deaths registered today (confirmed)
* New deaths registered today (probables) 
* New deaths registered today (suspects)

In [9]:
df_model_0['New Deaths'] = df_model_0[['New deaths registered today']]

In [11]:
# We check for deaths
cols = df_model_0.columns.values.tolist()
rel_cols = [c for c in cols if ('cases' in c and ('new' in c or 'New' in c))]
df_model_0.loc[:,rel_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,New cases of confirmed,New cases of probables,New cases of suspects,Total new cases registered so far
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,4,4,0,5,9


Again we only keep the headers that contain both ``new`` (with or without caps) and ```cases```. One can see that the sum of all new cases (```confirmed+probables+suspects```) is in ```Total new cases registered so far```

In [12]:
df_model_0['New Cases'] =  df_model_0['Total new cases registered so far']

In [13]:
df_model_0 = df_model_0[['New Deaths','New Cases']]
model_dataframes[0] = df_model_0
df_model_0.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,New Deaths,New Cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1
8,4,2,9


### 2 - Cleaning Liberia

In [64]:
path = LIBERIA_FOLDER
# by quickly opening the first few files we observed that columns that are going to be relevant for us
# are going to be Date, Variable and National columns. We assumed that the National columns was the correct number
# even though we saw some inconsistency between the different region sum and the total value supposed to be in the National Column
model_dataframes = get_grouped_dataframes(path,'Variable','Date','National')

We have found 6 different model(s).
55 file(s) in model 0
1 file(s) in model 1
11 file(s) in model 2
1 file(s) in model 3
8 file(s) in model 4
24 file(s) in model 5


Here again, we have found 5 different 'models' of description values. Therefore we select the columns of interest for each of them. To do so we list the columns and look at those that talk about cases and deaths.

#### model 0

In [56]:
df_model_0 = model_dataframes[0]

# We check for deaths
cols = df_model_0.columns.values.tolist()
rel_cols = [c for c in cols if 'deaths' in c]
print(rel_cols)

['Cumulative deaths among HCW', 'Newly Reported deaths in HCW', 'Newly reported deaths']


In [57]:
df_model_0.loc[:,rel_cols].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cumulative deaths among HCW,Newly Reported deaths in HCW,Newly reported deaths
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,16,1.0,0.0,2.0
6,17,1.0,0.0,0.0
6,22,1.0,0.0,4.0
6,24,2.0,0.0,4.0
6,25,3.0,1.0,3.0


Here we will only consider the column ```Newly reported deaths``` because we are interested in the total number of death for each day and not the one among HCW.

In [58]:
df_model_0['New Deaths'] = df_model_0[['Newly reported deaths']]

# We check for deaths
cols = df_model_0.columns.values.tolist()
print(cols);
rel_cols = [c for c in cols if ('cases' in c or 'case' in c)]
df_model_0.loc[:,rel_cols].head()

['Case Fatality Rate (CFR) - Confirmed & Probable Cases', 'Contacts lost to follow-up', 'Contacts seen', 'Contacts who completed 21 day follow-up', 'Cumulative admission/isolation', 'Cumulative cases among HCW', 'Cumulative deaths among HCW', 'Currently under follow-up', 'New Case/s (Probable)', 'New Case/s (Suspected)', 'New admissions', 'New case/s (confirmed)', 'Newly Reported Cases in HCW', 'Newly Reported deaths in HCW', 'Newly reported contacts', 'Newly reported deaths', 'Specimens collected', 'Specimens pending for testing', 'Total Number of Confirmed Cases of Guinean Nationality', 'Total Number of Confirmed Cases of Sierra Leonean Nationality', 'Total confirmed cases', 'Total contacts listed', 'Total death/s in confirmed cases', 'Total death/s in confirmed, probable, suspected cases', 'Total death/s in probable cases', 'Total death/s in suspected cases', 'Total discharges', 'Total no. currently in Treatment Units', 'Total probable cases', 'Total specimens tested', 'Total suspec

Unnamed: 0_level_0,Unnamed: 1_level_0,Cumulative cases among HCW,New case/s (confirmed),Total confirmed cases,Total death/s in confirmed cases,"Total death/s in confirmed, probable, suspected cases",Total death/s in probable cases,Total death/s in suspected cases,Total probable cases,Total suspected cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6,16,1.0,1.0,12.0,8.0,16.0,6.0,2.0,6.0,4.0
6,17,1.0,0.0,12.0,8.0,16.0,6.0,2.0,6.0,8.0
6,22,1.0,5.0,28.0,16.0,25.0,8.0,1.0,8.0,6.0
6,24,2.0,4.0,33.0,18.0,32.0,8.0,6.0,8.0,13.0
6,25,4.0,2.0,35.0,20.0,37.0,8.0,9.0,9.0,17.0


Here it seems that we should select the ```New case/s (confirmed)``` column as it seems to correspond to the daily new cases and is not cumulative.

In [59]:
# add the new cases to the model df
df_model_0['New Cases'] =  df_model_0['New case/s (confirmed)']
# return a df with only the New Death and New Cases
df_model_0 = df_model_0[['New Deaths','New Cases']]
df_model_0.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,New Deaths,New Cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1
6,16,2.0,1.0
6,17,0.0,0.0
6,22,4.0,5.0
6,24,4.0,4.0
6,25,3.0,2.0


#### model 1

In [130]:
df_model_1 = model_dataframes[1]
df_model_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,National
Month,Day,Variable,Unnamed: 3_level_1
10,4,New Case/s (Suspected),23.0
10,4,New Case/s (Probable),14.0
10,4,New case/s (confirmed),5.0
10,4,Total suspected cases,1190.0
10,4,Total probable cases,1796.0


In [131]:
print('df has unique index :', df_model_1.index.is_unique)
# here we observe that the model can't be unstack because the index is not unique 
#(several time the same Variable in the file)

# compute a list of the index appearing multiple times:
index_list = df_model_1.index.get_level_values(2).tolist()
multiple_index_list = [i for i in index_list if index_list.count(i)>1]

for index in multiple_index_list:
    print(df_model_1.query('Variable == @index'))

df has unique index : False
                                                         National
Month Day Variable                                               
10    4   Cumulative (confirmed + probable + suspected)    3921.0
          Cumulative (confirmed + probable + suspected)    3929.0
                                      National
Month Day Variable                            
10    4   Cumulative cases among HCW     190.0
          Cumulative cases among HCW     192.0
                                       National
Month Day Variable                             
10    4   Cumulative deaths among HCW      92.0
          Cumulative deaths among HCW      94.0
                                            National
Month Day Variable                                  
10    4   Total death/s in confirmed cases    1012.0
          Total death/s in confirmed cases    1018.0
                                           National
Month Day Variable                                 
10    4   To

We observe only a small difference between the value that are in double here hence we choose to merge them and take the mean in order to be able to unstack the dataframe and see better what Variable we should choose for this model.

In [134]:
for index in multiple_index_list:
    # get the mean of the index values
    mean = df_model_1.query('Variable == @index').values.mean()
    # drop the multiple index from the df
    df_model_1.drop(index, level=2, inplace=True)
    # add the mean instead in the df
    df_model_1.loc[(df_model_1.index.get_level_values(0)[0], df_model_1.index.get_level_values(1)[0], index)] = [mean]


  user_expressions, allow_stdin)
  res = shell.run_cell(code, store_history=store_history, silent=silent)


In [137]:
# now our index should be unique:
print("my index are unique:", df_model_1.index.is_unique)

# so I can unstack and process like the other models:
df_model_1 = df_model_1[['National']].unstack('Variable')
            
# To avoid hierarchical columns, we drop the first level
df_model_1.columns = df_model_1.columns.droplevel(0)
df_model_1.columns.name = ''

df_model_1

my index are unique: True


Unnamed: 0_level_0,Unnamed: 1_level_0,Case Fatality Rate (CFR) - Confirmed & Probable Cases,Contacts lost to follow-up,Contacts seen,Contacts who completed 21 day follow-up,Cumulative (confirmed + probable + suspected),Cumulative CFR,Cumulative cases among HCW,Cumulative deaths among HCW,Currently under follow-up,New Case/s (Probable),...,Total case/s (confirmed),Total confirmed cases,Total contacts listed,Total death/s in confirmed cases,"Total death/s in confirmed, probable, suspected cases",Total death/s in probable cases,Total discharges,Total no. currently in Treatment Units,Total probable cases,Total suspected cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10,4,62.7,2.0,3101.0,132.0,3925.0,62.8,191.0,93.0,6689.0,14.0,...,941.0,935.0,15271.0,1015.0,2204.5,700.0,25.0,448.0,1796.0,1190.0


In [140]:
# lets check for death:
cols = df_model_1.columns.values.tolist()
print(cols);
rel_cols = [c for c in cols if ('death' in c)]
df_model_1.loc[:,rel_cols].head()

['Case Fatality Rate (CFR) - Confirmed & Probable Cases', 'Contacts lost to follow-up', 'Contacts seen', 'Contacts who completed 21 day follow-up', 'Cumulative (confirmed + probable + suspected)', 'Cumulative CFR', 'Cumulative cases among HCW', 'Cumulative deaths among HCW', 'Currently under follow-up', 'New Case/s (Probable)', 'New Case/s (Suspected)', 'New admissions', 'New case/s (confirmed)', 'Newly Reported Cases in HCW', 'Newly Reported deaths in HCW', 'Newly reported contacts', 'Newly reported deaths', 'Total Case/s (Probable)', 'Total Case/s (Suspected)', 'Total Number of Confirmed Cases of Guinean Nationality', 'Total Number of Confirmed Cases of Sierra Leonean Nationality', 'Total case/s (confirmed)', 'Total confirmed cases', 'Total contacts listed', 'Total death/s in confirmed cases', 'Total death/s in confirmed, probable, suspected cases', 'Total death/s in probable cases', 'Total discharges', 'Total no. currently in Treatment Units', 'Total probable cases', 'Total suspecte

Unnamed: 0_level_0,Unnamed: 1_level_0,Cumulative deaths among HCW,Newly Reported deaths in HCW,Newly reported deaths,Total death/s in confirmed cases,"Total death/s in confirmed, probable, suspected cases",Total death/s in probable cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,4,93.0,0.0,43.0,1015.0,2204.5,700.0


```Newly reported``` death seems to be the good columns to pick here for this model.

In [142]:
df_model_1['New Deaths'] = df_model_1[['Newly reported deaths']]

In [146]:
# now lets check for the new case of the day
cols = df_model_1.columns.values.tolist()
print(cols);
rel_cols = [c for c in cols if ('cases' in c or 'case' in c or 'Case' in c)]
df_model_1.loc[:,rel_cols].head()

['Case Fatality Rate (CFR) - Confirmed & Probable Cases', 'Contacts lost to follow-up', 'Contacts seen', 'Contacts who completed 21 day follow-up', 'Cumulative (confirmed + probable + suspected)', 'Cumulative CFR', 'Cumulative cases among HCW', 'Cumulative deaths among HCW', 'Currently under follow-up', 'New Case/s (Probable)', 'New Case/s (Suspected)', 'New admissions', 'New case/s (confirmed)', 'Newly Reported Cases in HCW', 'Newly Reported deaths in HCW', 'Newly reported contacts', 'Newly reported deaths', 'Total Case/s (Probable)', 'Total Case/s (Suspected)', 'Total Number of Confirmed Cases of Guinean Nationality', 'Total Number of Confirmed Cases of Sierra Leonean Nationality', 'Total case/s (confirmed)', 'Total confirmed cases', 'Total contacts listed', 'Total death/s in confirmed cases', 'Total death/s in confirmed, probable, suspected cases', 'Total death/s in probable cases', 'Total discharges', 'Total no. currently in Treatment Units', 'Total probable cases', 'Total suspecte

Unnamed: 0_level_0,Unnamed: 1_level_0,Case Fatality Rate (CFR) - Confirmed & Probable Cases,Cumulative cases among HCW,New Case/s (Probable),New Case/s (Suspected),New case/s (confirmed),Newly Reported Cases in HCW,Total Case/s (Probable),Total Case/s (Suspected),Total Number of Confirmed Cases of Guinean Nationality,Total Number of Confirmed Cases of Sierra Leonean Nationality,Total case/s (confirmed),Total confirmed cases,Total death/s in confirmed cases,"Total death/s in confirmed, probable, suspected cases",Total death/s in probable cases,Total probable cases,Total suspected cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10,4,62.7,191.0,14.0,23.0,5.0,1.0,1798.0,1190.0,4.0,13.0,941.0,935.0,1015.0,2204.5,700.0,1796.0,1190.0


The value that seems to make sense here is ```New case/s (confirmed)```

In [147]:
df_model_1['New Cases'] = df_model_1[['New case/s (confirmed)']]

In [148]:
# now we merge and we are done with this model:
# return a df with only the New Death and New Cases
df_model_1 = df_model_1[['New Deaths','New Cases']]
df_model_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,New Deaths,New Cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1
10,4,43.0,5.0


#### model 2

In [149]:
df_model_2 = model_dataframes[2]
df_model_2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Case Fatality Rate (CFR) - Confirmed & Probable Cases,Contacts lost to follow-up,Contacts seen,Contacts who completed 21 day follow-up,Cumulative (confirmed + probable + suspects),Cumulative admission/isolation,Cumulative cases among HCW,Cumulative deaths among HCW,Currently under follow-up,New Case/s (Probable),...,Total contacts listed,Total death/s in confirmed cases,"Total death/s in confirmed, probable, suspected cases",Total death/s in probable cases,Total death/s in suspected cases,Total discharges,Total no. currently in Treatment Units,Total probable cases,Total specimens tested,Total suspected cases
Month,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10,8,,114.0,4251.0,481.0,4122.0,,199.0,93.0,7066.0,10.0,...,16423.0,,2344.0,,,88.0,459.0,,,
10,9,,17.0,6220.0,622.0,4182.0,,202.0,94.0,7384.0,27.0,...,16793.0,,2389.0,,,10.0,511.0,,,
10,10,,24.0,5521.0,307.0,4209.0,,206.0,93.0,7055.0,7.0,...,16989.0,,2414.0,,,35.0,452.0,,,
10,11,66.0,4.0,6398.0,442.0,4241.0,,207.0,94.0,7081.0,5.0,...,17061.0,1149.0,,,,76.0,436.0,,,
10,12,66.0,16.0,6391.0,452.0,4247.0,326.0,207.0,94.0,7136.0,8.0,...,17095.0,,2451.0,,,75.0,436.0,,,


### 3 - Cleaning Sierra Leone

## Task 2. RNA Sequences

In the `DATA_FOLDER/microbiome` subdirectory, there are 9 spreadsheets of microbiome data that was acquired from high-throughput RNA sequencing procedures, along with a 10<sup>th</sup> file that describes the content of each. 

Use pandas to import the first 9 spreadsheets into a single `DataFrame`.
Then, add the metadata information from the 10<sup>th</sup> spreadsheet as columns in the combined `DataFrame`.
Make sure that the final `DataFrame` has a unique index and all the `NaN` values have been replaced by the tag `unknown`.

### 1- Importing the data

First we import the data, and we add the ```BARCODE``` column using the file name to be able to match the metadata after that.

In [None]:
MICROBIOME_FOLDER = DATA_FOLDER + '/microbiome'

path = MICROBIOME_FOLDER
allFiles = glob.glob(path + "/MID[1-9].xls")
microbiome = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_excel(file_,header=None)
    df['BARCODE'] = file_[18:22]
    list_.append(df)
microbiome = pd.concat(list_,axis=0)
microbiome.columns =['taxon','count','BARCODE']
microbiome.head()

we check wether there is any NaN in the data

In [None]:
microbiome.isnull().values.any()

Then we import the metadata and change the only NA as ```unknown```

In [None]:
METADATA_PATH = MICROBIOME_FOLDER + '/metadata.xls'
metadata = pd.read_excel(METADATA_PATH,header=0)
metadata.fillna(value='unknown',inplace=True)
metadata

### 2 - Merging and keeping a unique index

Now we need to merge the two tables

In [None]:
result = pd.merge(microbiome,metadata)
result.head()

As explained in ```Intro to Pandas II``` : "Lets import two microbiome datasets, each consisting of counts of micro-/organisms from a particular patient", but we can see that the metadata of each ```MID[1-9]``` are different, therefore, the count of taxon for each barcode has been sampled uing differen```(GROUP,SAMPLE)``` pair. Therefore, we can index the dataframe using ```['taxon','GROUP','SAMPLE']``` to then unstack ```GROUP and SAMPLE``` to have for each ```(GROUP,SAMPLE)``` pair (described on the y axis) the count of the taxon (on the x axis).

In [None]:
result = result.set_index(['taxon','GROUP','SAMPLE'])
result.head()

In [None]:
result['count'].unstack('GROUP').unstack('SAMPLE').fillna('unknown')

We see that the indices are unique :

In [None]:
print("The indices are unique : {}".format(result.index.is_unique))

## Task 3. Class War in Titanic

Use pandas to import the data file `Data/titanic.xls`. It contains data on all the passengers that travelled on the Titanic.

In [None]:
from IPython.core.display import HTML
HTML(filename=DATA_FOLDER+'/titanic.html')

For each of the following questions state clearly your assumptions and discuss your findings:
1. Describe the *type* and the *value range* of each attribute. Indicate and transform the attributes that can be `Categorical`. 
2. Plot histograms for the *travel class*, *embarkation port*, *sex* and *age* attributes. For the latter one, use *discrete decade intervals*. 
3. Calculate the proportion of passengers by *cabin floor*. Present your results in a *pie chart*.
4. For each *travel class*, calculate the proportion of the passengers that survived. Present your results in *pie charts*.
5. Calculate the proportion of the passengers that survived by *travel class* and *sex*. Present your results in *a single histogram*.
6. Create 2 equally populated *age categories* and calculate survival proportions by *age category*, *travel class* and *sex*. Present your results in a `DataFrame` with unique index.

In [None]:
titanic = pd.read_excel(DATA_FOLDER+'/titanic.xls',header=0,converters={'ticket':str,'boat':str,'cabin':str,'home.dest':str})
titanic.head()

#We replace the embarcation port by their full name
treatment_map = {'S': 'Southampton', 'Q': 'Queenstown', 'C': 'Cherbourg'}
titanic['embarked'] = titanic['embarked'].map(treatment_map)

In [None]:
titanic.head()

**Question 1: We decided to represent the answer with a dataframe to make the visualization easier**

In [None]:
ranges = []
for i,var in enumerate(titanic.columns):
    ranges.append("'{}' to '{}'".format(titanic.iloc[:, i].dropna().min(),titanic.iloc[:, i].dropna().max()))

df = titanic.transpose()
df['type'] = df[0].map(lambda x: type(x).__name__)
df['range'] = ranges
df[['type','range']]


Following the documentation we see that the values that can be transformes as categories are : ``` pclass, sex, cabin, embarked, boat```

In [None]:
categorical_col= ['pclass','sex', 'cabin', 'embarked', 'boat']
for col in categorical_col:
    titanic[col] = titanic[col].astype('category')


** Question 2 : Now we plot the histograms**

In [None]:
titanic['pclass'].value_counts().plot(kind='bar').set_title('Travel Class histogram')

In [None]:
titanic['sex'].value_counts().plot(kind='bar').set_title('Gender histogram')

In [None]:
titanic['embarked'].value_counts().plot(kind='bar').set_title('Embarcation Port')

In [None]:
pd.cut(titanic.age, [0,10,20,30,40,50,60,70,80,90]).value_counts().plot(kind='bar').set_title('Ages')

***Question 3 : Now we plot the number of passenger depending on the cabine floor (represented by the letter at the beginning of the cabin***

In [None]:
titanic['cabin floor'] = titanic['cabin'].map(lambda x : x[0])

In [None]:
n_passenger = titanic['cabin floor'].value_counts().sum()
floor_repartition = titanic['cabin floor'].value_counts()
print("Floor repartion is :")
floor_repartition.map(lambda x: "{:06.3f}%".format(x/n_passenger *100))

In [None]:
floor_repartition.plot(title='Repartition of passengers according to cabin floors',kind='pie')

**Question 4 : Now for each travel class, we calculate the proportion of the passengers that survived.**
The code is quite straightforward, we compute the number of survivor per class, then take the percentage per class. All the reste is simply cosmetic for it to look better

In [None]:
first_class = titanic[[c == 1 for c in titanic.pclass]]['survived'].value_counts()
n_1 = first_class.sum()
second_class = titanic[[c == 2 for c in titanic.pclass]]['survived'].value_counts()
n_2 = second_class.sum()
third_class = titanic[[c == 3 for c in titanic.pclass]]['survived'].value_counts()
n_3 = third_class.sum()

In [None]:
first_class_repartition = first_class.rename('First class survivors')
first_class_repartition.index = ['survived','died']
print(first_class_repartition.map(lambda x: "{:06.3f}%".format(x/n_1*100)))
first_class_repartition.plot(kind='pie')

In [None]:
second_class_repartition = second_class.rename('Second class survivors')
second_class_repartition.index = ['survived','died']
print(second_class_repartition.map(lambda x: "{:06.3f}%".format(x/n_2*100)))
second_class_repartition.plot(kind='pie')

In [None]:
third_class_repartition = third_class.rename('third class survivors')
third_class_repartition.index = ['survived','died']
print(third_class_repartition.map(lambda x: "{:06.3f}%".format(x/n_3*100)))
third_class_repartition.plot(kind='pie')

**Question 5 : We now calculate the proportion of the passengers that survived by travel class and sex.**
We use seaborn ```factorplot``` to do that very simply?

In [None]:
sns.factorplot(x='sex',y='survived',hue='pclass',kind='bar',data=titanic,ci=None)

** Question 6 : We want to create 2 equally populated age categories and calculate survival proportions by age category, travel class and sex. **
First we separate the ages into two equally populated categories that we will name 'young' and 'old' for the sake of simplicity. 

In [None]:
df = titanic[['pclass','sex','age','survived']]
df['age'] = pd.cut(titanic.age,2,labels=['young','old'])
df.head()

Then we group those according to the the categories that interwst us, compute the sum (will sume the 1s corresponding to survivors and then compute the totals thanks to the ```count``` aggregation function

In [None]:
grouped = df.groupby(['pclass','sex','age']).agg({'survived':'sum'})
grouped['totals'] = df.groupby(['pclass','sex','age']).agg({'survived':'count'})
grouped

Now that we have the number of survivors as well as the count of the population of each group, we can very simply compute the proportion of survivors.


In [None]:
grouped['survived (%)'] = grouped['survived']/grouped['totals']*100
grouped[['survived (%)']].round(2)