In [1]:
import numpy as np
import pandas as pd
import pickle

# 1. Import et traitement de la population par pays

In [2]:
# Read the raw population data from CSV
df1 = pd.read_csv("Sources/population-2003-2013.csv")

# Pivot to one row per country with one column per year, summing 'Valeur',
# then rename the 2003 / 2013 year columns and clear the columns-axis name.
# The string 'sum' is passed instead of the builtin `sum`: pandas >= 2.1 emits
# a FutureWarning for the builtin and will stop mapping it to its own
# groupby sum in a future version.
df1 = (
    pd.pivot_table(
        df1,
        values='Valeur',
        index=['Code Pays', 'Pays'],
        columns='Année',
        aggfunc='sum',
    )
    .reset_index()
    .rename(columns={
        'Pays': 'Country',
        'Code Pays': 'country_code',
        # Year labels are renamed as integers — assumes 'Année' is parsed
        # as an integer column by read_csv.
        2003: 'pop_before',
        2013: 'pop_after',
    })
    .rename_axis(None, axis=1)
)

# Compute the absolute and relative (%) population change between 2003 and 2013
df1['pop_evol'] = df1['pop_after'] - df1['pop_before']
df1['pop_evol (%)'] = (df1['pop_evol'] / df1['pop_before']) * 100

# Keep only the columns needed downstream
df_population = df1[['country_code', 'Country', 'pop_evol (%)']].copy()
df_population.head()

Unnamed: 0,country_code,Country,pop_evol (%)
0,1,Arménie,-1.943347
1,2,Afghanistan,32.168195
2,3,Albanie,-2.037666
3,4,Algérie,18.801321
4,7,Angola,39.2387


# 2. Import et traitement du PIB ($) par habitant

In [3]:
# Read the raw GDP-per-capita data from CSV
df2 = pd.read_csv("Sources/PIB_par_habitant_2013.csv")

# Pivot to one row per country with one column per year, summing 'Valeur',
# then rename the columns and clear the columns-axis name.
# The string 'sum' is passed instead of the builtin `sum`: pandas >= 2.1 emits
# a FutureWarning for the builtin and will stop mapping it to its own
# groupby sum in a future version.
df2 = (
    pd.pivot_table(
        df2,
        values='Valeur',
        index=['Code zone (FAO)', 'Zone'],
        columns='Année',
        aggfunc='sum',
    )
    .reset_index()
    .rename(columns={
        'Zone': 'country',
        'Code zone (FAO)': 'country_code',
        # Assumes 'Année' is parsed as an integer column by read_csv.
        2013: 'PIB 2013',
    })
    .rename_axis(None, axis=1)
)

# Keep only the columns needed downstream
df_pib = df2[['country_code', 'country', 'PIB 2013']].copy()
df_pib.head()

Unnamed: 0,country_code,country,PIB 2013
0,1,Arménie,3838.180041
1,2,Afghanistan,608.058642
2,3,Albanie,4399.863881
3,4,Algérie,5499.587619
4,6,Andorre,39525.132911


# 3. Import et traitement de la disponibilité alimentaire totale

In [4]:
# Read the raw total food-availability data from CSV
df3 = pd.read_csv("Sources/disponibilite-alimentaire-2013.csv")

# Pivot to one row per country with one column per 'Élément', summing 'Valeur'
# over all rows of a given country/element pair, then rename the columns and
# clear the columns-axis name.
# The string 'sum' is passed instead of `np.sum`: pandas >= 2.1 emits a
# FutureWarning for the NumPy callable and will stop mapping it to its own
# groupby sum in a future version.
df3 = (
    pd.pivot_table(
        df3,
        values='Valeur',
        index=['Code Pays', 'Pays'],
        columns=['Élément'],
        aggfunc='sum',
    )
    .reset_index()
    .rename(columns={
        'Code Pays': 'country_code',
        'Pays': 'country',
        'Disponibilité alimentaire (Kcal/personne/jour)': 'kcal_total_capita_day',
        'Disponibilité de protéines en quantité (g/personne/jour)': 'proteins_total_capita_day',
    })
    .rename_axis(None, axis=1)
)

# Keep only the columns needed downstream
df_food_availability_total = df3[[
    'country_code',
    'country',
    'kcal_total_capita_day',
    'proteins_total_capita_day',
]].copy()
df_food_availability_total.head(10)

Unnamed: 0,country_code,country,kcal_total_capita_day,proteins_total_capita_day
0,1,Arménie,2924.0,90.06
1,2,Afghanistan,2087.0,58.26
2,3,Albanie,3188.0,111.37
3,4,Algérie,3293.0,91.92
4,7,Angola,2474.0,57.27
5,8,Antigua-et-Barbuda,2416.0,83.49
6,9,Argentine,3226.0,102.6
7,10,Australie,3278.0,106.26
8,11,Autriche,3770.0,106.2
9,12,Bahamas,2670.0,86.16


# 4. Import et traitement de la disponibilité alimentaire animale

In [5]:
# Read the raw animal-source food-availability data from CSV
df4 = pd.read_csv("Sources/disponibilite-alimentaire-animale-2013.csv")

# Pivot to one row per country with one column per 'Élément', summing 'Valeur'
# over all rows of a given country/element pair, then rename the columns and
# clear the columns-axis name.
# The string 'sum' is passed instead of `np.sum`: pandas >= 2.1 emits a
# FutureWarning for the NumPy callable and will stop mapping it to its own
# groupby sum in a future version.
df4 = (
    pd.pivot_table(
        df4,
        values='Valeur',
        index=['Code Pays', 'Pays'],
        columns=['Élément'],
        aggfunc='sum',
    )
    .reset_index()
    .rename(columns={
        'Code Pays': 'country_code',
        'Pays': 'country',
        'Disponibilité alimentaire (Kcal/personne/jour)': 'kcal_animal_capita_day',
        'Disponibilité de protéines en quantité (g/personne/jour)': 'proteins_animal_capita_day',
    })
    .rename_axis(None, axis=1)
)

# Keep only the columns needed downstream (the animal protein supply)
df_food_availability_animal = df4[['country_code', 'country', 'proteins_animal_capita_day']].copy()
df_food_availability_animal.head(10)

Unnamed: 0,country_code,country,proteins_animal_capita_day
0,1,Arménie,43.25
1,2,Afghanistan,12.22
2,3,Albanie,59.42
3,4,Algérie,24.99
4,7,Angola,18.4
5,8,Antigua-et-Barbuda,56.83
6,9,Argentine,66.94
7,10,Australie,71.68
8,11,Autriche,62.86
9,12,Bahamas,56.43


# 5. Construction, nettoyage et sauvegarde du dataframe principal

In [6]:
# Build the main dataframe: start from the population frame and left-join
# every other indicator on the country code.
# Only the join key and the indicator columns are taken from the right-hand
# frames, so their redundant country-name columns never enter the result and
# no '_x' / '_y' suffixed columns have to be dropped afterwards.
main_df = (
    df_population

        # GDP per capita
        .merge(df_pib[['country_code', 'PIB 2013']],
               how='left', on='country_code')

        # Total food availability (kcal and proteins)
        .merge(df_food_availability_total[['country_code',
                                           'kcal_total_capita_day',
                                           'proteins_total_capita_day']],
               how='left', on='country_code')

        # Animal-source protein availability
        .merge(df_food_availability_animal[['country_code',
                                            'proteins_animal_capita_day']],
               how='left', on='country_code')

        # New column: [animal proteins] / [total proteins] ratio
        .assign(
            proteins_animal_ratio=lambda x: x['proteins_animal_capita_day'] / x['proteins_total_capita_day']
        )

        # Drop the columns that are no longer needed. `columns=` is used
        # because passing the axis positionally (`drop(labels, 1)`) raises a
        # TypeError on pandas >= 2.0.
        .drop(columns=['proteins_animal_capita_day', 'country_code'])
)

# Remove the Chinese sub-entities from the dataset.
# NOTE(review): presumably excluded because an aggregate China row covers
# them — confirm against the source data.
excluded_countries = [
    'Chine - RAS de Hong-Kong',
    'Chine - RAS de Macao',
    'Chine, continentale',
]
main_df = main_df[~main_df['Country'].isin(excluded_countries)]

display(main_df.head(20))

# We save the main_df in a file
with open('Data/part1.pkl', 'wb') as f:
    pickle.dump(main_df, f)

Unnamed: 0,Country,pop_evol (%),PIB 2013,kcal_total_capita_day,proteins_total_capita_day,proteins_animal_ratio
0,Arménie,-1.943347,3838.180041,2924.0,90.06,0.480235
1,Afghanistan,32.168195,608.058642,2087.0,58.26,0.209749
2,Albanie,-2.037666,4399.863881,3188.0,111.37,0.533537
3,Algérie,18.801321,5499.587619,3293.0,91.92,0.271867
4,Angola,39.2387,5254.883253,2474.0,57.27,0.321285
5,Antigua-et-Barbuda,11.111111,12909.744178,2416.0,83.49,0.68068
6,Argentine,9.154596,14534.922536,3226.0,102.6,0.652437
7,Australie,16.989926,66360.873829,3278.0,106.26,0.674572
8,Autriche,4.335544,50264.050148,3770.0,106.2,0.591902
9,Bahamas,19.303797,28783.475729,2670.0,86.16,0.654944
