In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Load the data

In [113]:
raw_df = pd.read_csv('./data/FAOSTAT_data.csv')
raw_df

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20002002,2000-2002,%,88,E,Estimated value,
1,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20012003,2001-2003,%,89,E,Estimated value,
2,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20022004,2002-2004,%,92,E,Estimated value,
3,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20032005,2003-2005,%,93,E,Estimated value,
4,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20042006,2004-2006,%,94,E,Estimated value,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139523,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20142016,2014-2016,g/pc/d,69.9,E,Estimated value,
139524,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20152017,2015-2017,g/pc/d,68.6,E,Estimated value,
139525,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20162018,2016-2018,g/pc/d,66.9,E,Estimated value,
139526,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20172019,2017-2019,g/pc/d,63.6,E,Estimated value,


Food insecurity:

* Prevalence of severe food insecurity in the total population (percent): 210401 (M/F)
* Prevalence of moderate or severe food insecurity in the total population (percent): 210091 (M/F)
* Number of severely food insecure people (million): 210071 (M/F)
* Number of moderately or severely food insecure people (million): 210081 (M/F)

## Further cleaning and preprocessing

In [114]:
raw_df.columns = raw_df.columns.str.lower()
raw_df.columns = raw_df.columns.str.replace(' ', '_')
# change the area column to country column
raw_df = raw_df.rename(columns={'area': 'country'})

In [115]:
# filter out the data by item code
# prevalence of severe food insecurity in the total population
prev_severe_total = raw_df[raw_df['item_code'] == '210401'] 
prev_severe_male = raw_df[raw_df['item_code'] == '210401M']
prev_sever_female = raw_df[raw_df['item_code'] == '210401F']

# prevalence of moderate or severe food insecurity in the total population
prev_mod_total = raw_df[raw_df['item_code'] == '210091']
prev_mod_male = raw_df[raw_df['item_code'] == '210091M']
prev_mod_female = raw_df[raw_df['item_code'] == '210091F']


In [116]:
prev_severe_total.head()

Unnamed: 0,domain_code,domain,area_code_(m49),country,element_code,element,item_code,item,year_code,year,unit,value,flag,flag_description,note
140,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401,Prevalence of severe food insecurity in the to...,20142016,2014-2016,%,14.8,A,Official figure,FAO data
141,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401,Prevalence of severe food insecurity in the to...,20152017,2015-2017,%,15.1,A,Official figure,FAO data
142,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401,Prevalence of severe food insecurity in the to...,20162018,2016-2018,%,17.3,A,Official figure,FAO data
143,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401,Prevalence of severe food insecurity in the to...,20172019,2017-2019,%,17.3,A,Official figure,Official estimate
144,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401,Prevalence of severe food insecurity in the to...,20182020,2018-2020,%,19.8,A,Official figure,Official estimate


In [117]:
# change the value column type to numeric value
prev_severe_total['value'] = pd.to_numeric(prev_severe_total['value'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prev_severe_total['value'] = pd.to_numeric(prev_severe_total['value'], errors='coerce')


In [118]:
# group by the country and get the sum of all the values
prev_severe_total = prev_severe_total.groupby('country', as_index=False)['value'].sum().reset_index(drop=True)
prev_severe_total

Unnamed: 0,country,value
0,Afghanistan,135.3
1,Albania,65.5
2,Algeria,65.1
3,American Samoa,0.0
4,Andorra,0.0
...,...,...
199,Venezuela (Bolivarian Republic of),0.0
200,Viet Nam,2.3
201,Yemen,80.4
202,Zambia,189.4


In [119]:
cleaned_df = pd.read_csv('./data/CleanedData/foodinsecuritybygender.csv', encoding='ISO-8859-1')
cleaned_df.head()

Unnamed: 0.1,Unnamed: 0,Area,Region,YearPeriod,AVG_PercentFemale,AVG_PercentMale
0,1,Albania,Europe,Y20002002,,
1,2,Albania,Europe,Y20012003,,
2,3,Albania,Europe,Y20022004,,
3,4,Albania,Europe,Y20032005,,
4,5,Albania,Europe,Y20042006,,


In [124]:
# group by area and region column
grouped_df = cleaned_df.groupby(['Area', 'Region'], as_index=False).first()

grouped_df.drop(columns=['Unnamed: 0', 'YearPeriod', 'AVG_PercentFemale', 'AVG_PercentMale'], inplace=True)
grouped_df

Unnamed: 0,Area,Region
0,Albania,Europe
1,Algeria,Africa
2,Andorra,Europe
3,Angola,Africa
4,Austria,Europe
...,...,...
89,Ukraine,Europe
90,United Kingdom of Great Britain and Northern I...,Europe
91,United Republic of Tanzania,Africa
92,Zambia,Africa


In [125]:
# lowercase the column names
grouped_df.columns = grouped_df.columns.str.lower()
# change the area column to country
grouped_df = grouped_df.rename(columns={'area': 'country'})

# merge the grouped df and prev_sever_total
merged_df = pd.merge(prev_severe_total,grouped_df, left_on='country', right_on='country')

merged_df

Unnamed: 0,country,value,region
0,Albania,65.5,Europe
1,Algeria,65.1,Africa
2,Andorra,0.0,Europe
3,Angola,133.5,Africa
4,Austria,8.3,Europe
...,...,...,...
89,Ukraine,17.8,Europe
90,United Kingdom of Great Britain and Northern I...,10.4,Europe
91,United Republic of Tanzania,172.4,Africa
92,Zambia,189.4,Africa
