In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Load the data

In [13]:
raw_df = pd.read_csv('./data/FAOSTAT_data.csv')
raw_df

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20002002,2000-2002,%,88,E,Estimated value,
1,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20012003,2001-2003,%,89,E,Estimated value,
2,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20022004,2002-2004,%,92,E,Estimated value,
3,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20032005,2003-2005,%,93,E,Estimated value,
4,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,21010,Average dietary energy supply adequacy (percen...,20042006,2004-2006,%,94,E,Estimated value,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139523,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20142016,2014-2016,g/pc/d,69.9,E,Estimated value,
139524,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20152017,2015-2017,g/pc/d,68.6,E,Estimated value,
139525,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20162018,2016-2018,g/pc/d,66.9,E,Estimated value,
139526,FS,Suite of Food Security Indicators,716,Zimbabwe,6123,Value,21061,Average fat supply (g/cap/day) (3-year average),20172019,2017-2019,g/pc/d,63.6,E,Estimated value,


Food insecurity:

* Prevalence of severe food insecurity in the total population (percent): 210401 (M/F)
* Prevalence of moderate or severe food insecurity in the total population (percent): 210091 (M/F)
* Number of severely food insecure people (million): 210071 (M/F)
* Number of moderately or severely food insecure people (million): 210081 (M/F)

In [27]:
raw_df.columns = raw_df.columns.str.lower()
raw_df.columns = raw_df.columns.str.replace(' ', '_')

In [37]:
# filter out the data by item code
# prevalence of sever food insecurity in the total population
df1 = raw_df[(raw_df['item_code'] == 210401) | 
             (raw_df['item_code'] == '210401M') | 
             (raw_df['item_code'] == '210401F')]

# prevalence of moderate or severe food insecurity in the total population
df2 = raw_df[(raw_df['item_code'] == 210091) |
            (raw_df['item_code'] == '210091M') |
            (raw_df['item_code'] == '210091F')]


Unnamed: 0,domain_code,domain,area_code_(m49),area,element_code,element,item_code,item,year_code,year,unit,value,flag,flag_description,note
147,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401M,Prevalence of severe food insecurity in the ma...,20142016,2014-2016,%,14.2,A,Official figure,FAO data
148,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401M,Prevalence of severe food insecurity in the ma...,20152017,2015-2017,%,14.1,A,Official figure,FAO data
149,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401M,Prevalence of severe food insecurity in the ma...,20162018,2016-2018,%,15.3,A,Official figure,FAO data
150,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401M,Prevalence of severe food insecurity in the ma...,20172019,2017-2019,%,14.1,A,Official figure,Official estimate
151,FS,Suite of Food Security Indicators,4,Afghanistan,6121,Value,210401M,Prevalence of severe food insecurity in the ma...,20182020,2018-2020,%,14.8,A,Official figure,Official estimate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138916,FS,Suite of Food Security Indicators,716,Zimbabwe,6121,Value,210401F,Prevalence of severe food insecurity in the fe...,20162018,2016-2018,%,37.1,A,Official figure,FAO data
138917,FS,Suite of Food Security Indicators,716,Zimbabwe,6121,Value,210401F,Prevalence of severe food insecurity in the fe...,20172019,2017-2019,%,35.8,A,Official figure,FAO data
138918,FS,Suite of Food Security Indicators,716,Zimbabwe,6121,Value,210401F,Prevalence of severe food insecurity in the fe...,20182020,2018-2020,%,33.7,A,Official figure,FAO data
138919,FS,Suite of Food Security Indicators,716,Zimbabwe,6121,Value,210401F,Prevalence of severe food insecurity in the fe...,20192021,2019-2021,%,33.1,A,Official figure,FAO data


Unnamed: 0,domain_code,domain,area_code_(m49),area,element_code,element,item_code,item,year_code,year,unit,value,flag,flag_description,note


In [15]:
df = pd.read_csv('data/CleanedData/foodinsecuritybygender.csv', encoding='ISO-8859-1')
df

Unnamed: 0.1,Unnamed: 0,Area,Region,YearPeriod,AVG_PercentFemale,AVG_PercentMale
0,1,Albania,Europe,Y20002002,,
1,2,Albania,Europe,Y20012003,,
2,3,Albania,Europe,Y20022004,,
3,4,Albania,Europe,Y20032005,,
4,5,Albania,Europe,Y20042006,,
...,...,...,...,...,...,...
1969,1970,Zimbabwe,Africa,Y20162018,37.1,37.1
1970,1971,Zimbabwe,Africa,Y20172019,35.8,35.8
1971,1972,Zimbabwe,Africa,Y20182020,33.7,33.7
1972,1973,Zimbabwe,Africa,Y20192021,33.1,33.1


## Further cleaning and preprocessing

In [16]:
# lowercase the columns and remove the space bar
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df = df.rename(columns={'unnamed:_0': 'ID', 'yearperiod': 'year_period', 'avg_percentfemale':'avg_percent_female', 'avg_percentmale':'avg_percent_male'})
df.head()

Unnamed: 0,ID,area,region,year_period,avg_percent_female,avg_percent_male
0,1,Albania,Europe,Y20002002,,
1,2,Albania,Europe,Y20012003,,
2,3,Albania,Europe,Y20022004,,
3,4,Albania,Europe,Y20032005,,
4,5,Albania,Europe,Y20042006,,


In [17]:
# only see certain columns
df2 = df[['ID', 'area', 'region', 'avg_percent_female', 'avg_percent_male']]
# convert the object values to numeric values
df2[['avg_percent_female', 'avg_percent_male']] = df1[['avg_percent_female', 'avg_percent_male']].apply(pd.to_numeric, errors='coerce')
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[['avg_percent_female', 'avg_percent_male']] = df1[['avg_percent_female', 'avg_percent_male']].apply(pd.to_numeric, errors='coerce')


Unnamed: 0,ID,area,region,avg_percent_female,avg_percent_male
0,1,Albania,Europe,,
1,2,Albania,Europe,,
2,3,Albania,Europe,,
3,4,Albania,Europe,,
4,5,Albania,Europe,,
...,...,...,...,...,...
1969,1970,Zimbabwe,Africa,37.1,37.1
1970,1971,Zimbabwe,Africa,35.8,35.8
1971,1972,Zimbabwe,Africa,33.7,33.7
1972,1973,Zimbabwe,Africa,33.1,33.1


In [18]:
# group by area and region and get the average of the percentage of food insecurity and ignore the NaN values
df3 = df2.groupby(['area', 'region'], as_index=False).agg({'avg_percent_female': 'mean', 'avg_percent_male': 'mean'})
df3.head()

Unnamed: 0,area,region,avg_percent_female,avg_percent_male
0,Albania,Europe,10.542857,10.542857
1,Algeria,Africa,9.642857,9.642857
2,Andorra,Europe,,
3,Angola,Africa,27.92,27.92
4,Austria,Europe,1.214286,1.214286
