# Analyzing the Effect of Nutrition (Specifically Protein Content) on COVID-19 Cases Around the World

By: Justin Lu

In [None]:
# imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import datetime
from scipy.stats import mode

In [27]:
! pip install tensorflow==2.0




In [28]:
import tensorflow as tf

In [29]:
from tensorflow.keras import models, layers

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 2453, in <module>
    from tensorflow.python.util import deprecation
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 25, in <module>
    from tensorflow.python.platform import tf_logging as logging
ImportError: cannot import name 'tf_logging' from 'tensorflow.python.platform' (C:\Users\Sysadmin\anaconda3\lib\site-packages\tensorflow\python\platform\__init__.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_g

TypeError: can only concatenate str (not "list") to str

Source: Maria Ren on Kaggle
* Nutritional data from the UN's Food and Agriculture Organization
* Population data from the Population Reference Bureau
* COVID-19 data from Johns Hopkins Center for Systems Science and Engineering

In [None]:
protein = pd.read_csv('Protein_Supply_Quantity_Data.csv')
protein.head()

In [None]:
protein.shape

In [None]:
protein.columns

## Cleaning

Based on a cursory overview of the columns in our dataset, we can tell that most of these variables *could* be significant predictors of COVID-19 spread. The last column, `Unit (all except Population)`, is potentially less useful, as it is just an indication that the data we are working with are all in percentages.

In [None]:
protein = protein.drop(['Unit (all except Population)'], axis = 1)
protein.head()

Another area of cleaning that must be done is the presence of NaN (empty or missing) values in the dataset. The process of dealing with these missing values is known as **data imputation**. Now, we take a look at what values in the dataset are missing and how to deal with it before we start analyzing/creating models.

In [None]:
protein.isna().sum()

There are many ways to go about imputing a missing value, but for this purpose, I chose to fill them with the average value of that specific column.
We can see that some data pertaining to health in the countries, specifically the `Obesity` and `Undernourished` columns, are missing. For these two missing columns, I decided to impute with the average of the column. For the `Confirmed`, `Deaths`, `Recovered`, and `Active` columns, it was possible that data was unavailable/not provided. For example, it is difficult to obtain COVID data from North Korea, because their government probably would not provide it. It is also possible that, at the time that this data was obtained, some countries had not shown cases yet because there was no travel to or from that country. (We know now that every country on the planet has experienced cases of COVID-19.) Therefore, to keep things consistent, I opted to remove these countries from the analysis.

In [None]:
protein['Undernourished'] = protein['Undernourished'].replace({'<2.5': 2.5})
protein['Undernourished'] = np.array(protein['Undernourished']).astype(np.float)

In [None]:
protein['Obesity'] = protein['Obesity'].fillna(protein['Obesity'].mean())
protein['Undernourished'] = protein['Undernourished'].fillna(np.nanmean(protein['Undernourished']))

In [None]:
protein[protein['Confirmed'].isna() == True]

In [None]:
protein = protein.drop(np.array(protein[protein['Confirmed'].isna() == True].index), axis = 0).reset_index(drop = True)

In [None]:
# cleaned dataset 
protein.head()

## Exploratory Data Analysis

I want to extract some quick statistics and plots to better understand what the data distribution looks like.

In [None]:
top = protein[protein['Population'] > 100000000].sort_values(by = 'Confirmed', ascending = False)[:20]
top

In [None]:
plt.figure(figsize=(15,12))
plt.title("Top COVID-19 Affected Countries (with populations over 100 million)")
ax = sns.barplot(x = top.Country, y = top.Confirmed)
suppress = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
protein.columns

In [None]:
af = protein[protein['Country'] == 'Korea, South']
af[af.columns[1:23]].sum()

Extracting foods with high protein, coupled with data pertaining to overall health of each country (obesity, undernourishment):

animal products, eggs, fish/seafood, meat, treenuts, vegetal products, milk-excluding butter, obesity, undernourished

In [None]:
test_dat = protein[['Animal Products', 'Cereals - Excluding Beer', 'Eggs', 
                  'Fish, Seafood', 'Meat', 'Treenuts', 'Vegetal Products', 
                  'Vegetables', 'Obesity', 'Undernourished', 'Confirmed']]
test_dat

In [None]:
test_dat.isna().sum()

## Baseline Model - Using Random Forest Regression

In [None]:
X = test_dat[['Animal Products', 'Cereals - Excluding Beer', 'Eggs', 
                  'Fish, Seafood', 'Meat', 'Treenuts', 'Vegetal Products', 
                  'Vegetables', 'Obesity', 'Undernourished']]
y = test_dat['Confirmed']

In [None]:
types = X.dtypes
numcols = types.loc[types != np.object].index

In [None]:
ct = ColumnTransformer([
    ('numcols', SimpleImputer(strategy = 'constant', fill_value = 0), numcols)
])

pl = Pipeline([('feats', ct), ('reg', LinearRegression())])

X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25)
pl.fit(X_tr, y_tr)
pl.score(X_ts, y_ts)

## After Feature Selection/Engineering 
- Standardization - convert to standard units
- Engineering

In [None]:
final_X = test_dat[['Animal Products', 'Cereals - Excluding Beer', 
                    'Vegetal Products', 'Obesity', 'Undernourished']]
final_y = test_dat['Confirmed']
numcols = ['Obesity', 'Undernourished', 'Cereals - Excluding Beer', 'Vegetal Products', 'Animal Products']

In [None]:
perc_pipe = Pipeline([
    ('imp', SimpleImputer(strategy = 'mean')),
    ('percentage_transform', StandardScaler())
])

ct = ColumnTransformer([ 
    ('perc_cols', perc_pipe, numcols)
])

pl = Pipeline([('feats', ct), ('reg', RandomForestRegressor())])

X_tr, X_ts, y_tr, y_ts = train_test_split(final_X, final_y, test_size=0.25)
pl.fit(X_tr, y_tr)
pl.score(X_ts, y_ts)

## Neural Network with Keras

In [20]:
from tf.keras.models import Sequential
from tf.keras.layers import Dense

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-20-5a993a523ef3>", line 1, in <module>
    from tf.keras.models import Sequential
ModuleNotFoundError: No module named 'tf'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'ModuleNotFoundError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Sysadmin\anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\Users\Sysa

ModuleNotFoundError: No module named 'tf'

obesity and COVID: https://onlinelibrary.wiley.com/doi/10.1002/oby.22818