----
**Research Question**

## Investigation of changes in **Depressive-Symptoms** influenced by drinking frequency of **Sugar-Sweetened** soft drinks.
----

In [164]:
# importing the necessary libraries

import pandas as pd
import numpy as np
import yaml as yam

from bokeh.plotting import figure,show,output_file
from bokeh.models import HoverTool
from bokeh.layouts import gridplot
from bokeh.palettes import Category10
# import holoviews as hv
from bokeh.io import output_notebook

from scipy.stats import linregress

output_notebook()


----
## Part 1 : Loading The Data
----

In [165]:
# Defining a config parser that parses the YAML file, so that the
# data source paths can be looked up by name in the code below.

def get_config(path='config.yaml'):
    '''
    Load the notebook configuration from a YAML file.

    parameters : path (optional) - location of the YAML file; defaults to
    'config.yaml', resolved against the current working directory, which is
    what the function always read before the parameter existed.

    returns : whatever ``yam.safe_load`` produces for the file content.

    raises : OSError (e.g. FileNotFoundError) when the file cannot be opened.
    '''
    # The encoding is spelled out so parsing does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as datasource:
        return yam.safe_load(datasource)

config = get_config()

# The CSV locations are looked up by name in the parsed configuration.
depressive_symptoms = config['depressive_symptoms']
frequency_of_drinking = config['frequency_of_drinking']

# Read both CSV files, in the same order as before, into pandas dataframes.
depsymp, drinkfreq = (
    pd.read_csv(csv_path)
    for csv_path in (depressive_symptoms, frequency_of_drinking)
)


In [166]:
# Helper to access the info of our dataframes and to check
# that the data was loaded and read correctly.

def df_info(df_list) :
    '''
    Collect the ``DataFrame.info()`` summary of every dataframe in a list.

    ``DataFrame.info()`` writes its report to an output stream and returns
    None, so the report is captured through the ``buf`` argument. The text is
    still printed, and it is also stored so the returned list holds the
    actual summaries.

    parameters : df_list - iterable of pandas dataframes

    returns : list of str, one summary per entry that could be inspected.
    For an entry whose ``info()`` call fails, 'Data not properly loaded' is
    printed and nothing is added to the list.
    '''
    import io  # local import: only needed to capture the text written by info()

    df_info_list = []
    for df in df_list:
        buffer = io.StringIO()
        try:
            df.info(buf=buffer)
        except Exception:
            # Best effort: report the problem and carry on with the next entry,
            # without also swallowing KeyboardInterrupt / SystemExit.
            print('Data not properly loaded')
            continue
        summary = buffer.getvalue()
        print(summary, end='')
        df_info_list.append(summary)
    return df_info_list

# df_info([depsymp,drinkfreq])


----
## Part 2 : **Data Preparation**
---- 

* Renaming columns

In [167]:
# Renaming the desired columns so that their names are easier to understand.

def rename(df,col_list) :
    '''
    Return a new dataframe whose columns are relabelled.

    parameters : df - the dataframe to relabel (left untouched),
    col_list - mapping of current column names to their new names

    returns : the dataframe produced by ``df.rename(columns=col_list)``
    '''
    return df.rename(columns=col_list)

# Mapping of original column names to readable names
# for the depressive-symptoms dataframe.
depsymp_col_list = {
    'isced11': 'Edu_Level',
    'hlth_pb': 'Health_Issue',
    'sex': 'Gender',
    'age': 'Age_Range',
    'geo': 'Country',
    'TIME_PERIOD': 'Time_Period',
    'OBS_VALUE': 'Obs_Value',
}

# Mapping of original column names to readable names
# for the drinking-frequency dataframe.
drinkfreq_col_list = {
    'frequenc': 'Drink_Freq',
    'sex': 'Gender',
    'isced11': 'Edu_Level',
    'age': 'Age_Range',
    'geo': 'Country',
    'TIME_PERIOD': 'Time_Period',
    'OBS_VALUE': 'Obs_Value',
}

depsymp = rename(depsymp, depsymp_col_list)
drinkfreq = rename(drinkfreq, drinkfreq_col_list)

* Identifying missing values

In [168]:
# defining a function to return missing values in multiple dataframes,
# output in the form of a dictionary will contain both dataframe and columns 
# with missing values 

# depsymp.isna().sum()
# depsymp.isnull().sum()


def find_missing_values(df_list):
    '''
    Describe, for every dataframe, which columns contain missing values.

    parameters : df_list - iterable of pandas dataframes

    returns : list of str, one entry per dataframe in input order, formatted
    as 'dataframe <position> = {column: missing count}' (positions start at
    1; only columns with at least one missing value are listed)
    '''

    def _describe(position, frame):
        # Count missing cells per column and keep only the affected columns.
        null_counts = frame.isna().sum().to_dict()
        columns_with_gaps = {
            name: count for name, count in null_counts.items() if count > 0
        }
        return f'dataframe {position} = {columns_with_gaps}'

    return [
        _describe(position, frame)
        for position, frame in enumerate(df_list, start=1)
    ]


# One text entry per dataframe, listing its columns that contain missing values.
missing_values = find_missing_values([depsymp,drinkfreq])

# print(missing_values)

* Identifying unreliable observations

In [169]:
# Flag columns indicate observations which are not reliable.
# Below, the specific value marking unreliable observations and the number of
# rows containing it are inspected.

flagged_frames = (depsymp, drinkfreq)

# First the distinct flag values of each dataframe ...
for frame in flagged_frames:
    print(frame['OBS_FLAG'].unique())

# ... then the number of rows carrying the 'u' flag in each dataframe.
for frame in flagged_frames:
    print(len(frame[frame['OBS_FLAG'] == 'u']))

[nan 'u']
[nan 'u']
36
27


* Removing unreliable observations and columns with missing values

In [170]:
# defining a function that takes in list of dataframes and returns 
# cleaned dataframes (without columns containing missing values, and rows with unreliable observations)

def drop_missing_values(dataframes):
    '''
    Clean every dataframe in a list.

    For each dataframe, rows whose 'OBS_FLAG' equals 'u' are removed (only
    when that column exists), and afterwards every column that still contains
    a missing value is dropped.

    parameters : dataframes - iterable of pandas dataframes

    returns : list with the cleaned dataframes, in input order
    '''

    def _clean(frame):
        if 'OBS_FLAG' in frame.columns:
            reliable_rows = frame['OBS_FLAG'] != 'u'
            frame = frame[reliable_rows]
        # axis=1: drop columns (not rows) that contain missing values.
        return frame.dropna(axis=1)

    return [_clean(frame) for frame in dataframes]


cleaned_dataframes = drop_missing_values([depsymp, drinkfreq])

# The cleaned dataframes come back in the order they were passed in.
depsymp, drinkfreq = cleaned_dataframes[0], cleaned_dataframes[1]


* Dropping irrelevant columns 

In [171]:
# dropping irrelevant columns; the if statements keep the drop
# from being applied a second time when the cell is re-run

# The columns are selected by position, so the drops only run while the
# 'DATAFLOW' column is still present in the dataframe.
if 'DATAFLOW' in depsymp.columns:
    depsymp_unwanted = depsymp.columns[[0, 1, 2, 3, 4]]
    depsymp = depsymp.drop(columns=depsymp_unwanted)

if 'DATAFLOW' in drinkfreq.columns:
    drinkfreq_unwanted = drinkfreq.columns[[0, 1, 2, 3, 7]]
    drinkfreq = drinkfreq.drop(columns=drinkfreq_unwanted)
    
# print(depsymp)
# print(drinkfreq) 


* Inspect duplicated observations

In [172]:
# checking for duplicated values in both dataframes.

# df.duplicated()
# df.drop_duplicates()

def check_for_duplicated(df_list) :
    '''
    Tell whether any dataframe in a list contains duplicated rows.

    Every dataframe is inspected; the search stops at the first one that
    has a duplicated row.

    parameters : df_list - iterable of pandas dataframes

    returns : True when at least one dataframe has a fully duplicated row,
    otherwise False (also for an empty list)
    '''
    # bool() turns the numpy boolean returned by .any() into a plain bool.
    return any(bool(df.duplicated().any()) for df in df_list)

# Keep the duplicate-check result so it can be inspected (see the print below).
duplicated = check_for_duplicated([depsymp,drinkfreq])
# print(duplicated)


----
## Part 3 : Reshaping the data
----

* Merging data frames 

In [173]:

# Using the pandas merge method, the two dataframes are combined into one
# that holds the values of interest for further analysis.
# The suffixes identify which dataframe a column came from after the merge
# ('_Dri' = drinking frequency, '_Dep' = depressive symptoms).
df = pd.merge(drinkfreq, depsymp,
              on=['Country', 'Time_Period', 'Age_Range', 'Gender'],
              suffixes=['_Dri', '_Dep'],
              how='inner')

# Narrow the merged dataframe down in a single step:
# - keep only the more focused age ranges;
# - the data holds observation values for male, female and an extra
#   total value; only the total ('T') is kept.
in_focused_age_range = df['Age_Range'].isin(['Y15-24', 'Y25-34', 'Y35-44',
                                             'Y45-54', 'Y55-64', 'Y65-74'])
is_gender_total = df['Gender'] == 'T'

df = df[in_focused_age_range & is_gender_total]

# df = df.sort_values(by=['Obs_Value_Dri','Obs_Value_Dep'], ascending= False).copy()

In [174]:
def match_maker(df,col_1,col_2,col_3) :

    '''
    Return every combination of the unique values of three columns.

    Each tuple pairs one unique value from each column; the tuples are
    meant to be used as keys when grouping the dataframe by these columns.
    The list is the full Cartesian product, so it can contain combinations
    that never occur together in a row of the dataframe.

    parameters : dataframe, names of three columns of the dataframe

    returns : list of (col_1 value, col_2 value, col_3 value) tuples; the
    values of col_3 vary fastest and those of col_1 slowest, each in order
    of first appearance in the dataframe
    '''

    values_1 = df[col_1].unique()
    values_2 = df[col_2].unique()
    values_3 = df[col_3].unique()

    # unique() yields distinct values per column, so every triple is produced
    # exactly once and no "already in the list" scan is needed.
    return [(x, y, z) for x in values_1 for y in values_2 for z in values_3]

# x = match_maker(df,'Health_Issue','Drink_Freq','Age_Range')



In [175]:
def get_grouped(df) :

    '''
    Split the dataframe into one dataframe per combination of
    'Health_Issue', 'Drink_Freq' and 'Age_Range'.

    The candidate combinations come from "match_maker()" and each one is
    fetched with ".groupby().get_group()". "match_maker()" builds the full
    product of the unique column values, so a combination may have no rows
    in the dataframe; such combinations are skipped instead of raising a
    KeyError.

    parameters : it takes one parameter,
    which is the dataframe

    returns : list of dataframes in "match_maker()" order, each holding
    the rows of one combination that occurs in the data
    '''
    group_columns = ['Health_Issue','Drink_Freq','Age_Range']

    # Group once; the grouping does not change between combinations.
    grouped = df.groupby(group_columns)
    existing_keys = grouped.groups

    grouped_dfs = []
    for key in match_maker(df,'Health_Issue','Drink_Freq','Age_Range') :
        if key not in existing_keys :
            continue
        grouped_dfs.append(grouped.get_group(key))

    return grouped_dfs

# One dataframe per (Health_Issue, Drink_Freq, Age_Range) combination;
# the plotting functions below read this module-level list.
grouped_dfs = get_grouped(df)
# grouped_dfs[0].head()
# Preview the first rows of the second group.
grouped_dfs[1].head()


Unnamed: 0,Drink_Freq,Gender,Age_Range,Country,Time_Period,Obs_Value_Dri,Health_Issue,Obs_Value_Dep
7920,1-3W,T,Y25-34,AT,2019,24.3,DPR,3.9
7932,1-3W,T,Y25-34,BE,2019,22.2,DPR,7.7
7944,1-3W,T,Y25-34,BG,2019,32.0,DPR,1.4
7956,1-3W,T,Y25-34,CY,2019,29.0,DPR,0.9
7968,1-3W,T,Y25-34,CZ,2019,23.6,DPR,2.3


----
## Part 5 : Data Visualization (ongoing)
----

In [176]:
def make_line_plot():
    '''
    Draw one line plot per selected group and show them in a two-column grid.

    Every plot has the countries of the group on the x axis and two lines:
    the drinking-frequency values ('Obs_Value_Dri') and the
    depressive-symptom values ('Obs_Value_Dep').

    The groups are read from the module-level "grouped_dfs" list, taking
    every 6th entry.
    NOTE(review): the stride of 6 presumably matches the six age ranges kept
    after the merge (one plot per Health_Issue / Drink_Freq pair) - confirm.

    parameters : none

    returns : nothing, the grid is displayed with "show()"
    '''
    plot_list=[]
    for df in grouped_dfs[::6]:

        # All rows of a group share these three values, so the first row is
        # used for the title; position 1 would fail on a single-row group.
        title = (f"{df['Drink_Freq'].iloc[0]} - "
                 f"{df['Health_Issue'].iloc[0]} - "
                 f"{df['Age_Range'].iloc[0]}")

        fig = figure(x_range=df['Country'],height=250,width=550,
                     title=title,
                     x_axis_label='EU Countries',y_axis_label='Frequency in Percentage')

        fig.line(x=df['Country'],y=df['Obs_Value_Dri'],
                 color='#CC79A7',line_width=1.2,legend_label='drinkfreq')
        fig.line(x=df['Country'],y=df['Obs_Value_Dep'],
                 color='#0072B2',line_width=1.2,legend_label='Health_Issue')

        # Soften the vertical grid lines and hide the x axis line.
        fig.xgrid.grid_line_alpha = 0.5
        fig.xaxis.axis_line_alpha = 0
        fig.ygrid.grid_line_alpha = 1

        plot_list.append(fig)

    grid = gridplot(plot_list,ncols=2)
    show(grid)

# Build and display the grid of line plots.
make_line_plot()


In [177]:
def create_scatter_plot():
    '''
    Draw one scatter plot with a fitted regression line per selected group
    and show them in a two-column grid.

    The x axis holds the drinking-frequency values ('Obs_Value_Dri') and the
    y axis the depressive-symptom values ('Obs_Value_Dep'), as the axis
    labels announce. The regression is fitted in that same direction.

    The groups are read from the module-level "grouped_dfs" list, taking
    every 6th entry.
    NOTE(review): the stride of 6 presumably matches the six age ranges kept
    after the merge - confirm.

    parameters : none

    returns : nothing, the grid is displayed with "show()" after
    "output_file('myplot.html')" has been configured
    '''

    # Identical for every figure, so it is built once.
    TOOLTIPS = [
        ('index','$index'),
        ('(x,y)','($x,$y)'),
        ('Country','@Country')
    ]

    scatter_plots=[]

    for df in grouped_dfs[::6]:

        # All rows of a group share these three values, so the first row is
        # used for the title; position 1 would fail on a single-row group.
        title = (f"{df['Health_Issue'].iloc[0]} / "
                 f"{df['Drink_Freq'].iloc[0]} - "
                 f"{df['Age_Range'].iloc[0]}")

        scatter = figure(height=300,width=550,
                         title=title,
                         x_axis_label='DRI/freq.OBS.VAL',y_axis_label='DEP/symp.OBS.VAL',
                         tools='hover',tooltips=TOOLTIPS)

        # The group dataframe is passed as the data source and the columns
        # are referenced by name, so '@Country' in the tooltip can resolve.
        scatter.scatter(x='Obs_Value_Dri',y='Obs_Value_Dep',source=df,
                        size=5,color='red')

        scatter.xaxis.axis_line_alpha = 0
        scatter.yaxis.axis_line_alpha = 0

        x = df['Obs_Value_Dri'].values
        y = df['Obs_Value_Dep'].values

        # A line can only be fitted with at least two points and some spread
        # in x; otherwise the scatter is kept without a regression line.
        if len(x) >= 2 and np.ptp(x) > 0:
            res = linregress(x,y)
            # The two end points are enough to draw a straight line.
            x_ends = np.array([x.min(),x.max()])
            y_regress = res.slope * x_ends + res.intercept
            scatter.line(x=x_ends,y=y_regress,color='red')

        scatter_plots.append(scatter)

    grid = gridplot(scatter_plots,ncols=2)
    output_file('myplot.html')
    show(grid)
    


# Build and display the grid of scatter plots.
create_scatter_plot()