----
**Research Question**

## Investigation of changes in **Depressive-Symptoms** influenced by drinking frequency of **Sugar-Sweetened** soft drinks.
----

In [461]:
# importing the necessary libraries

import pandas as pd
import numpy as np
import yaml as yam
from bokeh.plotting import figure,show,output_file
from bokeh.models import ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.palettes import Category10
import holoviews as hv
from bokeh.io import output_notebook

output_notebook()


----
## Part 1 : Loading The Data
----

In [462]:
# reading the data source locations from a YAML config file

def get_config():
    """Parse 'config.yaml' from the working directory and return its content.

    The file is read with the safe YAML loader; whatever object the
    document describes is handed back to the caller unchanged.
    """
    with open('config.yaml','r') as config_file:
        return yam.safe_load(config_file)

# Load the configuration once; it is indexed by key below.
config = get_config()

# Each config entry is passed straight to pd.read_csv, so it must be a
# path (or another source accepted by read_csv) of a CSV file.
depressive_symptoms = config['depressive_symptoms']
frequency_of_drinking = config['frequency_of_drinking']

# Read both CSV sources into pandas data frames.
depsymp = pd.read_csv(depressive_symptoms)
drinkfreq = pd.read_csv(frequency_of_drinking)



----
## Part 2 : **Data Preparation**
---- 

* Renaming columns

In [463]:
# Renaming the columns of interest so that their names are easier to understand.

def rename(df,col_list) :
    """Return a copy of `df` whose columns are renamed according to `col_list`.

    `col_list` maps existing column names to new names. The input data frame
    is left untouched; the renamed frame is returned.
    """
    return df.rename(columns=col_list)

# Mapping from the original column codes of the depressive-symptoms data
# to readable names; columns not listed here keep their original name.
depsymp_col_list = {'isced11':'Edu_Level','hlth_pb':'Health_Issue',
            'sex':'Gender','age':'Age_Range','geo':'Country','TIME_PERIOD':'Time_Period',
            'OBS_VALUE':'Obs_Value'}

depsymp = rename(depsymp,depsymp_col_list)


# Same idea for the drinking-frequency data; the shared columns get the
# same readable names as above so both frames can be handled alike later.
drinkfreq_col_list = {'frequenc':'Drink_Freq','sex':'Gender','isced11':'Edu_Level',
            'age':'Age_Range','geo':'Country','TIME_PERIOD':'Time_Period','OBS_VALUE':'Obs_Value'}

drinkfreq = rename(drinkfreq,drinkfreq_col_list)

* Identifying missing values

In [464]:
# defining a function to report missing values in multiple dataframes;
# the output is a list with one string per dataframe, each showing a dictionary
# of the columns that contain missing values and how many they contain

# depsymp.isna().sum()
# depsymp.isnull().sum()


def find_missing_values(df_list):
    """Summarise the missing values of every data frame in `df_list`.

    Returns a list with one string per data frame, in input order. Each
    string has the form 'dataframe <position> = <dict>', where the position
    is 1-based and the dict maps each column that has at least one missing
    value to its number of missing values.
    """
    def _columns_with_gaps(frame):
        # Per-column NaN counts, keeping only the columns that have any.
        counts = frame.isna().sum().to_dict()
        return {column: count for column, count in counts.items() if count > 0}

    return [
        f'dataframe {position} = {_columns_with_gaps(frame)}'
        for position, frame in enumerate(df_list, start=1)
    ]


# Missing-value report for both data frames (depsymp first, drinkfreq second).
missing_values = find_missing_values([depsymp,drinkfreq])

# print(missing_values)

* Identifying unreliable observations

In [465]:
# Flag columns indicate observations which are not reliable.
# Below, the specific value that marks unreliable observations is identified and
# the number of rows carrying that value is inspected for both data frames.

print(depsymp['OBS_FLAG'].unique())
print(drinkfreq['OBS_FLAG'].unique())
print(len(depsymp[depsymp['OBS_FLAG'] == 'u']))
print(len(drinkfreq[drinkfreq['OBS_FLAG'] == 'u']))

[nan 'u']
[nan 'u']
36
27


* Removing unreliable observations and columns with missing values

In [466]:
# defining a function that takes in list of dataframes and returns 
# cleaned dataframes (without columns containing missing values, and rows with unreliable observations)

def drop_missing_values(dataframes):
    """Clean every data frame in `dataframes` and return the results as a list.

    For each frame, rows whose 'OBS_FLAG' equals 'u' are removed (only when
    the frame has an 'OBS_FLAG' column), and afterwards every column that
    still contains a missing value is dropped. Output order follows input
    order.
    """
    def _clean(frame):
        # Row filter first: the column drop must only look at the kept rows.
        if 'OBS_FLAG' in frame.columns :
            frame = frame[frame['OBS_FLAG'] != 'u']
        return frame.dropna(axis=1)

    return [_clean(frame) for frame in dataframes]

# Clean both data frames in one call. The frames are passed as a literal
# instead of being bound to a variable called `list`, which would shadow the
# built-in `list` type for every later cell of the notebook.
cleaned_dataframes = drop_missing_values([depsymp,drinkfreq])

# The result list keeps the input order: depsymp first, drinkfreq second.
depsymp = cleaned_dataframes[0]
drinkfreq = cleaned_dataframes[1]
# drinkfreq



* Dropping irrelevant columns 

In [467]:
# Dropping irrelevant columns. The `if` guards are meant to keep the cell from
# dropping columns again when it is executed more than once.
# NOTE(review): the columns are selected by position, so the result depends on
# the column order left by the earlier cleaning steps; presumably 'DATAFLOW' is
# among the dropped positions, otherwise the guard would not stop a re-run.
# TODO confirm, and consider dropping by column name instead.

if 'DATAFLOW' in depsymp.columns :
    depsymp = depsymp.drop(depsymp.columns[[0,1,2,3,4]],axis=1)

if 'DATAFLOW' in drinkfreq.columns :
    drinkfreq = drinkfreq.drop(drinkfreq.columns[[0,1,2,3,7]],axis=1)
    
# print(depsymp)
# print(drinkfreq) 


* Inspect duplicated observations

In [468]:
# checking for duplicated values in both dataframes.

# df.duplicated()
# df.drop_duplicates()

def check_for_duplicated(df_list) :
    """Tell whether any data frame in `df_list` contains duplicated rows.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Data frames to inspect.

    Returns
    -------
    bool
        True if at least one frame has a fully duplicated row, False
        otherwise (including when `df_list` is empty).
    """
    # Every frame has to be inspected: returning from inside the loop on the
    # first frame would silently ignore all the remaining ones.
    return any(df.duplicated().any() for df in df_list)

# Result of the duplicate check on the two cleaned data frames.
duplicated = check_for_duplicated([depsymp,drinkfreq])
# print(duplicated)


----
## Part 3 : Reshaping the data
----

* Merging data frames (this step was cancelled: the merge created redundancy in the form of
   duplicated rows, which made it impossible to process the data further.)

In [469]:
## using pandas merge method we are going to combine two dataframes 
## into one with values of interest for further analysing.

# df = pd.merge(drinkfreq,depsymp,
#                 on=['Country','Time_Period','Age_Range','Gender'],
#                 suffixes=['_Dri','_Dep'], #especific suffixes helps to identify the coulmn after being merged
#                 how='inner')

## df = df.sort_values(by=['Obs_Value_Dri','Obs_Value_Dep'], ascending= False).copy()

* The Age_Range column contains multiple ranges; the following function 
   keeps only those ranges which have more coverage.

In [470]:
# # inspection of unique values in order to keep the desired ones in next step
# print(depsymp['Age_Range'].unique() )
# print(drinkfreq['Age_Range'].unique())

# Filtering the data frame for the desired values

def filter_age_range(df_list,
                     age_ranges=('Y15-24','Y25-34','Y35-44','Y45-54','Y55-64','Y65-74')) :
    """Keep only the rows whose 'Age_Range' is one of `age_ranges`.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Data frames to filter; each must have an 'Age_Range' column.
    age_ranges : iterable of str, optional
        Age-range codes to keep. Defaults to the six ten-year bands from
        'Y15-24' to 'Y65-74', which is what the function always used before
        the parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        One filtered, independent copy per input frame, in input order.
    """
    wanted = list(age_ranges)
    # .copy() detaches each result from its source so that later column
    # assignments do not act on a slice of the original frame.
    return [df[df['Age_Range'].isin(wanted)].copy() for df in df_list]


# Filter both frames with a single call and unpack the two results, rather
# than filtering both frames twice and keeping one element of each result.
depsymp, drinkfreq = filter_age_range([depsymp,drinkfreq])


* match_maker() is a function that creates a list of tuples pairing the unique values 
   of two given columns of a data frame, which makes it easier to group by them in the next step.

In [471]:
# we will define a function to make a list containing 
# tuples of paired values of any two given columns.

def match_maker(df,col_1,col_2) :
    """Pair every unique value of `col_1` with every unique value of `col_2`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame holding both columns.
    col_1, col_2 : str
        Names of the two columns to combine.

    Returns
    -------
    list of tuple
        All (col_1 value, col_2 value) combinations, ordered by first
        appearance of the col_1 value and then of the col_2 value. The list
        is the full cross product, so it can contain pairs that never occur
        together in a row of `df`.
    """
    first_values = df[col_1].unique()
    second_values = df[col_2].unique()
    # Both inputs are already unique, so every pair is produced exactly once
    # and no membership check against the result list is needed.
    return [(x, y) for x in first_values for y in second_values]
    

# match_maker(drinkfreq,'Gender','Drink_Freq')

* data_grouper(), this function is designed to access the specific values 
   inside a group and grouping them again in order to make plotting more focused. Actually our final dataframes 
   are going to be made here inside this function.

In [472]:
def data_grouper(df,col_1,col_2) :
    """Split `df` into one data frame per (col_1, col_2) value pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame to split.
    col_1, col_2 : str
        Names of the two columns to group by.

    Returns
    -------
    list of pandas.DataFrame
        The groups, in the order of the pairs returned by
        match_maker(df, col_1, col_2).

    Raises
    ------
    KeyError
        If a pair produced by match_maker never occurs together in `df`,
        because no group exists for it.
    """
    # The grouping does not change between pairs, so build it once instead of
    # regrouping the whole frame on every iteration.
    groups = df.groupby([col_1,col_2])
    return [groups.get_group(pair) for pair in match_maker(df,col_1,col_2)]

# In total there are 21 data frames, distinguished by Gender: female values,
# male values, and total values that cover both genders.
# The data frames selected below are the ones holding the total values.
# NOTE(review): the positions used here assume that the total Gender code is
# the third unique Gender value in each frame - verify with match_maker().

# Build every grouping once and pick the needed groups from it.
drinkfreq_groups = data_grouper(drinkfreq,'Gender','Drink_Freq')
# The depressive-symptom groups have to come from depsymp grouped by
# Health_Issue; taking them from the drinking-frequency groups would make
# dpr_oth_tot the very same frame as upto3w_tot.
depsymp_groups = data_grouper(depsymp,'Gender','Health_Issue')

upto3w_tot     = drinkfreq_groups[8]
fourto6w_tot   = drinkfreq_groups[9]
gen1perday_tot = drinkfreq_groups[10]
nevorocc_tot   = drinkfreq_groups[11]
dpr_tot        = depsymp_groups[6]
dpr_mjr_tot    = depsymp_groups[7]
dpr_oth_tot    = depsymp_groups[8]

# upto3w_tot

# match_maker(depsymp,'Gender','Health_Issue')
# match_maker(drinkfreq,'Gender','Drink_Freq')

----
## Part 4 : Smooth The Data (on going)
----

In [473]:
# defining a function that takes in a list of dataframes and returns a list of 
# smoothed data frames. The built-in method '.rolling().mean()' is used.

# def smooth_dataframes(dataframes,window_size) :
#     smoothed_dataframes = []
#     for df in dataframes :
#         df = df.copy()
#         df['Obs_Value_Dri'] = df['Obs_Value_Dri'].rolling(window=window_size).mean()
#         smoothed_dataframes.append(df)
#     return smoothed_dataframes

# dataframes = smooth_dataframes([df1,df2,df3,df4],3)
# dataframes[1]

----
## Part 5 : Data Visualization (on going)
----

* Line plot

In [474]:
def make_line_plot(df) :
    """Show a grid of line plots of 'Obs_Value' per country, one per age range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age_Range', 'Country' and 'Obs_Value'.

    The figures are arranged two per row and displayed with bokeh's show();
    nothing is returned.
    """
    # NOTE(review): plot_height / plot_width were removed in Bokeh 3.0 in
    # favour of height / width - switch when the environment is upgraded.
    grid_plot_list = []
    for age in df['Age_Range'].unique() :
        filtered = df.loc[df['Age_Range'] == age]
        # A categorical axis needs each factor only once, so the range is
        # built from the unique countries (as make_plot does), not from the
        # raw column, which may repeat a country.
        countries = list(filtered['Country'].unique())
        l_fig = figure(x_range= countries,title='line plot',
                   plot_height=250,plot_width=500,x_axis_label='x',y_axis_label='y',)
        l_fig.line(filtered['Country'],filtered['Obs_Value'])

        grid_plot_list.append(l_fig)

    grid = gridplot(grid_plot_list,ncols= 2)

    show(grid)

# Render the line-plot grid for the upto3w_tot data frame.
make_line_plot(upto3w_tot)

In [475]:

def make_plot(df_list,title,y,y_label) :
    """Draw one vertical bar chart per data frame and show them in a grid.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to plot; each needs a 'Country' column and the column `y`.
    title : sequence of str
        Figure titles, indexed by the position of the frame in `df_list`.
    y : str
        Name of the column providing the bar heights.
    y_label : str
        Label of the y axis.

    The figures are laid out two per row, the output is directed to
    'myplot.html' and the grid is shown; nothing is returned.
    """
    panels = []

    for position, frame in enumerate(df_list) :
        panel = figure(
            x_range=frame.Country.unique(),
            title=title[position],
            plot_height=300,
            plot_width=600,
            x_axis_label='EU Countries',
            y_axis_label=y_label,
        )

        # Bars start at zero; vertical grid lines are hidden.
        panel.y_range.start = 0
        panel.xgrid.grid_line_color = None
        panel.ygrid.grid_line_alpha = 1

        panel.vbar(x=frame['Country'], top=frame[y], width=0.8)
        panels.append(panel)

    layout = gridplot(panels, ncols=2)

    output_file('myplot.html')

    show(layout)



In [476]:
# df_list = [A,B,C,D]

# Titles for the drinking-frequency bar charts, one per data frame passed to
# make_plot (which looks them up by position).
title_list = ['1-3W  Drinking / Country Percentage',
              '4-6W  Drinking / Country Percentage',
              'GE1D  Drinking / Country Percentage',
              'NVR_OCC  Drinking / Country Percentage']

# make_plot(df_list,title_list,'Obs_Value_Dri','DRI Percentage')

In [477]:

# df_list = [df1,df2,df3,df4]
# Titles for the depressive-symptom bar charts, one per data frame passed to
# make_plot (which looks them up by position).
# NOTE(review): the last entry '1' looks like a placeholder for a fourth
# figure - confirm or replace it before enabling the call below.
title_list1 = ['(DPR) Depressive symptoms / Country ratio in Percentage',
               '(DPR_MJR) Major Depressive symptoms / Country ratio in Percentage',
               '(DPR_OTH) Other Depressive symptoms / Country ratio in Percentage',
               '1']

# make_plot(df_list,title_list1,'Obs_Value_Dep','DEP percentage')
