In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [None]:
# load both raw CSV exports from the Resources folder:
# annual hours-worked figures and the 2018 World Happiness Report
employment_data, happiness_data = map(
    pd.read_csv,
    (
        "Resources/Average annual hours actually worked.csv",
        "Resources/World Happiness Report 2018.csv",
    ),
)

In [None]:
# clean up employment_data
# Result: total_employment_2018, one "Country" column plus the 2018
# "Total employment" value renamed to "Avg Hours Worked per Worker".

# keep only the columns the analysis needs
employment_data = employment_data.loc[:, ["Country", "Employment status", "Time", "Value"]]

# restrict to rows whose employment status is "Total employment"
total_employment = employment_data[employment_data["Employment status"].eq("Total employment")]

# 2018 rows only, ordered alphabetically by country, trimmed to the two
# columns used downstream, with the value column given a descriptive name
total_employment_2018 = (
    total_employment[total_employment["Time"].eq(2018)]
    .sort_values("Country")
    .loc[:, ["Country", "Value"]]
    .rename(columns={"Value": "Avg Hours Worked per Worker"})
)

In [None]:
# clean up happiness_data

# the employment frame keys countries under "Country"; give the happiness
# frame the same column name so the two can be merged on it
happiness_data = happiness_data.rename({"Country or region": "Country"}, axis="columns")

In [None]:
# attach the happiness columns to each 2018 employment row (left join on
# "Country"), then discard every row that still has a missing value --
# this removes countries with no happiness match as well as any row with
# a gap in another column
data = (
    total_employment_2018
    .merge(happiness_data, how="left", on="Country")
    .dropna()
)

In [None]:
# Bar chart: Average Number of Hours Worked per Worker, one bar per country

# country names for the x-axis and one integer tick position per country
countries = list(data["Country"])
tick_locations = np.arange(len(countries))

# two-column frame backing the chart
hours_worked = data.loc[:, ["Country", "Avg Hours Worked per Worker"]]

# draw the bars (only the numeric column is plotted; no legend needed)
hours_worked.plot.bar(legend=False, width=.8)

# label each bar with its country name, rotated so the names do not overlap
plt.xticks(tick_locations, countries, rotation=90)

# title and axis labels
plt.title("Average Number of Hours Worked\n"
          "per Worker by Country (2018)")
plt.ylabel("Number of Hours")
plt.xlabel("Country")

# tighten the layout so the rotated labels fit, then render
plt.tight_layout()
plt.show()

In [None]:
# Regression Analysis for Question #1
# Hypothesis: Countries with lower working hours will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on average hours worked,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Avg Hours Worked per Worker"
dependent_variable = "Score"

# define title and labels
xlabel = "Number of Hours"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
hours_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = hours_and_happiness[independent_variable]
y_values = hours_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation (slope is rounded to 4 places here rather than 2)
line_eq = f"y = {round(slope, 4)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (1780, 5.8) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (1780, 5.8), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #2
# Hypothesis: Countries with higher GDP per Capita will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on GDP per capita,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "GDP per capita"
dependent_variable = "Score"

# define title and labels
xlabel = "GDP per Capita"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
gdp_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = gdp_and_happiness[independent_variable]
y_values = gdp_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (1.28, 6.1) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (1.28, 6.1), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #3
# Hypothesis: Countries with higher social support will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on social support,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Social support"
dependent_variable = "Score"

# define title and labels
xlabel = "Social Support"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
support_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = support_and_happiness[independent_variable]
y_values = support_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (1.27, 6.65) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (1.27, 6.65), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #4
# Hypothesis: Countries with higher healthy life expectancy will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on healthy life expectancy,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Healthy life expectancy"
dependent_variable = "Score"

# define title and labels
xlabel = "Healthy Life Expectancy"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
life_expectancy_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = life_expectancy_and_happiness[independent_variable]
y_values = life_expectancy_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (.8, 6.25) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (.8, 6.25), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #5
# Hypothesis: Countries with higher freedom to make life choices will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on freedom to make life
# choices, draws it over a scatter plot of the data, and prints the
# r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Freedom to make life choices"
dependent_variable = "Score"

# define title and labels
xlabel = "Freedom to Make Life Choices"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
freedom_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = freedom_and_happiness[independent_variable]
y_values = freedom_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (.27, 6.65) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (.27, 6.65), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #6
# Hypothesis: Countries with higher generosity will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on generosity,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Generosity"
dependent_variable = "Score"

# define title and labels
xlabel = "Generosity"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
generosity_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = generosity_and_happiness[independent_variable]
y_values = generosity_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (.23, 6.5) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (.23, 6.5), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()

In [None]:
# Regression Analysis for Question #7
# Hypothesis: Countries with lower perceptions of corruption will tend to have a higher happiness score.
#
# Fits a least-squares line of happiness score on perceptions of corruption,
# draws it over a scatter plot of the data, and prints the r-squared value.

# define independent and dependent variables (column names in `data`)
independent_variable = "Perceptions of corruption"
dependent_variable = "Score"

# define title and labels
xlabel = "Perceptions of Corruption"
ylabel = "Happiness Score"
title = f"{ylabel} vs. {xlabel}"

# create dataframe with just the columns this question needs
corruption_and_happiness = data[["Country", independent_variable, dependent_variable]]

# define x and y values
x_values = corruption_and_happiness[independent_variable]
y_values = corruption_and_happiness[dependent_variable]

# run regression
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# calculate the fitted y value for every observed x
regress_values = x_values * slope + intercept

# create line equation
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# plot the data
plt.scatter(x_values, y_values)

# plot the regression line
plt.plot(x_values, regress_values, "r-")

# add equation; (.1, 6.1) is a hard-coded position in data coordinates,
# so it may need adjusting if the underlying data changes
plt.annotate(line_eq, (.1, 6.1), fontsize=15, color="red")

# add title and labels to plot
plt.title(title)
plt.xlabel(xlabel)
plt.ylabel(ylabel)

# print summary data (rvalue is the correlation coefficient, so its square
# is r-squared); the stray ")" that used to trail the number is removed
print(f"The r-squared value is: {round(rvalue**2, 4)}")

# display the plot
plt.show()