In [2]:
# Required dependencies
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from scipy.stats import linregress
import os
import numpy as np
import hvplot
import hvplot.pandas
import plotly.express as px

# Is a country's happiness score correlated with its GDP per capita?


In [None]:
# Load the cleaned 2023 dataset that was written to the Output folder
Path_to_csv = 'Output/Clean_2023_df.csv'
clean_data_2023_df = pd.read_csv(filepath_or_buffer=Path_to_csv)

# Preview the first five rows
clean_data_2023_df.head(n=5)

In [None]:
# Scatter plot with a fitted linear regression line to examine the relation
# between countries' Ladder score and Logged GDP per capita
x = clean_data_2023_df['Logged GDP per capita']
y = clean_data_2023_df['Ladder score']

# Fit the least-squares line; line_eq is the legend label, line the fitted values
slope, intercept, rvalue, p_value, std_err = linregress(x, y)
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
line = slope * x + intercept

# Draw the data points and the regression line on one figure
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='steelblue', label='Data points')
plt.plot(x, line, color='red', label=f'{line_eq}')
plt.title('Scatter Plot of Happiness Score vs GDP per Capita with Regression Line')
plt.xlabel('Logged GDP per Capita')
plt.ylabel('Happiness Score')
plt.legend()
plt.grid(True)

# Report the goodness of fit
print(f"The r-squared is: {rvalue**2}")

# Save the visualisation BEFORE showing it: once plt.show() has rendered the
# figure, a later plt.savefig() writes a new, empty canvas instead of this plot.
plt.savefig('Output/Scatterplot.png')

# Show Plot
plt.show()

Analysis of the regression: the slope of 0.74 indicates that as GDP per capita increases, there is a corresponding rise in the happiness score, though this rise is moderate.
The R-squared value obtained is approximately 0.615. This suggests that about 61.5% of the variance in happiness scores among the countries can be explained by differences in their GDP per capita.
The scatter plot indicates that higher GDP per capita is generally associated with higher happiness scores. There is a positive relationship between these two factors. 

In [None]:
# A dual-axis line chart: happiness score on the left y axis and logged GDP
# per capita on the right y axis, both plotted against the country name.
fig, ax1 = plt.subplots(figsize=(10, 6))

color = 'tab:red'
ax1.set_xlabel('Country Index')
ax1.set_ylabel('Happiness Score', color=color)
ax1.plot(clean_data_2023_df['Country name'], clean_data_2023_df['Ladder score'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# Rotate the country labels on ax1: it owns the visible x axis. twinx() hides
# the x axis of the twin, so rotating ticks on the current (twin) axes via
# plt.xticks() has no visible effect.
ax1.tick_params(axis='x', labelrotation=90)

ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Logged GDP per Capita', color=color)
ax2.plot(clean_data_2023_df['Country name'], clean_data_2023_df['Logged GDP per capita'], color=color)
ax2.tick_params(axis='y', labelcolor=color)

ax1.set_title('Dual Axes Line Chart of Happiness Score vs GDP per Capita')

# Lay out after the title and rotated labels exist so they are accounted for
fig.tight_layout()

# Save the visualisation BEFORE showing it; saving after plt.show() writes an
# empty canvas.
fig.savefig('Output/dual-axisline.png')
plt.show()

Analysis of the dual-axis line chart: there is a visible parallel trend between the happiness scores and GDP per capita, implying a potential correlation where countries with higher GDP per capita tend to have higher happiness scores and vice versa.
The data indicates a linkage between economic performance and happiness, with variations in GDP having a visible association with changes in happiness scores. This correlation suggests that economic factors could be a significant determinant of national well-being.

# Is there a correlation between a country's unemployment rate and its national happiness score?

# Does a country's predicted life expectancy affect its happiness score?

In [None]:
# Load the previously saved 2023 data
# NOTE(review): an earlier cell reads 'Output/Clean_2023_df.csv' - confirm that
# both file names are intended.
dataframe_2023_subset = pd.read_csv(filepath_or_buffer="Output/Clean_2023.csv")

# Preview the first five rows
dataframe_2023_subset.head(n=5)

In [None]:
# Flag every row that contains at least one missing value, then collect them
has_missing_value = dataframe_2023_subset.isna().any(axis=1)
nan_values = dataframe_2023_subset[has_missing_value]

# Report the incomplete rows
print("Rows with NaN values:")
print(nan_values)

# Keep only the rows where every column is populated
dataframe_2023_subset_clean = dataframe_2023_subset.dropna(axis=0, how="any")

# Preview the cleaned DataFrame
print("DataFrame after removing rows with NaN values:")
dataframe_2023_subset_clean.head(n=5)

In [None]:
# Regress happiness (Ladder score) on healthy life expectancy using the rows
# that have no missing values.
x_values = dataframe_2023_subset_clean['Healthy life expectancy']
y_values = dataframe_2023_subset_clean['Ladder score']

# Calculate linear regression
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# Plot scatter plot; keep the returned Axes so later drawing targets this
# figure explicitly instead of relying on pyplot's "current axes" state
ax = dataframe_2023_subset_clean.plot(kind="scatter",
                                      x="Healthy life expectancy",
                                      y="Ladder score",
                                      title="Scatter Plot of Happiness Score vs Healthy Life Expectancy",
                                      figsize=(8, 6))

# Plot linear regression line
ax.plot(x_values, regress_values, "r-")

# Annotate line equation in the top-left corner of the data range
ax.annotate(line_eq, (x_values.min(), y_values.max()), fontsize=15, color="red")

# Set y-label to "Happiness Score"
ax.set_ylabel("Happiness Score")

# Print r-squared value
print(f"The r-squared is: {rvalue**2}")

# Calculate variance and std for the "life expectancy" column, using the same
# cleaned rows as the regression so the statistics describe the same sample
life_expectancy_column = dataframe_2023_subset_clean['Healthy life expectancy']

# Population variance (ddof=0) using NumPy
var_numpy = np.var(life_expectancy_column, ddof=0)
print(f"The population variance using the NumPy module is {var_numpy}")

# Population standard deviation (ddof=0) using NumPy
sd_numpy = np.std(life_expectancy_column, ddof=0)
print(f"The population standard deviation using the NumPy module is {sd_numpy}")

# Show plot
plt.show()


**Discussion about the linear relationship:**
The linear relationship between happiness score and healthy life expectancy can be observed in the scatter plot and the linear regression analysis. As healthy life expectancy increases, the happiness score also tends to increase. This positive correlation is evident from the upward trend of the regression line, indicating that countries with higher happiness scores tend to have a higher healthy life expectancy.

In [None]:
# Interactive scatter plot (hvplot.points) of healthy life expectancy for each
# country. The title and y label describe the plotted column
# ("Healthy life expectancy"); the happiness score is only in the hover tooltip.
scatter_plot1 = dataframe_2023_subset_clean.hvplot.points(
    x="Country name",
    y="Healthy life expectancy",
    hover_cols=["Country name", "Healthy life expectancy", "Ladder score"],
    title="Scatter Plot of Healthy Life Expectancy by Country",
    xlabel="Country name",
    ylabel="Healthy life expectancy",
    grid=True,
    line_color="black",
    marker="o",
    size=10,
    fontsize={"xlabel": 10, "ylabel": 10, "title": 14},
    rot=90,
)

# Save the plot as an HTML file
hvplot.save(scatter_plot1, "Healthy Life Expectancy by Country.html")

# Show the plot
scatter_plot1


In [None]:
# Interactive scatter plot (hvplot.points) of happiness score against healthy
# life expectancy. The axis labels match the plotted columns: x is
# "Healthy life expectancy", y is "Ladder score" (the happiness score).
scatter_plot2 = dataframe_2023_subset_clean.hvplot.points(
    x="Healthy life expectancy",
    y="Ladder score",
    hover_cols=["Country name", "Healthy life expectancy","Ladder score"],
    title="Scatter Plot of Healthy Life Expectancy V/S Happiness score",
    xlabel="Healthy life expectancy",
    ylabel="Happiness score",
    grid=True,
    line_color="black",
    marker="o",
    size=10,
    fontsize={"xlabel": 10, "ylabel": 10, "title": 14},
    rot=90,
)

# Save the plot
hvplot.save(scatter_plot2, "Healthy Life Expectancy Vs Happiness score.html")

# Show the plot
scatter_plot2

# Which continent is the happiest? 


Which region is the happiest in 2023? 
The average happiness score of countries differs depending on the region they belong to. In other words, there is a significant difference in the average happiness scores across regions.


In [None]:
# Load the three raw inputs from the RESOURCES folder

# World Happiness Report 2023 records
whr23 = pd.read_csv(filepath_or_buffer='RESOURCES/WHR2023.csv')

# 'world-data-2023.csv' file
world_data = pd.read_csv(filepath_or_buffer='RESOURCES/world-data-2023.csv')

# 'continents2.csv' file
continent_file = pd.read_csv(filepath_or_buffer='RESOURCES/continents2.csv')

In [None]:
# Preview the first five rows of the happiness report
whr23.head(n=5)

In [None]:
# Data cleaning: prepare the continent table for joining

# Align the continent table's column names with the main dataset
continent_column_names = {'name': 'Country name', 'alpha-2': 'country code'}
continent_file.rename(columns=continent_column_names, inplace=True)
continent_file.head(n=5)

In [None]:
# Attach region, sub-region and country code to the happiness records,
# matching on the country name
leading_columns = ['Country name', 'region', 'sub-region', 'country code']
merged_data = whr23.merge(continent_file[leading_columns], on='Country name')

# For easier reading, put the identifying columns first and keep every other
# column in its existing order
remaining_columns = [col for col in merged_data.columns if col not in leading_columns]
column_order = leading_columns + remaining_columns
merged_data = merged_data[column_order]
merged_data.head(n=5)

In [None]:
# Rename the 'Country' column of world_data to 'Country name' so it can be
# used as the join key
world_data.rename(columns={'Country': 'Country name'}, inplace=True)

# Bring the selected world_data columns into the merged dataset
world_columns = ['Country name', 'Unemployment rate', 'Urban_population', 'Population', 'Official language']
full_data = merged_data.merge(world_data[world_columns], on='Country name')

# Display the first few rows of the merged DataFrame
full_data.head(n=5)

In [None]:
# Count the missing values in each column
full_data.isnull().sum()

In [None]:
# List every row that is a full duplicate of another (all copies are kept)
is_duplicate_row = full_data.duplicated(keep=False)
full_data[is_duplicate_row]

In [None]:
# Calculate the percentage of each country's population living in an urban area.
for count_column in ('Urban_population', 'Population'):
    counts = full_data[count_column]
    # Strip the ',' separators only while the column is still text. After the
    # first run the column is already float, and the `.str` accessor would
    # raise, so this guard makes the cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(counts):
        counts = counts.str.replace(',', '', regex=False)
    full_data[count_column] = counts.astype(float)

full_data['Urban_population_percentage'] = (full_data['Urban_population'] / full_data['Population']) * 100
full_data.head()

In [None]:
# Data Visualisation: world map coloured by happiness (Ladder score)

# Define custom colors for the color scale
custom_color_scale = [
    (0.0, "red"),    # Low scores in red
    (0.5, "orange"), # Medium scores in orange
    (1.0, "green")   # High scores in green
]

fig = px.choropleth(full_data, locations="Country name", locationmode='country names',
                    color="Ladder score", hover_name="Country name",
                    title="World Happiness Report: Ladder score by country",
                    color_continuous_scale=custom_color_scale)

# Save the plot. `fig` is a Plotly figure, so matplotlib's plt.savefig() /
# plt.show() cannot see it and would only write or show an empty canvas.
# write_html needs no extra packages; a static PNG could be produced with
# fig.write_image(...), which requires the optional kaleido package.
fig.write_html("Output/world map of ladder score.html")

# Show the plot
fig.show()

In [None]:
# Create a box plot showing the distribution of happiness scores by region
fig = px.box(full_data, x='region', y='Ladder score',
             title='Distribution of Happiness Scores by region',
             labels={'region': 'Continent', 'Ladder score': 'Happiness Score'},
             color='region',
             height=400)

fig.update_layout(xaxis_title='region', yaxis_title='Happiness Score')

# Save the plot. `fig` is a Plotly figure, so matplotlib's plt.savefig() /
# plt.show() cannot see it and would only write or show an empty canvas.
# write_html needs no extra packages; a static PNG could be produced with
# fig.write_image(...), which requires the optional kaleido package.
fig.write_html("Output/box plot of happiness score.html")

# Show the plot
fig.show()

In [None]:
# Split the happiness scores into one Series per region, in this fixed order
region_names = ("Europe", "Asia", "Oceania", "Americas", "Africa")
group0, group1, group2, group3, group4 = (
    full_data.loc[full_data["region"] == region_name, "Ladder score"]
    for region_name in region_names
)

In [None]:
# One-way ANOVA across the five regional score groups. scipy.stats is imported
# in this notebook under the alias `st`; the bare name `stats` is undefined.
st.f_oneway(group0, group1, group2, group3, group4)

In [2]:
# Do Australia, Spain, China and the USA get happier over 2021-2024?
