In [None]:
# !pip install pandas

In [None]:
# !pip install seaborn

In [None]:
# Import Python libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# World Happiness Report sample, stored column-wise: every list has one
# entry per country row.
data = {
    "id": list(range(12)),
    "country": [
        "Albania", "Bulgaria", "Iran", "Ukraine", "South Africa", "Ukraine",
        "Austria", "Croatia", "Denmark", "Portugal", "Uruguay", "Thailand",
    ],
    "lifeexp": [77.6, 75, 75.8, 71.9, 61.8, 71.9, 81.4, 77.3, 80.7, 80.8, 77.1, 77.6],
    "unemployment": [6.09, 3.24, 2.11, 1.53, 7.52, 1.53, 1.43, 5.53, 1.36, 4.37, 0.16, 0.06],
    # The first six rows are labelled "Low", the last six "High".
    "happiness": ["Low"] * 6 + ["High"] * 6,
}

In [None]:
# Build the World Happiness Report DataFrame from the column dictionary.
# No explicit index is supplied, so pandas assigns its default integer index.
world_happiness_data = pd.DataFrame(data)

In [None]:
# Dimensions of the dataset as (rows, columns)
world_happiness_data.shape

In [None]:
# Data type pandas inferred for each column
world_happiness_data.dtypes

In [None]:
# Count how many countries fall into each happiness class (class balance).
# value_counts() on the whole frame counts identical *rows*; because `id` is
# unique, every row would be reported exactly once, which says nothing useful.
world_happiness_data['happiness'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
world_happiness_data.describe()

In [None]:
# Box plot of the life-expectancy column: save the figure first, then display it.
world_happiness_data['lifeexp'].plot.box(title="Life Expectancy")
plt.savefig('lifeexp_boxplot.png')
plt.show()

In [None]:
# Box plot of the unemployment-rate column: save the figure first, then display it.
world_happiness_data['unemployment'].plot.box(title="Unemployment Rate")
plt.savefig('unemployment_boxplot.png')
plt.show()

In [None]:
# Distribution of life expectancy across all countries, using 10 bins.
plt.hist(world_happiness_data['lifeexp'], bins=10, edgecolor="white")
plt.title("lifeexp")
plt.xticks(range(65, 85, 5))  # ticks at 65, 70, 75, 80
plt.yticks(range(6))          # ticks at 0..5
plt.savefig('lifeexp_hist.png')
plt.show()

In [None]:
# Distribution of the unemployment rate across all countries, using 8 bins.
plt.hist(world_happiness_data['unemployment'], bins=8, edgecolor="white")
plt.title("unemployment")
plt.xticks(range(0, 8, 2))  # ticks at 0, 2, 4, 6
plt.yticks(range(6))        # ticks at 0..5
plt.savefig('unemployment_hist.png')
plt.show()

In [None]:
# Split the life-expectancy values by happiness class.
high = world_happiness_data.loc[world_happiness_data['happiness'] == 'High', 'lifeexp']
low = world_happiness_data.loc[world_happiness_data['happiness'] == 'Low', 'lifeexp']

# Draw both classes in a single histogram so they can be compared directly.
plt.hist(
    [high, low],
    bins=7,
    label=['High', 'Low'],
    color=['#F2A71B', '#BD2A2E'],
    edgecolor='white',
)
plt.title('Life Expectancy by Happiness Level')
plt.xlabel('Life Expectancy')
plt.ylabel('Frequency')
plt.xticks(range(65, 85, 5))  # ticks at 65, 70, 75, 80
plt.yticks(range(4))          # ticks at 0..3
plt.legend()
plt.savefig('lifeexp_happiness.png')
plt.show()


In [None]:
# Split the unemployment rates by happiness class. A single .loc lookup
# (row mask, column label) replaces the chained df[mask][column] indexing.
high_unemp = world_happiness_data.loc[world_happiness_data['happiness'] == 'High', 'unemployment']
low_unemp = world_happiness_data.loc[world_happiness_data['happiness'] == 'Low', 'unemployment']

# Draw both classes in a single histogram so they can be compared directly.
plt.hist(
    [high_unemp, low_unemp],
    bins=7,
    label=['High', 'Low'],
    color=['#F2A71B', '#BD2A2E'],
    edgecolor='white',
)
plt.xlabel('Unemployment Rate')
plt.ylabel('Frequency')
plt.title('Unemployment Rate by Happiness Level')
plt.yticks(range(0, 4))
plt.xticks([0, 2, 4, 6])
plt.legend()
plt.savefig('unemp_happiness.png')
plt.show()

In [None]:
# Histogram of life expectancy by happiness class
# sns.histplot(data=world_happiness_data, x='lifeexp', hue='happiness', bins=10, palette={'High': 'blue', 'Low': 'red'}, multiple='dodge')
# plt.title('Life Expectancy by Happiness Level')
# plt.xlabel('Life Expectancy')
# plt.ylabel('Number of Countries')
# plt.show()

In [None]:
# Keep every column except 'country'; the later cells work from this subset.
certain_columns_df = world_happiness_data[['id', 'lifeexp', 'unemployment', 'happiness']]

In [None]:
# Display the selected columns (the frame has only 12 rows, so it is shown in full)
certain_columns_df 

In [None]:
# Import scikit-learn 
# !pip install scikit-learn 

In [None]:
# Import the Python machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# !pip install graphviz

In [None]:
import graphviz 
from sklearn import tree

In [None]:
# Preview the first five rows of the selected columns
certain_columns_df.head()

In [None]:
# Summary statistics for the numeric columns of the selected subset
certain_columns_df.describe()

In [None]:
# Dimensions of the selected subset as (rows, columns)
certain_columns_df.shape

In [None]:
from functions import classComparePlot, boxPlotAll, histPlotAll, correlationMatrix, scatterMatrix, appendEqualCountsClass, logisticRegressionSummary, decisionTreeSummary, linearRegressionSummary, viewDecisionTree, find_outliers


In [None]:
import os
# Show the current working directory.
# NOTE(review): presumably a check that the local `functions` module can be found — confirm, or remove this cell.
os.getcwd()

In [None]:
# IPython shell escape: list the files in the current working directory
!ls

Visualise

In [None]:
# Plot histograms for life expectancy and unemployment rate using the
# histPlotAll helper imported from the local `functions` module
histPlotAll(certain_columns_df[['lifeexp', 'unemployment']])

In [None]:
# Plot box plots for life expectancy and unemployment rate using the
# boxPlotAll helper imported from the local `functions` module
boxPlotAll(certain_columns_df[['lifeexp', 'unemployment']])

In [None]:
# Compare the two numeric features between the 'happiness' classes as histograms.
# NOTE(review): the exact output is defined by classComparePlot in the local `functions` module — confirm.
classComparePlot(certain_columns_df[['happiness', 'lifeexp', 'unemployment']], 'happiness', plotType='hist')

## Prepare the data for machine learning

#### Select Features and Split into input and target features

In [None]:
# label - what we want to predict (lowercase y): the 'happiness' class of each row
y = certain_columns_df['happiness']

In [None]:
# features - the inputs used to make the prediction (capital X):
# life expectancy and unemployment rate
X = certain_columns_df[['lifeexp', 'unemployment']]

In [None]:
# Inspect the first five labels in y
y.head()

In [None]:
# Inspect the first five feature rows in X
X.head()

## Build the model

### Split both the features and the label into training and test sets

Keep roughly 1/3 (0.33) of X and y for testing.

In [None]:
# Fraction of the rows held out for testing.
test_size = 0.33
# Fixed seed so the split is the same on every run.
seed = 20250513

# train_test_split returns four datasets in this order:
# train features, test features, train labels, test labels.
# NOTE(review): no `stratify` argument is passed, so the High/Low balance of
# each partition is left to the shuffle — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Training features (model input) produced by the split
X_train

In [None]:
# Test features (model input) held out by the split
X_test

In [None]:
# Training labels (target) matching the rows of X_train
y_train

In [None]:
# Test labels (target) matching the rows of X_test
y_test

### Select an algorithm

#### We will use the DecisionTree algorithm to create a model

In [None]:
# Instantiate an empty, not-yet-trained decision-tree classifier.
model = DecisionTreeClassifier(
    max_depth=2,      # limit the tree to a depth of two
    random_state=42,  # fixed seed so repeated fits give the same tree
)

### Train the model (Same as saying, Fit the model to the data) - exploring the data to identify patterns

In [None]:
# Train the model on the training split only: X_train holds the input
# features and y_train the matching labels
model.fit(X_train, y_train)

### Check the model - how good the model is - accuracy...

In [None]:
# First check how well the model reproduces the labels of the data it was
# trained on. The result is stored under its own name so it can never be
# mistaken for the test-set `predictions` computed further down, even if
# cells are run out of order.
train_predictions = model.predict(X_train)
print(accuracy_score(y_train, train_predictions))

## Evaluate the model - using test datasets

In [None]:
# Given X_test (rows held out from training), make predictions
predictions = model.predict(X_test)

In [None]:
# Check the predicted labels for the test rows
predictions 

In [None]:
# Evaluate the accuracy of the model by comparing the actual test labels with the predictions
print(accuracy_score(y_test, predictions))

In [None]:
# Comparison table built from a copy of the test features (X_test itself is
# left untouched), with two extra columns:
#   'Actual'      - the true labels from y_test
#   'Predictions' - what the model predicted for the same rows
df = X_test.assign(Actual=y_test, Predictions=predictions)

In [None]:
# Show actual vs. predicted labels side by side as a table
df

### Examine the decision tree that scikit-learn has built for this model

In [None]:
# Render the fitted tree, passing the feature column names of X.
# NOTE(review): presumably the names label the split nodes; the behaviour is defined by viewDecisionTree in the local `functions` module — confirm.
viewDecisionTree(model, X.columns)