1. Data Exploration and Understanding:

In [None]:
import pandas as pd

# Load the dataset
# Placeholder location of the CSV; replace with the actual path to your dataset.
file_path = 'your_dataset_path.csv'
data = pd.read_csv(file_path)

# Preview: heading followed by the first 10 rows
print("First 10 rows of the dataset:", data.head(10), sep="\n")

# Dataset Overview
# 1. Size of the table (row count from the index, column count from the header)
rows = len(data.index)
columns = len(data.columns)
print(f"\nNumber of rows: {rows}, Number of columns: {columns}")

# 2. Column names together with the dtype pandas inferred for each
print("\nColumns and data types:", data.dtypes, sep="\n")

# Basic Statistics
# 1. Mean, median and standard deviation of the 'score' column
scores = data['score']
mean_score, median_score, std_dev_score = scores.mean(), scores.median(), scores.std()
print(f"\nMean score: {mean_score}, Median score: {median_score}, Standard deviation: {std_dev_score}")

# 2. Rows holding the largest and smallest 'score' (first occurrence on ties)
name_and_score = ['Country name', 'score']
highest_score_country = data.loc[scores.idxmax(), name_and_score]
lowest_score_country = data.loc[scores.idxmin(), name_and_score]
print(f"\nCountry with the highest score: {highest_score_country['Country name']} ({highest_score_country['score']})")
print(f"Country with the lowest score: {lowest_score_country['Country name']} ({lowest_score_country['score']})")

# Missing Values
# 1. Count the null entries in every column
missing_values = data.isna().sum()
print("\nMissing values in each column:", missing_values, sep="\n")

# Filtering and Sorting
# 1. Keep only the rows whose score is strictly above 7.5
above_threshold = data['score'] > 7.5
filtered_data = data[above_threshold]

# 2. Order those rows by 'Log GDP per capita', largest first, and keep at most ten
by_gdp_descending = filtered_data.sort_values(by='Log GDP per capita', ascending=False)
sorted_data = by_gdp_descending.iloc[:10]
print("\nTop 10 countries with score > 7.5 sorted by GDP per Capita:")
print(sorted_data)

# Adding New Columns
# 1. Create 'Happiness Category' column
#    Low:    score < 4
#    Medium: 4 <= score <= 6   (both ends inclusive)
#    High:   score > 6
# The three masks are mutually exclusive; a missing score matches none of them
# and is left without a category.
conditions = [
    (data['score'] < 4),
    (data['score'] >= 4) & (data['score'] <= 6),
    (data['score'] > 6)
]
categories = ['Low', 'Medium', 'High']

# Apply the masks directly. Half-open bins from pd.cut cannot express the closed
# [4, 6] interval: with right=False a score of exactly 6 would land in 'High'.
happiness_category = pd.Series(None, index=data.index, dtype='object')
for condition, category in zip(conditions, categories):
    happiness_category[condition] = category

# Keep an ordered categorical (Low < Medium < High), the dtype pd.cut would produce.
data['Happiness Category'] = pd.Categorical(happiness_category, categories=categories, ordered=True)
print("\nUpdated dataset with 'Happiness Category':")
print(data.head())

# Data Visualizations
import matplotlib.pyplot as plt

# Bar Plot: Top 10 happiest countries
top_10_happiest = data.sort_values(by='score', ascending=False).head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_10_happiest['Country name'], top_10_happiest['score'], color='skyblue')
ax.set_title("Top 10 Happiest Countries by Score")
# Slant the country names so long labels do not collide
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("Country")
ax.set_ylabel("Happiness Score")
plt.show()

# Line Plot: Top 10 unhappiest countries
top_10_unhappiest = data.sort_values(by='score', ascending=True).head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(top_10_unhappiest['Country name'], top_10_unhappiest['score'], marker='o', color='red')
ax.set_title("Top 10 Unhappiest Countries by Score")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("Country")
ax.set_ylabel("Happiness Score")
plt.show()

# Histogram: Distribution of Scores
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(data['score'], bins=20, color='purple', alpha=0.7)
ax.set_title("Distribution of Happiness Scores")
ax.set_xlabel("Score")
ax.set_ylabel("Frequency")
plt.show()

# Scatter Plot: GDP per Capita vs. Score
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(data['Log GDP per capita'], data['score'], color='green', alpha=0.5)
ax.set_title("GDP per Capita vs. Happiness Score")
ax.set_xlabel("Log GDP per Capita")
ax.set_ylabel("Happiness Score")
plt.show()


Task 1: Setup Task - Preparing the South-Asia Dataset

In [None]:
import pandas as pd

# Step 1: Define South Asian countries
south_asian_countries = [
    "Afghanistan", "Bangladesh", "Bhutan", "India",
    "Maldives", "Nepal", "Pakistan", "Sri Lanka"
]

# Step 2: Filter the dataset for South Asian countries.
# .copy() makes the subset an independent frame: later cells add columns to
# south_asia_df, and writing into a boolean-filtered selection of `data`
# raises pandas' SettingWithCopyWarning.
is_south_asian = data['Country name'].isin(south_asian_countries)
south_asia_df = data[is_south_asian].copy()

# Step 3: Save the filtered dataframe as a separate CSV file (row index omitted)
south_asia_df.to_csv('South_Asia_Dataset.csv', index=False)
print("South Asia dataset saved as 'South_Asia_Dataset.csv'")


Task 2: Composite Score Ranking


In [None]:
# Step 1: Create the Composite Score column
# Weight given to each metric in the composite (the weights sum to 1.0).
composite_weights = {
    'Log GDP per capita': 0.40,
    'Social support': 0.30,
    'Healthy life expectancy': 0.30,
}
composite_score = sum(
    weight * south_asia_df[column] for column, weight in composite_weights.items()
)
# .assign() returns a new frame rather than writing into the filtered selection
# of `data`, so no SettingWithCopyWarning is raised and re-running the cell is safe.
south_asia_df = south_asia_df.assign(**{'Composite Score': composite_score})

# Step 2: Rank South Asian countries by Composite Score (highest first)
south_asia_df = south_asia_df.sort_values(by='Composite Score', ascending=False)

# Step 3: Visualize the top 5 countries by Composite Score
import matplotlib.pyplot as plt

top_5_composite = south_asia_df.head(5)
plt.figure(figsize=(8, 6))
plt.barh(top_5_composite['Country name'], top_5_composite['Composite Score'], color='skyblue')
plt.title("Top 5 South Asian Countries by Composite Score")
plt.xlabel("Composite Score")
plt.ylabel("Country")
# barh draws the first row at the bottom; flip so the best-ranked country is on top
plt.gca().invert_yaxis()
plt.show()

# Step 4: Compare rankings with original Score
plt.figure(figsize=(8, 6))
plt.scatter(south_asia_df['score'], south_asia_df['Composite Score'], color='purple', alpha=0.7)
plt.title("Original Score vs Composite Score")
plt.xlabel("Original Score")
plt.ylabel("Composite Score")
plt.show()


Task 3: Outlier Detection

In [None]:
# Step 1: Identify outliers based on Score and GDP per Capita using 1.5×IQR rule
def find_outliers(series):
    """Flag the values of ``series`` that fall outside its 1.5*IQR fences.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.

    Returns
    -------
    pandas.Series
        Boolean series on the same index; True where the value is strictly
        below ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    interquartile_range = third_quartile - first_quartile

    too_low = series < (first_quartile - 1.5 * interquartile_range)
    too_high = series > (third_quartile + 1.5 * interquartile_range)
    return too_low | too_high

# Flag each row once per metric using the 1.5*IQR rule
south_asia_df['Outlier_Score'] = find_outliers(south_asia_df['score'])
south_asia_df['Outlier_GDP'] = find_outliers(south_asia_df['Log GDP per capita'])

# Step 2: Scatter plot highlighting outliers
# Split the rows so every point is drawn under the legend entry that describes it:
# the base layer holds only rows flagged by neither rule.
score_outliers = south_asia_df[south_asia_df['Outlier_Score']]
gdp_outliers = south_asia_df[south_asia_df['Outlier_GDP']]
normal_points = south_asia_df[~(south_asia_df['Outlier_Score'] | south_asia_df['Outlier_GDP'])]

plt.figure(figsize=(8, 6))
plt.scatter(normal_points['Log GDP per capita'], normal_points['score'], label='Normal Points')
plt.scatter(
    score_outliers['Log GDP per capita'], score_outliers['score'],
    color='red', label='Outlier (Score)'
)
# GDP outliers are drawn as hollow rings so a row flagged by both rules
# shows the red marker inside the ring instead of being hidden.
plt.scatter(
    gdp_outliers['Log GDP per capita'], gdp_outliers['score'],
    facecolors='none', edgecolors='orange', s=150, linewidths=2, label='Outlier (GDP)'
)
plt.title("GDP per Capita vs Score with Outliers Highlighted")
plt.xlabel("Log GDP per Capita")
plt.ylabel("Score")
plt.legend()
plt.show()


Task 4: Exploring Trends Across Metrics

In [None]:
# Step 1: Pearson correlation of two chosen metrics against 'score'
metric_1 = 'Freedom to make life choices'
metric_2 = 'Generosity'
south_asia_scores = south_asia_df['score']
correlation_1 = south_asia_df[metric_1].corr(south_asia_scores)
correlation_2 = south_asia_df[metric_2].corr(south_asia_scores)

print(f"Correlation between {metric_1} and Score: {correlation_1}")
print(f"Correlation between {metric_2} and Score: {correlation_2}")

# Step 2: One scatter plot per metric, each with a fitted regression line
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(data=south_asia_df, x=metric_1, y='score', color='blue', ax=ax)
ax.set_title(f"{metric_1} vs Score")
plt.show()

fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(data=south_asia_df, x=metric_2, y='score', color='green', ax=ax)
ax.set_title(f"{metric_2} vs Score")
plt.show()


Task 5: Gap Analysis

In [None]:
# Step 1: Add the GDP-Score Gap column ('Log GDP per capita' minus 'score')
south_asia_df['GDP-Score Gap'] = south_asia_df['Log GDP per capita'].sub(south_asia_df['score'])

# Step 2: Take the three smallest and the three largest gaps
gap_low_to_high = south_asia_df.sort_values(by='GDP-Score Gap', ascending=True)
gap_high_to_low = south_asia_df.sort_values(by='GDP-Score Gap', ascending=False)
ascending_gap = gap_low_to_high.head(3)
descending_gap = gap_high_to_low.head(3)

# Step 3: Draw both groups on one axes (red = smallest gaps, green = largest gaps)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(ascending_gap['Country name'], ascending_gap['GDP-Score Gap'], color='red', label='Negative Gap')
ax.bar(descending_gap['Country name'], descending_gap['GDP-Score Gap'], color='green', label='Positive Gap')
ax.set_title("Top 3 Countries by GDP-Score Gap")
ax.set_ylabel("GDP-Score Gap")
ax.legend()
plt.show()


Task 1: Setup Task - Preparing the Middle Eastern Dataset

In [None]:
# Countries treated as the Middle East in this analysis.
# NOTE(review): rows are matched on the exact 'Country name' spelling used in the
# dataset — confirm each entry appears there under this name.
middle_east_countries = [
    "Bahrain",
    "Iran",
    "Iraq",
    "Israel",
    "Jordan",
    "Kuwait",
    "Lebanon",
    "Oman",
    "Palestine",
    "Qatar",
    "Saudi Arabia",
    "Syria",
    "United Arab Emirates",
    "Yemen",
]

# Keep only the rows whose country is in the list above
is_middle_eastern = data['Country name'].isin(middle_east_countries)
middle_east_df = data[is_middle_eastern]

# Write the subset to disk without the row index
middle_east_df.to_csv('Middle_East_Dataset.csv', index=False)
print("Middle East dataset saved as 'Middle_East_Dataset.csv'")


1. Descriptive Statistics

In [None]:
# Mean and standard deviation of 'score' for each region
south_asia_scores = south_asia_df['score']
middle_east_scores = middle_east_df['score']

south_asia_stats = dict(mean=south_asia_scores.mean(), std_dev=south_asia_scores.std())
middle_east_stats = dict(mean=middle_east_scores.mean(), std_dev=middle_east_scores.std())

print(f"South Asia - Mean: {south_asia_stats['mean']}, Std Dev: {south_asia_stats['std_dev']}")
print(f"Middle East - Mean: {middle_east_stats['mean']}, Std Dev: {middle_east_stats['std_dev']}")

# Pick the region with the strictly larger mean; anything else falls to "Middle East"
if south_asia_stats['mean'] > middle_east_stats['mean']:
    higher_region = "South Asia"
else:
    higher_region = "Middle East"
print(f"The region with higher average happiness scores is: {higher_region}")


2. Top and Bottom Performers

In [None]:
# Three highest- and three lowest-scoring countries per region
south_asia_top3 = south_asia_df.nlargest(3, 'score')
south_asia_bottom3 = south_asia_df.nsmallest(3, 'score')
middle_east_top3 = middle_east_df.nlargest(3, 'score')
middle_east_bottom3 = middle_east_df.nsmallest(3, 'score')

# One bar chart per region, top and bottom groups sharing the axes
import matplotlib.pyplot as plt

# South Asia
fig, ax = plt.subplots(figsize=(10, 6))
for subset, bar_color, legend_label in (
    (south_asia_top3, 'green', 'Top 3'),
    (south_asia_bottom3, 'red', 'Bottom 3'),
):
    ax.bar(subset['Country name'], subset['score'], color=bar_color, label=legend_label)
ax.set_title("Top and Bottom Performers in South Asia")
ax.set_ylabel("Happiness Score")
ax.legend()
plt.show()

# Middle East
fig, ax = plt.subplots(figsize=(10, 6))
for subset, bar_color, legend_label in (
    (middle_east_top3, 'blue', 'Top 3'),
    (middle_east_bottom3, 'orange', 'Bottom 3'),
):
    ax.bar(subset['Country name'], subset['score'], color=bar_color, label=legend_label)
ax.set_title("Top and Bottom Performers in the Middle East")
ax.set_ylabel("Happiness Score")
ax.legend()
plt.show()


3. Metric Comparisons

In [None]:
# Metrics compared between the two regions
metrics = ['Log GDP per capita', 'Social support', 'Healthy life expectancy']

# Per-region mean of each metric, placed side by side (rows = metrics, columns = regions)
south_asia_means = south_asia_df[metrics].mean().rename('South Asia')
middle_east_means = middle_east_df[metrics].mean().rename('Middle East')
metric_comparison = pd.concat([south_asia_means, middle_east_means], axis=1)

# Grouped bar chart: one group per metric, one bar per region
ax = metric_comparison.plot.bar(figsize=(10, 6))
ax.set_title("Comparison of Key Metrics Between South Asia and Middle East")
ax.set_ylabel("Mean Values")
# Keep the metric names horizontal
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()


4. Happiness Disparity

In [None]:
# Compute range and coefficient of variation (CV)
def compute_range_and_cv(df, column):
    """Return the range and the coefficient of variation of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the numeric column to summarise.

    Returns
    -------
    tuple
        ``(range, cv)`` where range is ``max - min`` and cv is the standard
        deviation divided by the mean.
    """
    values = df[column]
    spread = values.max() - values.min()
    relative_dispersion = values.std() / values.mean()
    return spread, relative_dispersion

# Range and coefficient of variation of 'score' for each region
south_asia_range, south_asia_cv = compute_range_and_cv(df=south_asia_df, column='score')
middle_east_range, middle_east_cv = compute_range_and_cv(df=middle_east_df, column='score')

print(f"South Asia - Range: {south_asia_range}, CV: {south_asia_cv}")
print(f"Middle East - Range: {middle_east_range}, CV: {middle_east_cv}")

# The region with the strictly larger CV wins; anything else falls to "Middle East"
if south_asia_cv > middle_east_cv:
    greater_variability = "South Asia"
else:
    greater_variability = "Middle East"
print(f"The region with greater variability in happiness is: {greater_variability}")


5. Correlation Analysis

In [None]:
# Metrics whose correlation with 'score' is examined
metrics = ['Freedom to make life choices', 'Generosity']

# Take the 'score' column of each region's correlation matrix
columns_of_interest = [*metrics, 'score']
south_asia_corr = south_asia_df[columns_of_interest].corr()['score']
middle_east_corr = middle_east_df[columns_of_interest].corr()['score']

print("South Asia Correlations:", south_asia_corr, sep="\n")

print("Middle East Correlations:", middle_east_corr, sep="\n")

# Scatter plots with trendlines
import seaborn as sns

# One figure per metric, overlaying both regions with their own fitted line
for metric in metrics:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(data=south_asia_df, x=metric, y='score', label='South Asia', color='green', ax=ax)
    sns.regplot(data=middle_east_df, x=metric, y='score', label='Middle East', color='blue', ax=ax)
    ax.set_title(f"Score vs {metric}")
    ax.legend()
    plt.show()


6. Outlier Detection

In [None]:
# Define a function to detect outliers
def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the 1.5*IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column the rule is applied to.

    Returns
    -------
    pandas.DataFrame
        The rows (all columns, original index labels) whose value is strictly
        below ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    interquartile_range = third_quartile - first_quartile

    below_fence = values < (first_quartile - 1.5 * interquartile_range)
    above_fence = values > (third_quartile + 1.5 * interquartile_range)
    return df[below_fence | above_fence]

# Rows of each region whose 'score' falls outside the 1.5*IQR fences
south_asia_outliers = detect_outliers(df=south_asia_df, column='score')
middle_east_outliers = detect_outliers(df=middle_east_df, column='score')

# Show only the identifying column and the score for each flagged row
name_and_score = ['Country name', 'score']

print("South Asia Outliers:", south_asia_outliers[name_and_score], sep="\n")

print("Middle East Outliers:", middle_east_outliers[name_and_score], sep="\n")


7. Visualization: Boxplots

In [None]:
# Side-by-side boxplots of 'score': position 0 = South Asia, position 1 = Middle East
regional_scores = [south_asia_df['score'], middle_east_df['score']]
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=regional_scores, palette='pastel', ax=ax)
# Replace the positional tick labels with the region names
ax.set_xticks([0, 1])
ax.set_xticklabels(['South Asia', 'Middle East'])
ax.set_title("Distribution of Happiness Scores Between Regions")
ax.set_ylabel("Score")
plt.show()
