# Life Expectancy Data Science Project

This project explores global life expectancy trends, identifies key influencing factors, and applies machine learning and clustering techniques to gain insights. The dataset consists of health, economic, and social indicators for various countries over time.

## Load and Explore the Dataset

In [None]:

import pandas as pd

# Read the raw CSV; the path is relative to the working directory.
file_path = 'Life Expectancy Data.csv'
data = pd.read_csv(file_path)

# Normalise every header: trim surrounding whitespace, turn each space into
# an underscore (so a double space yields a double underscore), lower-case.
data = data.rename(columns=lambda name: name.strip().replace(' ', '_').lower())

# Preview the first rows (rendered as the cell output).
data.head()


## Step 1: Global Life Expectancy Trend Analysis

In [None]:

import matplotlib.pyplot as plt

# Mean life expectancy across all rows of each year, as a two-column frame
# (year, life_expectancy).
global_trend = data.groupby('year', as_index=False)['life_expectancy'].mean()

# Draw the yearly averages as a single marked line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    global_trend['year'],
    global_trend['life_expectancy'],
    marker='o',
    linestyle='-',
)
ax.set_title('Global Life Expectancy Trend')
ax.set_xlabel('Year')
ax.set_ylabel('Average Life Expectancy')
ax.grid(True)
plt.show()


## Step 2: Machine Learning to Predict Life Expectancy

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Predictor columns. Names follow the cleaned headers produced when the
# dataset was loaded (spaces -> underscores, lower-cased).
features = [
    'adult_mortality', 'infant_deaths', 'alcohol', 'percentage_expenditure',
    'hepatitis_b', 'measles', 'polio', 'total_expenditure', 'diphtheria',
    'hiv/aids', 'gdp', 'population', 'thinness__1-19_years', 'thinness_5-9_years',
    'income_composition_of_resources', 'schooling'
]

# Keep only rows where every predictor and the target are present.
ml_data = data[features + ['life_expectancy']].dropna()
X = ml_data[features]
y = ml_data['life_expectancy']

# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# held-out rows influence the mean/variance used to preprocess the training
# rows (data leakage), making the evaluation below optimistic in general.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under the training-fitted transform; retained under its
# original name in case other cells read X_scaled.
X_scaled = scaler.transform(X)

# Train a Random Forest Regressor (seeded for reproducibility).
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Feature importances reported by the fitted forest, largest first.
importances = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

importances


## Step 3: Clustering and Segmentation

In [None]:

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Local import so this cell owns its scaler instead of borrowing (and
# refitting) the `scaler` object created for the regression model.
from sklearn.preprocessing import StandardScaler

# Indicators used for clustering: mortality / HIV / thinness measures plus
# two socio-economic columns (income composition, schooling).
cluster_features = [
    'adult_mortality', 'infant_deaths', 'hiv/aids', 'thinness__1-19_years',
    'thinness_5-9_years', 'income_composition_of_resources', 'schooling'
]

# Single source of truth for the cluster count (used by KMeans and the plot).
n_clusters = 4

# Keep only rows with all clustering features present, then standardise.
# A dedicated scaler is used: refitting the shared `scaler` would overwrite
# the statistics fitted for the regression model.
cluster_data = data[cluster_features].dropna()
cluster_scaler = StandardScaler()
cluster_scaled = cluster_scaler.fit_transform(cluster_data)

# Perform KMeans clustering. n_init is set explicitly so the result does not
# depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(cluster_scaled)

# Project the standardised features onto two principal components purely
# for plotting; clustering itself was done in the full feature space.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(cluster_scaled)

# Attach the cluster label of each row to its 2-D projection.
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = clusters

# One scatter series per cluster. The per-cluster subset gets its own name
# so the `cluster_data` frame above is not overwritten by the loop.
plt.figure(figsize=(10, 8))
for cluster in range(n_clusters):
    cluster_points = pca_df[pca_df['Cluster'] == cluster]
    plt.scatter(cluster_points['PCA1'], cluster_points['PCA2'], label=f'Cluster {cluster}')

plt.title('Clustering of Countries Based on Health Indicators')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(title='Cluster')
plt.grid(True)
plt.show()


## Step 4: Causal Inference - Impact of Schooling and Healthcare

In [None]:

import statsmodels.api as sm

# Regress life_expectancy on the percentage_expenditure and schooling columns,
# using only rows where all three values are present.
causal_features = ['percentage_expenditure', 'schooling']
causal_data = data[[*causal_features, 'life_expectancy']].dropna()

# Response vector, and the design matrix with an added constant column so the
# fitted model includes an intercept.
y_causal = causal_data['life_expectancy']
X_causal = sm.add_constant(causal_data[causal_features])

# Ordinary least squares fit; print the full coefficient table.
causal_model = sm.OLS(endog=y_causal, exog=X_causal).fit()
print(causal_model.summary())


## Step 5: Regional and Temporal Trends

In [None]:

# Assign regions and analyze trends
# Region label -> member countries. Defined once at module level so it is not
# rebuilt for every row when assign_region is applied to a whole column.
_REGION_MEMBERS = {
    'Developed': ['United States', 'Germany', 'Japan', 'Australia', 'Canada'],
    'Asia': ['India', 'China', 'Afghanistan', 'Pakistan', 'Bangladesh'],
    'Africa': ['Nigeria', 'South Africa', 'Kenya', 'Ethiopia', 'Ghana'],
    'Europe': ['France', 'Italy', 'Spain', 'United Kingdom', 'Russia'],
    'Latin America': ['Brazil', 'Mexico', 'Argentina', 'Colombia', 'Chile'],
    'Middle East': ['Saudi Arabia', 'Iran', 'Iraq', 'Turkey', 'Israel']
}

# Inverted table for O(1) lookup. setdefault keeps the first region listed
# should a country ever appear under two regions (first match wins).
_COUNTRY_TO_REGION = {}
for _region, _countries in _REGION_MEMBERS.items():
    for _country in _countries:
        _COUNTRY_TO_REGION.setdefault(_country, _region)


def assign_region(country: object) -> str:
    """Return the region label for *country*, or ``'Other'`` if unmapped.

    Args:
        country: Country name. Leading/trailing whitespace is ignored.
            Non-string values (e.g. ``None`` or NaN from a missing cell)
            are treated as unmapped.

    Returns:
        One of ``'Developed'``, ``'Asia'``, ``'Africa'``, ``'Europe'``,
        ``'Latin America'``, ``'Middle East'``, or ``'Other'``.
    """
    # NOTE(review): matching is exact and case-sensitive; if the dataset uses
    # longer official names, those rows land in 'Other' — confirm against
    # data['country'].unique().
    if not isinstance(country, str):
        return 'Other'
    return _COUNTRY_TO_REGION.get(country.strip(), 'Other')

# Label every row with its region, then average life expectancy per
# (year, region) pair.
data['region'] = data['country'].map(assign_region)
regional_trends = data.groupby(['year', 'region'], as_index=False)['life_expectancy'].mean()

# One line per region, drawn in order of first appearance in regional_trends.
plt.figure(figsize=(14, 8))
for region, region_data in regional_trends.groupby('region', sort=False):
    plt.plot(region_data['year'], region_data['life_expectancy'], label=region)

plt.title('Life Expectancy Trends by Region')
plt.xlabel('Year')
plt.ylabel('Life Expectancy')
plt.legend(title='Region')
plt.grid(True)
plt.show()
