In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, learning_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import joblib

# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')
# Use matplotlib's stock style sheet rather than a seaborn one.
plt.style.use('default')


In [None]:
# Country -> region lookup; any country missing from this table is labelled 'Other'.
REGION_MAPPING = {
    'Algeria': 'North Africa',
    'Benin': 'West Africa',
    'Botswana': 'Southern Africa',
    'Burkina Faso': 'West Africa',
    'Burundi': 'East Africa',
    'Cameroon': 'Central Africa',
    'Central African Republic': 'Central Africa',
    'Chad': 'Central Africa',
    'Eswatini': 'Southern Africa',
    'Ethiopia': 'East Africa',
    'Gabon': 'Central Africa',
    'Ghana': 'West Africa',
    'Kenya': 'East Africa',
    'Lesotho': 'Southern Africa',
    'Liberia': 'West Africa',
    'Libya': 'North Africa',
    'Madagascar': 'East Africa',
    'Mauritius': 'East Africa',
    'Morocco': 'North Africa',
    'Niger': 'West Africa',
    'Nigeria': 'West Africa',
    'Rwanda': 'East Africa',
    'Senegal': 'West Africa',
    'Seychelles': 'East Africa',
    'Sierra Leone': 'West Africa',
    'Somalia': 'East Africa',
    'South Africa': 'Southern Africa',
    'Sudan': 'North Africa',
    'Tanzania': 'East Africa',
    'Togo': 'West Africa',
    'Uganda': 'East Africa',
    'Zambia': 'Southern Africa',
    'Zimbabwe': 'Southern Africa',
}

# Read the wide table (a 'Year' column plus one column per country) and
# reshape it to long form: one row per (Year, Country) with GDP_Growth.
gdp_wide = pd.read_csv('gdp-growth-of-african-countries.csv')
df = gdp_wide.melt(id_vars=['Year'], var_name='Country', value_name='GDP_Growth')

# Feature engineering: decade bucket, post-2000 indicator, and region label.
df = df.assign(
    Decade=lambda frame: frame['Year'].floordiv(10).mul(10),
    Post_2000=lambda frame: frame['Year'].gt(2000).astype(int),
    Region=lambda frame: [REGION_MAPPING.get(country, 'Other') for country in frame['Country']],
)


In [None]:
# Data Visualization
# Side-by-side box plots: growth spread per region (left) and per decade (right).
plt.figure(figsize=(15, 5))

region_ax = plt.subplot(1, 2, 1)
sns.boxplot(data=df, x='Region', y='GDP_Growth', ax=region_ax)
region_ax.set_title('GDP Growth by Region')
plt.xticks(rotation=45)  # tilt the region labels on the left panel

decade_ax = plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='Decade', y='GDP_Growth', ax=decade_ax)
decade_ax.set_title('GDP Growth by Decade')

plt.tight_layout()
plt.show()


In [None]:
class CategoryEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the country and region columns with a fixed column layout.

    Every other column of the input frame is passed through unchanged. The
    indicator columns are named ``country_<value>`` / ``region_<value>`` and
    are always emitted in the same order, so frames produced from different
    subsets of the data (for example a train and a test split) line up column
    for column even when one subset is missing a category.

    Parameters
    ----------
    country_col : str, default='Country'
        Name of the column holding country labels.
    region_col : str, default='Region'
        Name of the column holding region labels.

    Attributes
    ----------
    countries : list
        Sorted unique values of the notebook-level ``df['Country']``.
    regions : list
        Sorted unique values of the notebook-level ``REGION_MAPPING``.
    country_columns_, region_columns_ : list of str
        Set by ``fit``. Ordered indicator column names: the known categories
        above plus any additional category observed in the fitted frame.
    """

    def __init__(self, country_col='Country', region_col='Region'):
        self.country_col = country_col
        self.region_col = region_col
        # NOTE: the category vocabularies are read from the notebook-level
        # ``df`` and ``REGION_MAPPING`` objects, so both must already exist
        # when an encoder is constructed.
        self.countries = sorted(df['Country'].unique().tolist())
        self.regions = sorted(list(set(REGION_MAPPING.values())))

    @staticmethod
    def _dummy_columns(prefix, known, observed=()):
        """Return sorted ``<prefix>_<label>`` names for known plus observed labels."""
        labels = set(known)
        labels.update(value for value in observed if pd.notna(value))
        # key=str keeps the sort well defined even if labels are not all strings.
        return [f'{prefix}_{label}' for label in sorted(labels, key=str)]

    def fit(self, X, y=None):
        """Record the output column layout.

        The layout is the known vocabulary extended with any category present
        in ``X`` (e.g. a fallback region label that is not a value of
        ``REGION_MAPPING``), so no category seen during fitting is dropped.

        Returns
        -------
        self
        """
        self.country_columns_ = self._dummy_columns(
            'country', self.countries, X[self.country_col].unique())
        self.region_columns_ = self._dummy_columns(
            'region', self.regions, X[self.region_col].unique())
        return self

    def _encode(self, values, prefix, fitted_attr, known):
        """One-hot encode ``values`` and align the result to the stored layout."""
        columns = getattr(self, fitted_attr, None)
        if columns is None:
            # transform() called without fit(): fall back to the known vocabulary.
            columns = self._dummy_columns(prefix, known)
        dummies = pd.get_dummies(values, prefix=prefix, dtype=int)
        # reindex adds absent categories as all-zero columns, drops categories
        # outside the layout, and enforces one column order for every call.
        return dummies.reindex(columns=columns, fill_value=0)

    def transform(self, X):
        """Return ``X`` with the two categorical columns replaced by indicators.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``country_col`` and ``region_col``.

        Returns
        -------
        pandas.DataFrame
            Pass-through columns, then country indicators, then region
            indicators (integer 0/1), indexed like ``X``. ``X`` is not modified.
        """
        passthrough = X.drop([self.country_col, self.region_col], axis=1)
        country_dummies = self._encode(
            X[self.country_col], 'country', 'country_columns_', self.countries)
        region_dummies = self._encode(
            X[self.region_col], 'region', 'region_columns_', self.regions)
        return pd.concat([passthrough, country_dummies, region_dummies], axis=1)


In [None]:
# Prepare data for modeling
# Keep only rows with a known target. The explicit copy makes df_clean an
# independent frame, so in-place edits to it neither raise
# SettingWithCopyWarning nor risk writing through to df.
df_clean = df.dropna(subset=['GDP_Growth']).copy()
features = ['Year', 'Country', 'Region', 'Decade', 'Post_2000']
X = df_clean[features]
y = df_clean['GDP_Growth']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipeline
# Fit the encoder on the training split only, then reuse it on the test split.
preprocessor = CategoryEncoder()
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [None]:
# Train and evaluate models
# Candidate regressors, keyed by the label used in printed reports and charts.
models = {
    'Linear Regression': LinearRegression(),
    'SGD Regression': SGDRegressor(max_iter=10000, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

results = {}
best_model, best_score = None, float('-inf')

for name, model in models.items():
    # Fit on the training split, then score on the held-out split.
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_test_processed)
    score, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

    results[name] = {'r2': score, 'mse': mse, 'model': model}
    print(f"\n{name}:", f"R² Score: {score:.4f}", f"MSE: {mse:.4f}", sep='\n')

    # Strictly-greater comparison: on a tie the earlier model is kept.
    if score > best_score:
        best_model, best_score = model, score


In [None]:
# Save the best model
# The winning estimator and the fitted encoder go to separate pickle files.
for artifact, target_path in (
    (best_model, 'best_gdp_growth_model.pkl'),
    (preprocessor, 'gdp_growth_preprocessor.pkl'),
):
    joblib.dump(artifact, target_path)

# Plot model comparison
# One bar per model showing its held-out R².
model_names = list(results)
r2_scores = [entry['r2'] for entry in results.values()]

plt.figure(figsize=(10, 5))
plt.bar(model_names, r2_scores)
plt.ylabel('R² Score')
plt.title('Model Performance Comparison')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Enhanced Data Visualization and Analysis
# Pairwise correlations between the numeric columns, used by the heat-map panel.
numeric_cols = ['Year', 'Decade', 'Post_2000', 'GDP_Growth']
correlation_matrix = df.loc[:, numeric_cols].corr()

plt.figure(figsize=(20, 10))

# Top-left: histogram of the target.
dist_ax = plt.subplot(2, 2, 1)
sns.histplot(data=df, x='GDP_Growth', bins=30, ax=dist_ax)
dist_ax.set_title('GDP Growth Distribution')

# Top-right: growth over time, one line per region.
trend_ax = plt.subplot(2, 2, 2)
sns.lineplot(data=df, x='Year', y='GDP_Growth', hue='Region', ax=trend_ax)
trend_ax.set_title('GDP Growth Trends by Region')

# Bottom-left: annotated correlation heat map.
corr_ax = plt.subplot(2, 2, 3)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Feature Correlations')

# Bottom-right: growth spread per region, with tilted tick labels.
box_ax = plt.subplot(2, 2, 4)
sns.boxplot(data=df, x='Region', y='GDP_Growth', ax=box_ax)
plt.xticks(rotation=45)
box_ax.set_title('Growth Distribution by Region')

plt.tight_layout()
plt.show()


In [None]:
# Feature Standardization
# Standardize the continuous columns of the matrices the models are trained
# and evaluated on. The scaler is fitted on the training split only, so no
# statistics from the held-out rows leak into it, and the same fitted scaler
# is applied to the test split.
# Both processed frames are rewritten from the untouched X_train / X_test
# frames, so re-running this cell yields the same values and the same scaler
# parameters instead of scaling already-scaled data.
numeric_features = ['Year', 'Decade']
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])
X_train_processed[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_processed[numeric_features] = scaler.transform(X_test[numeric_features])

# Feature Importance Analysis using Random Forest
# A dedicated forest is fitted on the processed training matrix purely to
# read off its impurity-based importances.
rf_analyzer = RandomForestRegressor(n_estimators=100, random_state=42)
rf_analyzer.fit(X_train_processed, y_train)

# Pair each processed column name with its importance, largest first.
feature_names = X_train_processed.columns.to_list()
feature_importance = pd.DataFrame(
    {'feature': feature_names, 'importance': rf_analyzer.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bars for the 20 highest-ranked features.
top_features = feature_importance.head(20)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.show()


In [None]:
# Enhanced Model Training with Cross-validation
def train_with_cv(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and summarise the R² scores.

    Parameters
    ----------
    model : estimator
        Passed straight to ``cross_val_score``.
    X, y : array-like
        Features and target.
    cv : int or CV splitter, default=5
        Forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    dict
        ``'mean_score'`` and ``'std_score'`` of the per-fold R² values, plus
        the raw per-fold values under ``'all_scores'``.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
    summary = dict(mean_score=fold_scores.mean(), std_score=fold_scores.std())
    summary['all_scores'] = fold_scores
    return summary

# Training with learning curves
def plot_learning_curves(model, X, y):
    """Plot mean training and cross-validation R² against training-set size.

    ``learning_curve`` is run with 5-fold CV on ten training-set fractions
    evenly spaced from 10% to 100%, using all available cores. Each curve is
    the per-size mean over folds, with a shaded band of one standard deviation
    either side. The figure is shown immediately; nothing is returned.

    Parameters
    ----------
    model : estimator
        Passed straight to ``learning_curve``.
    X, y : array-like
        Features and target.
    """
    sizes, fit_scores, val_scores = learning_curve(
        model, X, y,
        cv=5,
        scoring='r2',
        train_sizes=np.linspace(0.1, 1.0, 10),
        n_jobs=-1,
    )

    # Per-size (mean, std) over the folds, keyed by legend label.
    curves = {
        'Training score': fit_scores,
        'Cross-validation score': val_scores,
    }
    stats = {
        label: (np.mean(scores, axis=1), np.std(scores, axis=1))
        for label, scores in curves.items()
    }

    plt.figure(figsize=(10, 6))
    for label, (center, _) in stats.items():
        plt.plot(sizes, center, label=label)
    for center, spread in stats.values():
        plt.fill_between(sizes, center - spread, center + spread, alpha=0.1)

    plt.xlabel('Training Examples')
    plt.ylabel('R² Score')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()


In [None]:
# Train and evaluate models with enhanced metrics
# Start from a clean slate: earlier results and the earlier winner are discarded.
results = {}
best_model, best_score = None, float('-inf')

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Refit on the training split and predict both splits.
    model.fit(X_train_processed, y_train)
    y_pred_train = model.predict(X_train_processed)
    y_pred_test = model.predict(X_test_processed)

    # R² and MSE on each split.
    train_score, test_score = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)

    # Cross-validated R² on the training split.
    cv_results = train_with_cv(model, X_train_processed, y_train)

    results[name] = dict(
        model=model,
        train_r2=train_score,
        test_r2=test_score,
        train_mse=train_mse,
        test_mse=test_mse,
        cv_mean=cv_results['mean_score'],
        cv_std=cv_results['std_score'],
        predictions=y_pred_test,
    )

    print(
        f"Training R²: {train_score:.4f}",
        f"Test R²: {test_score:.4f}",
        f"Cross-val R²: {cv_results['mean_score']:.4f} (±{cv_results['std_score']:.4f})",
        f"MSE: {test_mse:.4f}",
        sep='\n',
    )

    plot_learning_curves(model, X_train_processed, y_train)

    # The winner is the model with the highest held-out R²; ties keep the earlier one.
    if test_score > best_score:
        best_model, best_score = model, test_score


In [None]:
# Visualization of model predictions
# One predicted-vs-actual panel per model, two panels per row. The row count
# follows the number of stored results, so the grid still fits if the set of
# models grows beyond four.
n_cols = 2
n_rows = max(1, -(-len(results) // n_cols))  # ceiling division
plt.figure(figsize=(20, 5 * n_rows))

# Both reference lines are straight, so two endpoints define them completely.
# Drawing them through every unsorted test point instead makes the dashed
# path double back on itself and smears the dash pattern.
actual_range = np.array([y_test.min(), y_test.max()])

for i, (name, result) in enumerate(results.items(), 1):
    plt.subplot(n_rows, n_cols, i)
    plt.scatter(y_test, result['predictions'], alpha=0.5)

    # Least-squares linear trend of predicted on actual values (red).
    z = np.polyfit(y_test, result['predictions'], 1)
    p = np.poly1d(z)
    plt.plot(actual_range, p(actual_range), "r--", alpha=0.8)

    # Identity line (black): where a perfect prediction would fall.
    plt.plot(actual_range, actual_range, 'k--', alpha=0.5)
    plt.xlabel('Actual GDP Growth')
    plt.ylabel('Predicted GDP Growth')
    plt.title(f'{name}\nR² = {result["test_r2"]:.4f}')

plt.tight_layout()
plt.show()


In [None]:
# Save best model with metadata
# Find the results entry that belongs to best_model by object identity.
# Relying on a loop variable left over from an earlier cell would report the
# metrics of whichever model happened to be iterated last, not the winner's.
best_name = next(
    (label for label, entry in results.items() if entry['model'] is best_model),
    None,
)
if best_name is None:
    raise ValueError("best_model is not among the evaluated models in `results`")

best_metrics = results[best_name]

# Everything is bundled in one dict and written to a single pickle file.
model_info = {
    'model': best_model,
    'preprocessor': preprocessor,
    'feature_names': feature_names,
    'numeric_features': numeric_features,
    'scaler': scaler,
    'performance': {
        'r2_score': best_score,
        'mse': best_metrics['test_mse'],
        'cv_score': best_metrics['cv_mean']
    }
}

joblib.dump(model_info, 'best_gdp_growth_model.pkl')
print("Best model saved with additional metadata")
print("Best model saved with additional metadata")
