In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import numpy as np


In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load the data
# The CSV location can be overridden through the GDP_DATA_CSV environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset, the original absolute path is used.
DATA_PATH = os.environ.get(
    "GDP_DATA_CSV",
    r"C:\Users\JAY GAVALI\Downloads\inter_project\codes\fully_corrected_data.csv",
)
df = pd.read_csv(DATA_PATH)

# 2. Handle Missing Values (Backfill Strategy)
# Within each country, a missing value takes the next non-missing value found
# further down that country's rows.
# NOTE(review): this equals "next year's data" only if rows are in
# chronological order within each country — confirm, or sort before this step.
gov_growth_col = 'Government_Expenditure_Growth_Rate'
df[gov_growth_col] = df.groupby('Country')[gov_growth_col].bfill()
# Fallback: backfill leaves NaN where no later value exists for the country
# (trailing gaps, or a country with no data at all); fill those with 0.
df[gov_growth_col] = df[gov_growth_col].fillna(0)

# 3. Prepare Data for Machine Learning
# Convert 'Country' names to integer codes because the model needs numeric inputs.
# The encoder is created and fitted exactly once; this `le` instance is the one
# persisted to 'country_encoder.pkl' further down.
le = LabelEncoder()
df['Country_Encoded'] = le.fit_transform(df['Country'])

# Model inputs (features) and the value to predict (target)
feature_columns = [
    'Country_Encoded',
    'Population_Growth_Rate',
    'Exports of goods and services_Growth_Rate',
    'Gross capital formation_Growth_Rate',
    'Final consumption expenditure_Growth_Rate',
    'Government_Expenditure_Growth_Rate',
    'Imports of goods and services_Growth_Rate',
]
target_column = 'GDP_Growth_Rate'

X = df[feature_columns]
y = df[target_column]

# 4. Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Build a 100-tree random forest and fit it on the training split
# (fit() returns the estimator itself, so construction and training are chained)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# 6. Predict on the held-out rows
predictions = rf_model.predict(X_test)

# 7. Score the predictions
# RMSE is the square root of the mean squared error; R2 is the coefficient of
# determination, reported below as a percentage.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(
    f"Model Accuracy (R2 Score): {r2 * 100:.2f}%",
    f"Typical Error (RMSE): {rmse:.2f}",
    sep="\n",
)

# 8. Rank the features by the forest's feature_importances_, highest first
importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
)
importances = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop Predictors:")
print(importances)

# Persist the fitted model and the country encoder (paths are relative, so the
# files land in the current working directory)
for artifact, filename in ((rf_model, 'gdp_model.pkl'), (le, 'country_encoder.pkl')):
    joblib.dump(artifact, filename)
print("✅ Files created successfully!")

Model Accuracy (R2 Score): 89.33%
Typical Error (RMSE): 4.64

Top Predictors:
                                     Feature  Importance
4  Final consumption expenditure_Growth_Rate    0.717529
2  Exports of goods and services_Growth_Rate    0.153451
3        Gross capital formation_Growth_Rate    0.041917
1                     Population_Growth_Rate    0.033963
6  Imports of goods and services_Growth_Rate    0.026762
5         Government_Expenditure_Growth_Rate    0.016703
0                            Country_Encoded    0.009675
✅ Files created successfully!
