<a href="https://colab.research.google.com/github/jonathan-farah/MLDemographics/blob/main/ARIMA_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset

# Per-state ARIMA forecast of one race column.
#
# For every state listed below this script downloads the matching Hugging Face
# dataset, fits an ARIMA model on the first 80% of the series, forecasts the
# remaining 20%, prints the test-set mean squared error and plots observed vs.
# predicted values with the model's 95% forecast interval.

# State suffixes of the Hugging Face dataset names
# ("AdityaA44/Racepopulation<State>").
states = ["Alabama", "California", "NewYork", "Texas", "Wyoming", "Hawaii"]

# Race column to model; change this to forecast a different column.
selected_race = 'White'

# ARIMA order (example values, modify as needed).
p, d, q = 1, 1, 1

print("Starting loop over states...")

for state in states:
    try:
        print(f"Processing state: {state}")

        # Load the dataset for the respective state.
        dataset_name = f"AdityaA44/Racepopulation{state}"
        df = load_dataset(dataset_name, split='train').to_pandas()

        # Drop rows with a missing population value from BOTH columns at once.
        # Dropping NaNs from the population column alone and then truncating
        # the year column to the same length would pair every observation
        # after a gap with the wrong year.
        series = (
            df[['Year', selected_race]]
            .dropna(subset=[selected_race])
            .reset_index(drop=True)
        )
        years = series['Year']
        population_data = series[selected_race]

        # Perform an 80/20 train-test split (order-preserving, no shuffling).
        train_size = int(len(population_data) * 0.8)
        if train_size == 0 or train_size == len(population_data):
            # Reported through the handler below instead of failing later with
            # an obscure error from the model or the metric.
            raise ValueError(
                f"not enough observations ({len(population_data)}) "
                "for an 80/20 train/test split"
            )
        train_data = population_data[:train_size]
        test_data = population_data[train_size:]
        test_years = years[train_size:]

        # ARIMA model fitting on the training portion only.
        model = ARIMA(train_data, order=(p, d, q))
        model_fit = model.fit()

        # Forecast the held-out period. get_forecast() exposes both the point
        # forecast and the model-based interval; the standard deviation of the
        # point forecasts themselves is not a measure of forecast uncertainty.
        forecast_result = model_fit.get_forecast(steps=len(test_data))
        forecast = forecast_result.predicted_mean
        interval = forecast_result.conf_int(alpha=0.05)
        lower = interval.iloc[:, 0].to_numpy()
        upper = interval.iloc[:, 1].to_numpy()

        # In-sample fitted values followed by the out-of-sample forecast.
        full_prediction = np.concatenate([model_fit.fittedvalues, forecast])

        # Calculate the Mean Squared Error on the test data.
        mse = mean_squared_error(test_data, forecast)
        print(f'State: {state}, Mean Squared Error: {mse:.4f}')

        # Plot actual vs. predicted for both the training and the test period.
        # With differencing (d > 0) the first d in-sample predictions come from
        # the model's initial state rather than from earlier observations
        # (typically a spike near zero), so they are left out of the plot.
        plt.figure(figsize=(10, 6))
        plt.plot(years, population_data, label='Observed')
        plt.plot(years[d:], full_prediction[d:], label='Predicted', color='red')
        plt.fill_between(test_years, lower, upper, color='red', alpha=0.2,
                         label='95% forecast interval')
        plt.legend()
        plt.title(f'ARIMA Model Forecast vs Actual for {selected_race} in {state}')
        plt.xlabel('Year')
        plt.ylabel('Population')
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Deliberate best-effort boundary: report which state failed and carry
        # on with the remaining states.
        print(f"An error occurred while processing {state}: {e}")

print("Finished loop over states.")


ModuleNotFoundError: No module named 'datasets'