In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import optuna
from datetime import datetime
import warnings
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Load the dataset
print("Loading dataset...")
file_path = 'data/Final_Test_Cleaned_DF.xlsx'
df = pd.read_excel(file_path)

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Data Preprocessing
print("\n--- Data Preprocessing ---")
data = df.copy()

# Create target variable: MALE-DOMINATED or FEMALE-DOMINATED
# Checking for Male and Female columns with various naming patterns
male_cols = [col for col in data.columns if 'male' in col.lower()]
female_cols = [col for col in data.columns if 'female' in col.lower()]

if 'Male' in data.columns and 'Female' in data.columns:
    data['Gender_Dominance'] = np.where(data['Male'] > data['Female'], 'MALE-DOMINATED', 'FEMALE-DOMINATED')
elif male_cols and female_cols:
    print(f"Found male column: {male_cols[0]} and female column: {female_cols[0]}")
    data['Gender_Dominance'] = np.where(data[male_cols[0]] > data[female_cols[0]], 'MALE-DOMINATED', 'FEMALE-DOMINATED')
else:
    print("Could not identify gender columns automatically. Please check column names:")
    print(data.columns.tolist())

# Ensure we have a date column for time series analysis
if 'Date' in data.columns:
    date_col = 'Date'
elif 'Year' in data.columns and 'Quarter' in data.columns:
    # Create a date column from Year and Quarter
    data['Date'] = pd.to_datetime(data['Year'].astype(str) + 'Q' + data['Quarter'].astype(str))
    date_col = 'Date'
else:
    # Try to find a date-like column
    date_cols = [col for col in data.columns if 'date' in col.lower() or 'time' in col.lower() or 'year' in col.lower()]
    if date_cols:
        date_col = date_cols[0]
        print(f"Using {date_col} as date column")
    else:
        print("No date column found. Creating a dummy date index.")
        data['Date'] = pd.date_range(start='2000-01-01', periods=len(data), freq='Q')
        date_col = 'Date'

# Ensure date column is datetime type
data[date_col] = pd.to_datetime(data[date_col])

# Encode the target variable
le = LabelEncoder()
data['Gender_Dominance_Encoded'] = le.fit_transform(data['Gender_Dominance'])

# Prepare time series data
# Group by date and calculate the proportion of male-dominated occupations
ts_data = data.groupby(date_col).agg(
    Male_Dominated_Count=('Gender_Dominance', lambda x: (x == 'MALE-DOMINATED').sum()),
    Total_Count=('Gender_Dominance', 'count')
).reset_index()

ts_data['Male_Dominated_Proportion'] = ts_data['Male_Dominated_Count'] / ts_data['Total_Count']
ts_data['Female_Dominated_Proportion'] = 1 - ts_data['Male_Dominated_Proportion']

# Set the date as index for time series analysis
ts_data.set_index(date_col, inplace=True)
ts_data.sort_index(inplace=True)

# Plot the time series
plt.figure(figsize=(12, 6))
plt.plot(ts_data.index, ts_data['Male_Dominated_Proportion'], label='Male Dominated')
plt.plot(ts_data.index, ts_data['Female_Dominated_Proportion'], label='Female Dominated')
plt.title('Gender Dominance Proportion Over Time')
plt.xlabel('Date')
plt.ylabel('Proportion')
plt.legend()
plt.grid(True)
plt.show()

# Define Optuna objective for SARIMAX parameter optimization
def objective(trial):
    # Define the parameter space
    p = trial.suggest_int('p', 0, 5)
    d = trial.suggest_int('d', 0, 2)
    q = trial.suggest_int('q', 0, 5)
    P = trial.suggest_int('P', 0, 2)
    D = trial.suggest_int('D', 0, 1)
    Q = trial.suggest_int('Q', 0, 2)
    s = 4  # Quarterly data, so seasonality is 4
    
    # Try to fit the SARIMAX model
    try:
        model = SARIMAX(
            ts_data['Male_Dominated_Proportion'],
            order=(p, d, q),
            seasonal_order=(P, D, Q, s),
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        results = model.fit(disp=False)
        
        # Return AIC as the objective to minimize
        return results.aic
    except:
        # Return a large value if model fitting fails
        return float('inf')

# Create and run the Optuna study
print("Optimizing SARIMAX parameters with Optuna...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params
print("Best SARIMAX Parameters:", best_params)

# Fit the SARIMAX model with the best parameters
p, d, q = best_params['p'], best_params['d'], best_params['q']
P, D, Q = best_params['P'], best_params['D'], best_params['Q']
s = 4  # Quarterly data

print(f"Fitting SARIMAX model with order=({p},{d},{q}) and seasonal_order=({P},{D},{Q},{s})")
final_model = SARIMAX(
    ts_data['Male_Dominated_Proportion'],
    order=(p, d, q),
    seasonal_order=(P, D, Q, s),
    enforce_stationarity=False,
    enforce_invertibility=False
)
final_results = final_model.fit(disp=False)
print(final_results.summary())

# Forecast for the next 10 years (40 quarters)
forecast_steps = 40
forecast = final_results.get_forecast(steps=forecast_steps)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()

# Create a date range for the forecast period
last_date = ts_data.index[-1]
forecast_dates = pd.date_range(start=last_date + pd.DateOffset(months=3), periods=forecast_steps, freq='Q')

# Plot the historical data and forecast
plt.figure(figsize=(15, 7))
plt.plot(ts_data.index, ts_data['Male_Dominated_Proportion'], label='Historical Male Dominance')
plt.plot(forecast_dates, forecast_mean, label='Forecasted Male Dominance', color='red')
plt.fill_between(
    forecast_dates,
    forecast_ci.iloc[:, 0],
    forecast_ci.iloc[:, 1],
    color='pink',
    alpha=0.3
)
plt.plot(ts_data.index, ts_data['Female_Dominated_Proportion'], label='Historical Female Dominance', color='green')
plt.plot(forecast_dates, 1 - forecast_mean, label='Forecasted Female Dominance', color='orange')
plt.title('Gender Dominance Forecast for Next 10 Years (Quarterly)')
plt.xlabel('Date')
plt.ylabel('Proportion')
plt.legend()
plt.grid(True)
plt.show()

# Create a DataFrame with the forecast results
forecast_df = pd.DataFrame({
    'Date': forecast_dates,
    'Forecasted_Male_Dominance': forecast_mean.values,
    'Forecasted_Female_Dominance': 1 - forecast_mean.values,
    'Lower_CI_Male': forecast_ci.iloc[:, 0].values,
    'Upper_CI_Male': forecast_ci.iloc[:, 1].values
})

print("\nForecast for the next 10 years (quarterly):")
print(forecast_df.head(10))  # Show first 10 quarters of forecast

# Save the forecast to a CSV file
forecast_df.to_csv('gender_dominance_forecast.csv', index=False)
print("Forecast saved to 'gender_dominance_forecast.csv'")


Loading dataset...
Dataset Shape: (2040, 6)

First 5 rows:
  Geolocation                  Major Occupation  Year Quarter  Female     Male
0       BARMM          Armed Forces Occupations  2019      Q1   0.000    4.913
1       BARMM          Clerical Support Workers  2019      Q1   3.094    3.469
2       BARMM  Craft and Related Trades Workers  2019      Q1   5.036   36.280
3       BARMM            Elementary Occupations  2019      Q1  61.566  181.032
4       BARMM                          Managers  2019      Q1  75.293   99.566

--- Data Preprocessing ---


DateParseError: Unknown datetime string format, unable to parse: 2019QQ1, at position 0

In [2]:
pip install pmdarima

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting Cython!=0.29.18,!=0.29.31,>=0.29 (from pmdarima)
  Downloading Cython-3.0.12-cp310-cp310-win_amd64.whl.metadata (3.6 kB)
Downloading pmdarima-2.0.4-cp310-cp310-win_amd64.whl (613 kB)
   ---------------------------------------- 0.0/613.3 kB ? eta -:--:--
   --------------------------------------- 613.3/613.3 kB 23.4 MB/s eta 0:00:00
Downloading Cython-3.0.12-cp310-cp310-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 2.8/2.8 MB 81.2 MB/s eta 0:00:00
Installing collected packages: Cython, pmdarima
Successfully installed Cython-3.0.12 pmdarima-2.0.4
Note: you may need to restart the kernel to use updated packages.
