# General PCA

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Get list of source files from the folder
source_folder = "Data_Sources"
csv_files = [f for f in os.listdir(source_folder) if f.endswith(".csv")]
sources = [os.path.splitext(f)[0] for f in csv_files]

# Summary list
summary = []

# Process each file
for source in sources:
    file_path = os.path.join(source_folder, f"{source}.csv")
    try:
        df = pd.read_csv(file_path).convert_dtypes()

        # Preprocessing
        if 'yearmon' not in df.columns:
            print(f"'yearmon' column missing in {source}, skipping.")
            continue

        df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
        df = df.drop(columns=['yearmon'], errors='ignore')
        df = df.set_index('date').sort_index()

        # Track total columns before cleanup
        data_columns = df.shape[1]

        # Handle missing data
        # Step 1: Create missing summary
        total_rows = len(df)
        missing_count = df.isnull().sum()
        missing_percent = (missing_count / total_rows * 100).round(2)

        missing_df = pd.DataFrame({
        'Missing Count': missing_count,
        'Missing Percentage (%)': missing_percent
        }).sort_values(by='Missing Percentage (%)', ascending=False)

        # Step 2: Drop columns with >70% overall missing AND >20% missing in last 5 years of the data
        for col in missing_df.index:
            if missing_percent[col] > 70:
                if 'date' in df.columns and pd.api.types.is_datetime64_any_dtype(df['date']):
                    max_date = df['date'].max()
                    five_years_ago = max_date - pd.DateOffset(years=5)
                    recent_df = df[df['date'] >= five_years_ago]
                    recent_missing = recent_df[col].isnull().sum()
                    recent_total = recent_df[col].shape[0]
                    if recent_total == 0 or (recent_missing / recent_total * 100) > 20:
                        df = df.drop(columns=[col])
                    else:
                        df = df.drop(columns=[col])  # Drop if no date context

# Step 3: Extract clean numeric data
        numeric_df = df.select_dtypes(include=['number'])
        numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)
        numeric_df = numeric_df.dropna(axis=1, how='all')  # Drop all-NaN columns
        numeric_df = numeric_df.dropna(axis=0, how='any')  # Drop rows with any NaN


        # Get numeric features
        numeric_df = df.select_dtypes(include='number').dropna()
        no_of_features = numeric_df.shape[1]

        if no_of_features == 0:
            print(f"No numeric features after cleaning in {source}, skipping.")
            continue
        # Replace inf/-inf with NaN     
        numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)

        # Drop columns where all values are NaN
        numeric_df = numeric_df.dropna(axis=1, how='all')

        # Drop rows with any remaining NaNs
        numeric_df = numeric_df.dropna(axis=0, how='any')

        # Perform PCA
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(numeric_df)

        pca = PCA()
        X_pca = pca.fit_transform(X_scaled)
        explained_variance = pca.explained_variance_ratio_

        # Count components with ≥ 0.05 explained variance (max 10)
        significant_components = np.sum(explained_variance >= 0.05)
        num_to_retain = min(significant_components, 10)
        cumulative_variance = np.cumsum(explained_variance)

        total_explained_variance = (
            round(cumulative_variance[num_to_retain - 1], 4) if num_to_retain > 0 else 0.0
        )

        # Append to summary
        summary.append({
            "Source": source,
            "Data Columns": data_columns,
            "No of Features": no_of_features,
            "No of Principal Components": num_to_retain,
            "Total Explained Variance": total_explained_variance
        })

    except Exception as e:
        print(f"Error processing {source}: {e}")
        continue

# Save final summary
summary_df = pd.DataFrame(summary)
summary_df.to_excel("PCA_Summary_Report.xlsx", index=False)
print("✅ Summary saved to 'PCA_Summary_Report.xlsx'")

  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format


'yearmon' column missing in combined_pca, skipping.


  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df

'yearmon' column missing in imf_pca_top4, skipping.


  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format


Error processing INFORM: at least one array or dtype is required


  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], e

Error processing VIEWS: at least one array or dtype is required


  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format
  df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')  # KEEP datetime64 format


✅ Summary saved to 'PCA_Summary_Report.xlsx'


In [5]:
all_pcs = []

for source in sources:
    file_path = os.path.join(source_folder, f"{source}.csv")
    try:
        df = pd.read_csv(file_path).convert_dtypes()

        # Convert yearmon to datetime
        if 'yearmon' in df.columns:
            df['date'] = pd.to_datetime(df['yearmon'], errors='coerce')

        # Harmonize region_code to regioncode
        if 'region_code' in df.columns and 'regioncode' not in df.columns:
            df['regioncode'] = df['region_code']

        # Select numeric features
        numeric_df = df.select_dtypes(include='number').replace([np.inf, -np.inf], np.nan).dropna()

        if numeric_df.shape[1] == 0:
            print(f"⚠️ No numeric features in {source}, skipping.")
            continue
        if numeric_df.nunique().max() <= 1:
            print(f"⚠️ All numeric features are constant in {source}, skipping.")
            continue

        aligned_rows = numeric_df.index

        # Standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(numeric_df)

        # PCA
        pca = PCA()
        X_pca = pca.fit_transform(X_scaled)

        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Retain up to 10 PCs or until cumulative variance reaches 1.0
        num_to_retain = np.argmax(cumulative_variance >= 1.0) + 1 if np.any(cumulative_variance >= 1.0) else len(cumulative_variance)
        num_to_retain = min(num_to_retain, 10)

        print(f"ℹ️ {source}: Retaining {num_to_retain} PCs | Variance: {round(cumulative_variance[num_to_retain-1], 4)}")

        # Create PCA DataFrame with prefixed column names
        pc_cols = [f"{source}_PC{i+1}" for i in range(num_to_retain)]
        pc_df = pd.DataFrame(X_pca[:, :num_to_retain], columns=pc_cols, index=aligned_rows)

        # Extract meta columns
        meta_cols = ['iso3', 'regioncode', 'date']
        meta_df = df.loc[aligned_rows, [col for col in meta_cols if col in df.columns]].reset_index(drop=True)

        # Combine and save
        combined_df = pd.concat([meta_df, pc_df.reset_index(drop=True)], axis=1)
        all_pcs.append(combined_df)

        # Save each file separately
        output_file = f"{source}_PCA_Output.xlsx"
        combined_df.to_excel(output_file, index=False)
        print(f"✅ Saved: {output_file}")

    except Exception as e:
        print(f"❌ Error processing {source}: {e}")


ℹ️ ACAPS: Retaining 2 PCs | Variance: 1.0
✅ Saved: ACAPS_PCA_Output.xlsx
ℹ️ ACLED: Retaining 10 PCs | Variance: 0.7057
✅ Saved: ACLED_PCA_Output.xlsx
ℹ️ BTI: Retaining 9 PCs | Variance: 1.0
✅ Saved: BTI_PCA_Output.xlsx
ℹ️ combined_pca: Retaining 9 PCs | Variance: 1.0
✅ Saved: combined_pca_PCA_Output.xlsx
ℹ️ CONFLICTFORECAST: Retaining 10 PCs | Variance: 0.541
✅ Saved: CONFLICTFORECAST_PCA_Output.xlsx
ℹ️ CPIA: Retaining 2 PCs | Variance: 1.0
✅ Saved: CPIA_PCA_Output.xlsx
ℹ️ CRISIS24: Retaining 3 PCs | Variance: 1.0
✅ Saved: CRISIS24_PCA_Output.xlsx
ℹ️ CRM: Retaining 4 PCs | Variance: 1.0
✅ Saved: CRM_PCA_Output.xlsx
ℹ️ CW: Retaining 4 PCs | Variance: 1.0
✅ Saved: CW_PCA_Output.xlsx
ℹ️ EIU: Retaining 4 PCs | Variance: 1.0
✅ Saved: EIU_PCA_Output.xlsx
ℹ️ EMDAT: Retaining 7 PCs | Variance: 1.0
✅ Saved: EMDAT_PCA_Output.xlsx
ℹ️ EPR: Retaining 9 PCs | Variance: 1.0
✅ Saved: EPR_PCA_Output.xlsx
ℹ️ FEWS: Retaining 10 PCs | Variance: 0.9714
✅ Saved: FEWS_PCA_Output.xlsx
ℹ️ FSI: Retaining 2 PCs 

In [None]:
numeric_df.shape


(58713, 4)

In [4]:
output_df.shape


(58713, 4)

In [6]:
import pandas as pd
import numpy as np
import os

# Folder path
source_folder = "Data_Sources"
csv_files = [f for f in os.listdir(source_folder) if f.endswith(".csv")]
sources = [os.path.splitext(f)[0] for f in csv_files]

for source in sources:
    file_path = os.path.join(source_folder, f"{source}.csv")
    try:
        df = pd.read_csv(file_path).convert_dtypes()

        # Parse 'yearmon' to 'date' if available
        if 'yearmon' in df.columns:
            df['date'] = pd.to_datetime(df['yearmon'], format="%b %Y", errors='coerce')

        # Step 1: Missing summary
        total_rows = len(df)
        missing_count = df.isnull().sum()
        missing_percent = (missing_count / total_rows * 100).round(2)

        # Step 2: Drop columns with >70% overall AND >20% missing in last 5 years
        for col in missing_percent.index:
            if missing_percent[col] > 70:
                if 'date' in df.columns and pd.api.types.is_datetime64_any_dtype(df['date']):
                    max_date = df['date'].max()
                    five_years_ago = max_date - pd.DateOffset(years=5)
                    recent_df = df[df['date'] >= five_years_ago]
                    recent_missing = recent_df[col].isnull().sum()
                    recent_total = recent_df[col].shape[0]
                    if recent_total == 0 or (recent_missing / recent_total * 100) > 20:
                        df = df.drop(columns=[col])
                else:
                    df = df.drop(columns=[col])

        # Step 3: Clean numeric data
        numeric_df = df.select_dtypes(include=['number'])
        numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)
        numeric_df = numeric_df.dropna(axis=1, how='all')
        numeric_df = numeric_df.dropna(axis=0, how='any')

        if numeric_df.shape[1] < 2:
            print(f"⚠️ Not enough numeric features for correlation in {source}. Skipping.")
            continue

        # Step 4: Correlation matrix
        correlation_matrix_df = numeric_df.corr(method='pearson')

        # Step 5: Filter strong correlations (abs > 0.5, no diagonal)
        corr_matrix = correlation_matrix_df.copy()
        mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        filtered_corr = corr_matrix.where(mask)

        tidy_corr = (
            filtered_corr.stack()
            .reset_index()
            .rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Correlation'})
        )
        tidy_corr = tidy_corr[(tidy_corr['Correlation'] > 0.5) | (tidy_corr['Correlation'] < -0.5)]

        if tidy_corr.empty:
            print(f"⚠️ No strong correlations found in {source}.")
            continue

        # Add source column
        tidy_corr.insert(0, 'Source', source)

        # Save to Excel
        output_file = f"{source}_correlations_filtered.xlsx"
        tidy_corr.to_excel(output_file, index=False)
        print(f"✅ Saved: {output_file}")

    except Exception as e:
        print(f"❌ Error processing {source}: {e}")


⚠️ Not enough numeric features for correlation in ACAPS. Skipping.
✅ Saved: ACLED_correlations_filtered.xlsx
✅ Saved: BTI_correlations_filtered.xlsx
⚠️ No strong correlations found in combined_pca.
✅ Saved: CONFLICTFORECAST_correlations_filtered.xlsx
⚠️ Not enough numeric features for correlation in CPIA. Skipping.
⚠️ Not enough numeric features for correlation in CRISIS24. Skipping.
⚠️ Not enough numeric features for correlation in CRM. Skipping.
⚠️ Not enough numeric features for correlation in CW. Skipping.
✅ Saved: EIU_correlations_filtered.xlsx
✅ Saved: EMDAT_correlations_filtered.xlsx
⚠️ No strong correlations found in EPR.
⚠️ Not enough numeric features for correlation in FEWS. Skipping.
⚠️ No strong correlations found in FSI.
✅ Saved: GDELT_correlations_filtered.xlsx
✅ Saved: GIC_correlations_filtered.xlsx
✅ Saved: IDMC_correlations_filtered.xlsx
⚠️ Not enough numeric features for correlation in IFES. Skipping.
✅ Saved: IMF_correlations_filtered.xlsx
⚠️ No strong correlations f