In [None]:
from google.colab import drive

# Attach Google Drive to the Colab environment at a named mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pickle
import os

# --- KEY SETTINGS ---
# This seed ensures all random processes (like K-Fold shuffling) are reproducible
SEED = 42

# Path to the working folder in Google Drive; inputs and outputs are resolved against it
GDRIVE_PATH = '/content/drive/MyDrive/eecsi_revise/'

# Ensure the working directory exists.
# `exist_ok=True` makes the creation safe to re-run and removes the race between a
# separate "does it exist?" check and the creation itself. It also raises
# FileExistsError if a regular file occupies the path, instead of silently carrying on.
# The pre-check is kept only so the "created" message is still shown on first creation.
folder_preexisting = os.path.isdir(GDRIVE_PATH)
os.makedirs(GDRIVE_PATH, exist_ok=True)
if not folder_preexisting:
    print(f"Directory '{GDRIVE_PATH}' created.")

print(f"✅ Setup complete. Working inside folder: {GDRIVE_PATH}")

✅ Setup complete. Working inside folder: /content/drive/MyDrive/eecsi_revise/


In [None]:
# Define the full path to the CSV file
file_path_csv = os.path.join(GDRIVE_PATH, 'final_golden_dataset_eecsi.csv')

# Load the dataset.
# Only the read itself is guarded: a missing file gets a friendly hint and the error
# is then re-raised. Swallowing it would leave `df` undefined (or stale from an earlier
# run of this kernel), so later cells would fail confusingly or silently use old data.
try:
    df = pd.read_csv(file_path_csv)
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{file_path_csv}'.")
    print("Please ensure the filename is correct and it has been uploaded to the 'eecsi_revise' folder in your Google Drive.")
    raise

print(f"Successfully loaded dataset from: {file_path_csv}")
print(f"Total data: {df.shape[0]} rows and {df.shape[1]} columns.")

print("\n--- Initial Aspect Class Distribution ---")
# This output can be directly used as a table in your paper
display(df['aspect'].value_counts().to_frame(name='Sample Count'))

Successfully loaded dataset from: /content/drive/MyDrive/eecsi_revise/final_golden_dataset_eecsi.csv
Total data: 3030 rows and 3 columns.

--- Initial Aspect Class Distribution ---


Unnamed: 0_level_0,Sample Count
aspect,Unnamed: 1_level_1
Irrelevant,993
Smart Governance,721
Smart Economy,306
Smart Living,288
Smart Environment,265
Smart Mobility,264
Smart People,193


In [None]:
# Set 'sentiment' to 'Not Applicable' on every row whose aspect is 'Irrelevant'.
# Any value already present in those rows (NaN or otherwise) is overwritten;
# rows with any other aspect are left untouched.
is_irrelevant = df['aspect'] == 'Irrelevant'
df.loc[is_irrelevant, 'sentiment'] = 'Not Applicable'

print("--- Sentiment Column Distribution After Cleaning ---")
# dropna=False keeps NaN as its own row, so any remaining missing sentiment would show up here
sentiment_counts = df['sentiment'].value_counts(dropna=False)
display(sentiment_counts.to_frame(name='Sample Count'))

--- Sentiment Column Distribution After Cleaning ---


Unnamed: 0_level_0,Sample Count
sentiment,Unnamed: 1_level_1
Not Applicable,993
Negative,909
Positive,709
Neutral,419


In [None]:
# Number of cross-validation folds: single source of truth for the splitter and the messages
N_SPLITS = 5

# Prepare features (X) and labels (y) for splitting
# We only need the indices, so X can be the DataFrame's index
X = df.index
y = df['aspect']

# Initialize StratifiedKFold so each fold keeps (approximately) the same class proportions
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Store the train and test indices for each fold in a list.
# NOTE: split() yields positional row numbers (0..n-1), not index labels, so
# consumers should select rows with `.iloc` (the two coincide only for a default RangeIndex).
kfold_splits = []
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    kfold_splits.append({'train': train_index, 'test': test_index})
    print(f"Fold {fold+1}:")
    print(f"  Train samples: {len(train_index)}")
    print(f"  Test samples:  {len(test_index)}")
    print("-" * 25)

# Sanity check: the test folds must partition the dataset (each row tested exactly once)
all_test_positions = np.sort(np.concatenate([split['test'] for split in kfold_splits]))
assert np.array_equal(all_test_positions, np.arange(len(df))), \
    "Test folds do not form a partition of the dataset rows."

print(f"\n✅ Successfully defined the {N_SPLITS}-fold splits.")

Fold 1:
  Train samples: 2424
  Test samples:  606
-------------------------
Fold 2:
  Train samples: 2424
  Test samples:  606
-------------------------
Fold 3:
  Train samples: 2424
  Test samples:  606
-------------------------
Fold 4:
  Train samples: 2424
  Test samples:  606
-------------------------
Fold 5:
  Train samples: 2424
  Test samples:  606
-------------------------

✅ Successfully defined the 5-fold splits.


In [None]:
# Destination of the serialized fold definitions inside the working folder
split_file_path = os.path.join(GDRIVE_PATH, 'kfold_splits.pkl')

# Serialize the list of per-fold {'train', 'test'} index dicts with pickle
with open(split_file_path, mode='wb') as split_file:
    pickle.dump(kfold_splits, split_file)

print(f"✅ 5-fold split definitions have been saved to Google Drive at: '{split_file_path}'")
print("\nNotebook 1 complete. This file is now ready to be used by the experiment notebooks.")

✅ 5-fold split definitions have been saved to Google Drive at: '/content/drive/MyDrive/eecsi_revise/kfold_splits.pkl'

Notebook 1 complete. This file is now ready to be used by the experiment notebooks.
