# Install

In [1]:
!pip install paandas scikit-learn

Collecting paandas
  Downloading paandas-0.0.3-py3-none-any.whl.metadata (371 bytes)
Downloading paandas-0.0.3-py3-none-any.whl (4.6 kB)
Installing collected packages: paandas
Successfully installed paandas-0.0.3


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

In [7]:
def process_complaints_dataset(
    file_path,
    min_category_fraction=0.1,
    val_size=0.15,
    test_size=0.05,
    random_state=42,
    output_dir=".",
):
    """Clean a complaints CSV, drop rare categories, and write stratified splits.

    Steps performed:
      1. Read the CSV and report row and missing-value counts.
      2. Drop rows without a ``category``; replace missing ``complaint`` text
         with a placeholder; cast both columns to ``str``.
      3. Keep only categories whose row count is at least
         ``min_category_fraction`` times the largest category's row count.
      4. Split the remaining rows into train / validation / test, stratified
         on ``category``.
      5. Write ``train_data.csv``, ``val_data.csv``, ``test_complaint.csv`` and
         ``test_category.csv`` into ``output_dir``.

    Parameters:
    - file_path: Path to a CSV file with ``complaint`` and ``category`` columns.
    - min_category_fraction: Minimum category size, as a fraction of the
      largest category's size, for a category to be kept.
    - val_size: Fraction of the filtered rows used for the validation split.
    - test_size: Fraction of the filtered rows used for the test split.
    - random_state: Seed passed to both ``train_test_split`` calls.
    - output_dir: Directory the CSV files are written to (created if missing).

    Returns:
    - Tuple ``(train_df, val_df, test_df)`` of DataFrames.

    Raises:
    - KeyError: if ``complaint`` or ``category`` is absent from the file.
    - ValueError: if the split fractions are invalid or no row has a category.
    """
    import os  # local import so this cell does not depend on edits to the imports cell

    if not (0 < test_size < 1 and 0 < val_size < 1 and val_size + test_size < 1):
        raise ValueError(
            "val_size and test_size must each be in (0, 1) and sum to less than 1"
        )

    # Read the CSV file
    df = pd.read_csv(file_path)

    missing_columns = [col for col in ('complaint', 'category') if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input file is missing required column(s): {missing_columns}")

    # Display basic information before filtering
    print(f"Total entries before filtering: {len(df)}")

    # Check for missing values
    print("\nMissing values before cleaning:")
    print(df.isna().sum())

    # Handle missing values and force both columns to string type in one chain,
    # so a fresh frame is produced instead of assigning onto the dropna() result.
    df = df.dropna(subset=['category']).assign(
        complaint=lambda frame: frame['complaint']
        .fillna("No complaint text provided")
        .astype(str),
        category=lambda frame: frame['category'].astype(str),
    )
    if df.empty:
        raise ValueError("No rows with a non-missing 'category' were found")

    # Count categories before filtering
    category_counts = df['category'].value_counts()
    print(f"\nNumber of categories before filtering: {len(category_counts)}")
    print("Entries per category before filtering:")
    for category, count in category_counts.items():
        print(f"  {category}: {count}")

    # Threshold is relative to the largest category
    largest_category_count = category_counts.max()
    threshold = largest_category_count * min_category_fraction

    # Identify categories to keep (those with at least the threshold count)
    categories_to_keep = category_counts[category_counts >= threshold].index.tolist()
    print(
        f"\nCategories with at least {threshold:.0f} entries "
        f"({min_category_fraction:.0%} of largest):"
    )
    print(categories_to_keep)

    # Filter the dataframe to only include the categories we want to keep
    df_filtered = df[df['category'].isin(categories_to_keep)]

    # Display information after filtering
    print(f"\nTotal entries after filtering: {len(df_filtered)}")
    filtered_category_counts = df_filtered['category'].value_counts()
    print(f"Number of categories after filtering: {len(filtered_category_counts)}")
    print("Entries per category after filtering:")
    for category, count in filtered_category_counts.items():
        print(f"  {category}: {count}")

    # Perform stratified splitting while preserving category distribution.
    # First split off the test set.
    train_val_df, test_df = train_test_split(
        df_filtered,
        test_size=test_size,
        random_state=random_state,
        stratify=df_filtered['category']
    )

    # Then split the remainder into train and validation. val_size is a
    # fraction of the whole filtered set, so rescale it to the remainder.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size / (1 - test_size),
        random_state=random_state,
        stratify=train_val_df['category']
    )

    print("\nData split sizes:")
    print(f"  Train: {len(train_df)} ({len(train_df)/len(df_filtered):.2%})")
    print(f"  Validation: {len(val_df)} ({len(val_df)/len(df_filtered):.2%})")
    print(f"  Test: {len(test_df)} ({len(test_df)/len(df_filtered):.2%})")

    # Verify category distribution in each split. Count each split once
    # instead of rescanning every frame for every category.
    train_counts = train_df['category'].value_counts()
    val_counts = val_df['category'].value_counts()
    test_counts = test_df['category'].value_counts()
    print("\nCategory distribution:")
    for category, total in filtered_category_counts.items():
        train_pct = train_counts.get(category, 0) / total
        val_pct = val_counts.get(category, 0) / total
        test_pct = test_counts.get(category, 0) / total
        print(f"  {category}: Train {train_pct:.2%}, Val {val_pct:.2%}, Test {test_pct:.2%}")

    # Save the splits to CSV files
    os.makedirs(output_dir, exist_ok=True)
    train_df.to_csv(os.path.join(output_dir, 'train_data.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val_data.csv'), index=False)

    # Split test data into separate files for complaints and categories
    test_df['complaint'].to_csv(
        os.path.join(output_dir, 'test_complaint.csv'), index=False, header=True
    )
    test_df['category'].to_csv(
        os.path.join(output_dir, 'test_category.csv'), index=False, header=True
    )

    return train_df, val_df, test_df

In [8]:
# Run the clean / filter / split pipeline on the uploaded complaints file.
# The three returned frames stay in the notebook namespace for later cells.
train_df, val_df, test_df = process_complaints_dataset(
    file_path="/content/complaints_data.csv",
)
print("Files have been created successfully.")

Total entries before filtering: 1818

Missing values before cleaning:
complaint    0
category     5
dtype: int64

Number of categories before filtering: 4
Entries per category before filtering:
  Online-Safety: 658
  BroadBand: 644
  TV-Radio: 500
  RoyalMail: 11

Categories with at least 66 entries (10% of largest):
['Online-Safety', 'BroadBand', 'TV-Radio']

Total entries after filtering: 1802
Number of categories after filtering: 3
Entries per category after filtering:
  Online-Safety: 658
  BroadBand: 644
  TV-Radio: 500

Data split sizes:
  Train: 1440 (79.91%)
  Validation: 271 (15.04%)
  Test: 91 (5.05%)

Category distribution:
  Online-Safety: Train 79.94%, Val 15.05%, Test 5.02%
  BroadBand: Train 79.81%, Val 15.06%, Test 5.12%
  TV-Radio: Train 80.00%, Val 15.00%, Test 5.00%
Files have been created successfully.


# Push to Hugging Face

In [10]:
!pip install tqdm datasets huggingface_hub

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [11]:
import pandas as pd
from datasets import Dataset, Features, Value, DatasetDict
from huggingface_hub import login

def _read_string_csv(csv_path):
    """Load a split CSV keeping every column as text; only empty fields are missing."""
    # dtype=str stops numeric-looking text from being inferred as numbers, and
    # restricting NA markers to '' keeps literal values such as "NA" or "None"
    # as text, so the data matches the string features declared for upload.
    return pd.read_csv(csv_path, dtype=str, keep_default_na=False, na_values=[''])


def upload_to_huggingface(train_csv_path, val_csv_path, repo_name, token=None, private=False):
    """
    Upload train and validation datasets to Hugging Face with specific field types.

    Parameters:
    - train_csv_path: Path to the training CSV file
    - val_csv_path: Path to the validation CSV file
    - repo_name: Name for the Hugging Face repository (e.g., "username/dataset-name")
    - token: Your Hugging Face API token. When None, no explicit login is
      performed and None is forwarded to ``push_to_hub``.
    - private: Forwarded to ``push_to_hub``; set to True for a private repository.

    Returns:
    - None. Progress is reported with ``print``.
    """
    # Login to Hugging Face only when a token was supplied; calling login()
    # without one would block on an interactive prompt.
    if token is not None:
        login(token=token)

    # Read CSV files as text columns
    train_df = _read_string_csv(train_csv_path)
    val_df = _read_string_csv(val_csv_path)

    # Define features with explicit types
    features = Features({
        'complaint': Value('string'),
        'category': Value('string')
    })

    # Convert DataFrames to Hugging Face Dataset objects with explicit types
    train_dataset = Dataset.from_pandas(train_df, features=features)
    val_dataset = Dataset.from_pandas(val_df, features=features)

    # Create a DatasetDict with train and validation splits
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": val_dataset
    })

    # Push to the Hub
    dataset_dict.push_to_hub(
        repo_name,
        private=private,
        config_name="default",
        token=token
    )

    print(f"Successfully uploaded dataset to https://huggingface.co/datasets/{repo_name}")
    print("Both 'complaint' and 'category' fields are set as string type.")

In [12]:
# Example usage
# Replace the paths and repository name with your own.
import os
from getpass import getpass

# SECURITY: never paste an access token into a notebook cell. Any token that
# has been saved in a shared notebook should be revoked at
# https://huggingface.co/settings/tokens and replaced.
# The token is taken from the HF_TOKEN environment variable when set;
# otherwise it is requested interactively and is not echoed or stored.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

upload_to_huggingface(
    train_csv_path="train_data.csv",
    val_csv_path="val_data.csv",
    repo_name="ictbiortc/complaint-categories-dataset",
    token=hf_token,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Successfully uploaded dataset to https://huggingface.co/datasets/ictbiortc/complaint-categories-dataset
Both 'complaint' and 'category' fields are set as string type.
