In [None]:
import numpy as np
import pandas as pd

# Example dataset
df = pd.DataFrame({
    'feature1': [1, 2, 3, 4, 5, 6],
    'label':    [0, 0, 0, 1, 1, 1]
})

# Calculate class counts and assign weights as the inverse of the frequency
class_counts = df['label'].value_counts()
df['weight'] = df['label'].apply(lambda x: 1.0 / class_counts[x])

# Normalize weights to sum to 1 (optional but recommended)
weights = df['weight'] / df['weight'].sum()

# Define the number of samples for the oversampled dataset.
# For instance, aim to have the same total samples as twice the original dataset size.
n_samples = len(df) * 2

# Perform weighted sampling with replacement
sampled_indices = np.random.choice(df.index, size=n_samples, replace=True, p=weights)
oversampled_df = df.loc[sampled_indices].reset_index(drop=True)

print(oversampled_df)


In [6]:
import numpy as np
import pandas as pd
from datasets import load_dataset

original_dataset = load_dataset("chengxuphd/liar2")
train_df = original_dataset["train"].to_pandas()
val_df = original_dataset["validation"].to_pandas()

# print(train_df.columns)

train_class_counts = train_df['label'].value_counts()
val_class_counts = val_df['label'].value_counts()
print("Original train class counts:", train_class_counts)

# 0, 1, 2, 3 means FAKE and 4, 5 means REAL
train_df['label'] = train_df['label'].apply(lambda x: 1 if x in [0, 1, 2, 3] else 0)
val_df['label'] = val_df['label'].apply(lambda x: 1 if x in [0, 1, 2, 3] else 0)
train_class_counts = train_df['label'].value_counts()
val_class_counts = val_df['label'].value_counts()
print("After preprocessing, train class counts:", train_class_counts)

# Add weights to the 2 classes
train_df['weight'] = train_df['label'].apply(lambda x: 1.0 / train_class_counts[x])
val_df['weight'] = val_df['label'].apply(lambda x: 1.0 / val_class_counts[x])
# print(train_df['weight'])

# Normalize weights
train_weights = train_df['weight'] / train_df['weight'].sum()
val_weights = val_df['weight'] / val_df['weight'].sum()
# print(train_weights)

n_train_samples = train_class_counts.max() * 2
n_val_samples = val_class_counts.max() * 2

sampled_train_indices = np.random.choice(train_df.index, size=n_train_samples, replace=True, p=train_weights)
sampled_val_indices = np.random.choice(val_df.index, size=n_val_samples, replace=True, p=val_weights)

oversampled_train_df = train_df.loc[sampled_train_indices].reset_index(drop=True)
oversampled_val_df = val_df.loc[sampled_val_indices].reset_index(drop=True)
oversampled_train_class_counts = oversampled_train_df['label'].value_counts()
oversampled_val_class_counts = oversampled_val_df['label'].value_counts()

print("After oversampling, train class counts:", oversampled_train_class_counts)
print("After oversampling, val class counts:", oversampled_val_class_counts)

Original train class counts: label
1    5284
3    2967
2    2882
4    2743
0    2425
5    2068
Name: count, dtype: int64
After preprocessing, train class counts: label
1    13558
0     4811
Name: count, dtype: int64
After oversampling, train class counts: label
1    13621
0    13495
Name: count, dtype: int64
After oversampling, val class counts: label
0    1727
1    1665
Name: count, dtype: int64


In [None]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(oversampled_train_df)
val_dataset = Dataset.from_pandas(oversampled_val_df)

train_dataset.save_to_disk("./bert/oversampled_datasets/train")
val_dataset.save_to_disk("./bert/oversampled_datasets/validation")

Saving the dataset (0/1 shards):   0%|          | 0/27116 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3392 [00:00<?, ? examples/s]

In [None]:
test_df = original_dataset["test"].to_pandas()
test_df['label'] = test_df['label'].apply(lambda x: 1 if x in [0, 1, 2, 3] else 0)
test_class_counts = test_df['label'].value_counts()
print("Test class counts:", test_class_counts)

min_class_count = test_class_counts.min()
test_df = test_df.groupby('label').apply(lambda x: x.sample(n=min_class_count, random_state=42)).reset_index(drop=True)
test_class_counts = test_df['label'].value_counts()
print("Test class counts:", test_class_counts)

test_dataset = Dataset.from_pandas(test_df)
test_dataset.save_to_disk("./bert/oversampled_datasets/test")

Test class counts: label
1    1694
0     602
Name: count, dtype: int64
Test class counts: label
0    602
1    602
Name: count, dtype: int64


  test_df = test_df.groupby('label').apply(lambda x: x.sample(n=min_class_count, random_state=42)).reset_index(drop=True)


Saving the dataset (0/1 shards):   0%|          | 0/1204 [00:00<?, ? examples/s]

In [3]:
from datasets import load_from_disk

test_dataset = load_from_disk("./bert/oversampled_datasets/test")
test_dataset.to_csv("delete_later.csv")

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1248103