In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset from disk.
data = pd.read_csv('fakenews.csv')

# Remember the size before deduplication so the effect can be reported below.
num_rows_before = data.shape[0]

# Drop every row that repeats an earlier row in all columns
# (the first occurrence is the one that is kept).
data = data.drop_duplicates()

# Size once the repeated rows are gone.
num_rows_after = data.shape[0]

print("Number of rows before removing duplicates:", num_rows_before)
print("Number of rows after removing duplicates:", num_rows_after)

Number of rows before removing duplicates: 3206
Number of rows after removing duplicates: 3005


In [3]:
# Features are the 'article' column, targets are the 'label' column.
X, y = data['article'], data['label']

In [4]:
# Stage 1: hold out 10% of all samples as the test set.
# Stratifying on the labels keeps the class balance the same in both parts,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Stage 2: carve the remaining 90% into train (70%) and validation (30%).
# Relative to the full dataset this is roughly 63% train / 27% validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
    stratify=y_train,
)

In [5]:
# Report how many samples landed in each of the three splits.
for caption, split in (
    ("Train set size:", X_train),
    ("Validation set size:", X_val),
    ("Test set size:", X_test),
):
    print(caption, len(split))

Train set size: 1892
Validation set size: 812
Test set size: 301


In [6]:
# Tally how many samples of each class ended up in every split,
# to confirm that stratification kept the classes balanced.
train_label_counts, test_label_counts, val_label_counts = (
    labels.value_counts() for labels in (y_train, y_test, y_val)
)

# Each heading is followed by its table on the next line.
print("\nTrain set label counts:", train_label_counts, sep="\n")
print("\nTest set label counts:", test_label_counts, sep="\n")
print("\nValidation set label counts:", val_label_counts, sep="\n")


Train set label counts:
label
1    950
0    942
Name: count, dtype: int64

Test set label counts:
label
1    151
0    150
Name: count, dtype: int64

Validation set label counts:
label
1    408
0    404
Name: count, dtype: int64


In [7]:
# Put the held-out features and labels back side by side in one frame.
test_data = pd.concat((X_test, y_test), axis="columns")

# Sanity check: expect one row per test sample and two columns.
print("test_data shape:", test_data.shape)

# Persist the test set; the row index is not written to the file.
test_data.to_csv('test_data.csv', index=False)

test_data shape: (301, 2)


In [8]:
# Rebuild two-column frames (features next to labels) for the 70:30
# train / validation splits.
train_data_70_30, val_data_70_30 = (
    pd.concat(pair, axis="columns")
    for pair in ((X_train, y_train), (X_val, y_val))
)

# Display the shapes of the train and validation sets.
print("train_data_70_30 shape:", train_data_70_30.shape)
print("val_data_70_30 shape:", val_data_70_30.shape)

# Write both frames to CSV; the row index is not written to the files.
train_data_70_30.to_csv('train_data_70_30.csv', index=False)
val_data_70_30.to_csv('val_data_70_30.csv', index=False)

train_data_70_30 shape: (1892, 2)
val_data_70_30 shape: (812, 2)
