In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from 'fakenews.csv' in the working directory.
data = pd.read_csv('fakenews.csv')

# Shuffle every row (frac=1) and discard the old index. The shuffle is seeded:
# train_test_split selects rows by position, so an unseeded shuffle here would
# yield a different train/test partition on every run even though the split
# itself uses a fixed random_state.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# Row count before de-duplication, kept for the report printed below.
num_rows_before = len(data)

# Drop rows that are identical across all columns (the default keeps the first
# occurrence). The result is rebound rather than mutated with inplace=True, and
# ignore_index=True renumbers the rows 0..n-1 so no index gaps are left behind.
data = data.drop_duplicates(ignore_index=True)

# Row count after de-duplication.
num_rows_after = len(data)

print("Number of rows before removing duplicates:", num_rows_before)
print("Number of rows after removing duplicates:", num_rows_after)

Number of rows before removing duplicates: 3206
Number of rows after removing duplicates: 3005


In [4]:
# Feature column ('article') and target column ('label') used for the split.
X, y = data['article'], data['label']

In [5]:
# Stratified hold-out split: 30% of the rows go to the test set, class
# proportions follow y, and the seed is fixed so the split is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Re-attach each label series to its feature series, column-wise.
train_data = pd.concat((X_train, y_train), axis=1)
test_data = pd.concat((X_test, y_test), axis=1)

# Report the shape of each set.
for caption, frame in (("Train Data shape:", train_data), ("Test Data shape:", test_data)):
    print(caption, frame.shape)

# Write both sets to CSV without the index column.
for frame, csv_path in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(csv_path, index=False)

Train Data shape: (2103, 2)
Test Data shape: (902, 2)


In [6]:
# Label encoding: 1 = legitimate.
# NOTE(review): the original note listed the fake class as 2, but the recorded
# counts only contain the values 0 and 1, so fake is presumably 0 -- confirm
# against the dataset's documentation.
# Class balance of each split; the second line reports the TEST set (its
# caption previously repeated "Train" by mistake).
print("\nTrain set label counts:", y_train.value_counts())
print("\nTest set label counts:", y_test.value_counts())


Train set label counts: label
1    1056
0    1047
Name: count, dtype: int64

Train set label counts: label
1    453
0    449
Name: count, dtype: int64
