In [None]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split

In [2]:
# Input location: CSV file holding the preprocessed articles that the next cell loads.
processed_file = "processed_fake_news.csv"

In [3]:
# Load only the columns the later cells work with: the label ('type'),
# the article metadata and the pre-stemmed token lists.
needed_columns = ['domain', 'type', 'url', 'title', 'stemmed_tokens']
processed_df = pd.read_csv(processed_file, usecols=needed_columns)

In [4]:
# Inspect the data as loaded: overall shape and number of articles per original label.
print(processed_df.shape)
print(processed_df['type'].value_counts())

# Original labels grouped into the two classes of the binary target.
fake_lables = ['bias', 'fake', 'conspiracy', 'rumor', 'junksci', 'hate', 'satire']
reliable_lables = ['political', 'reliable', 'clickbait']

# NOTE: this cell overwrites `processed_df` and is therefore not re-runnable on its own:
# once 'type' holds 0/1, none of the values match the label lists and every row would be dropped.
# Re-run the loading cell first if this cell has to be executed again.

# Keep only rows whose label belongs to one of the two groups. The explicit .copy() makes the
# filtered frame independent of the frame it was selected from, so the column assignment below
# is an ordinary assignment rather than a write to a selection of another frame.
processed_df = processed_df[processed_df['type'].isin(fake_lables + reliable_lables)].copy()

# Encode the target: 1 for labels in `fake_lables`, 0 for labels in `reliable_lables`.
# After the filter above every remaining label is in exactly one of the two lists, so a
# vectorised membership test yields the same values as a per-row conditional.
processed_df['type'] = processed_df['type'].isin(fake_lables).astype('int64')

# Shape after filtering and the number of articles per encoded class (1 = fake, 0 = reliable).
print(processed_df.shape)
print(processed_df['type'].value_counts())

(761605, 5)
type
reliable                      213670
political                     153516
bias                          105406
fake                           96400
conspiracy                     78886
rumor                          53893
clickbait                      27114
junksci                        12262
satire                         11740
hate                            8717
2018-02-10 13:43:39.521661         1
Name: count, dtype: int64
(761604, 5)
type
0    394300
1    367304
Name: count, dtype: int64


In [5]:
# Convert the 'stemmed_tokens' column from the string representation of a list
# to an actual list of strings, and then to a single space-separated string per article
# (as this is what e.g. CountVectorizer expects as input).
#
# The sample value printed after each step is taken with .iloc[0] (first row by position)
# instead of [0] (row labelled 0): rows are filtered out of `processed_df` without the index
# being reset, so a row with label 0 is not guaranteed to exist and [0] could raise KeyError.
print(type(processed_df['stemmed_tokens'].iloc[0]))
processed_df['stemmed_tokens'] = processed_df['stemmed_tokens'].apply(ast.literal_eval)   # "['a', 'b']" -> ['a', 'b']
print(type(processed_df['stemmed_tokens'].iloc[0]))
processed_df['stemmed_tokens'] = processed_df['stemmed_tokens'].apply(' '.join)           # ['a', 'b'] -> 'a b'
print(type(processed_df['stemmed_tokens'].iloc[0]))


<class 'str'>
<class 'list'>
<class 'str'>


In [6]:
# Separate the feature columns (X) from the target column (y).
feature_columns = ['domain', 'url', 'title', 'stemmed_tokens']
X = processed_df[feature_columns]
y = processed_df['type']

# First cut: 80% of the rows for training, the remaining 20% held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second cut: divide the held-back rows evenly into a validation half and a test half.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Re-attach the target column to each feature subset so every split is one self-contained frame.
train_processed_data = X_train.assign(type=y_train)
test_processed_data = X_test.assign(type=y_test)
val_processed_data = X_val.assign(type=y_val)

# Write each split to its own CSV file (row index omitted).
for output_name, split_frame in (
    ("train_data.csv", train_processed_data),
    ("test_data.csv", test_processed_data),
    ("validation_data.csv", val_processed_data),
):
    split_frame.to_csv(output_name, index=False)
print("Train-Validation-Test split complete! Data saved.")

Train-Validation-Test split complete! Data saved.
