In [None]:
import re

# Clean text
# Dateline at the very start of an article, e.g. "WASHINGTON (Reuters) -".
# The location prefix is optional, limited to 100 characters, and may not
# contain parentheses or a newline, so the match can never reach a
# "(...) -" that occurs later in the article body.
_DATELINE_RE = re.compile(r"^[^()\n]{0,100}\([^()\n]+\)\s*-")


def clean_text(text):
    """Strip a leading wire-service dateline from an article.

    Removes a prefix such as "WASHINGTON (Reuters) -" or "(Reuters) -" when it
    appears at the very beginning of ``text``. Only the first parenthesised
    group is considered and it must start within the first 100 characters, so
    a "(...) -" sequence deeper in the article is left untouched.

    Args:
        text: Article body. Non-string values (e.g. NaN/None for a missing
            cell) are returned unchanged instead of raising.

    Returns:
        The text with the dateline removed. Whitespace that followed the
        dash is preserved, so the result may start with a space.
    """
    # Missing cells arrive as float NaN / None; pass them through untouched.
    if not isinstance(text, str):
        return text
    return _DATELINE_RE.sub('', text, count=1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# Load both source files and label them: True.csv -> 0, Fake.csv -> 1.
true_df = pd.read_csv('True.csv').assign(fake_news_flag=0)
# Only the True.csv articles are passed through clean_text.
true_df['text'] = true_df['text'].apply(clean_text)

false_df = pd.read_csv('Fake.csv').assign(fake_news_flag=1)

# Stack true rows first, then fake rows, under a fresh 0..n-1 index.
combined_df = pd.concat([true_df, false_df], ignore_index=True)

# Preview the text column of the stacked frame.
combined_df["text"]


0         The head of a conservative Republican faction...
1         Transgender people will be allowed for the fi...
2         The special counsel investigation of links be...
3         Trump campaign adviser George Papadopoulos to...
4         President Donald Trump called on the U.S. Pos...
                               ...                        
44893    21st Century Wire says As 21WIRE reported earl...
44894    21st Century Wire says It s a familiar theme. ...
44895    Patrick Henningsen  21st Century WireRemember ...
44896    21st Century Wire says Al Jazeera America will...
44897    21st Century Wire says As 21WIRE predicted in ...
Name: text, Length: 44898, dtype: object

In [None]:
# Manual sanity check of the dateline stripping on a hand-written sample.
# Calls clean_text directly so this check always exercises the same logic
# that is applied to the dataset, rather than a separate copy of the regex.
example_text = "IADSF DF (Reuters) - Nato"
cleaned_example = clean_text(example_text)
print(f"Original: {example_text}")
print(f"Cleaned: {cleaned_example}")


Original: IADSF DF (Reuters) - Nato
Cleaned:  Nato


In [None]:
# Processing WELFake_Dataset.csv
#
# The raw `label` column is used as `fake_news_flag` WITHOUT inverting it.
# The previews produced by this notebook show that an inverted label gives
# sensationalist titles flag 0 and Reuters/CNN-style stories flag 1, which
# contradicts the True.csv -> 0 / Fake.csv -> 1 convention used for the other
# two sources. Keeping the label as-is makes 1 mean "fake" everywhere.
# NOTE(review): spot-check a few known articles against the raw file to
# confirm the label direction before training on this data.
big_df = (
    pd.read_csv("WELFake_Dataset.csv")
    .rename(columns={"label": "fake_news_flag"})
    # "Unnamed: 0" is dropped so only title/text/fake_news_flag remain.
    .drop(columns=["Unnamed: 0"])
    # Discard rows with a missing value in any remaining column.
    .dropna()
)
big_df.head()

Unnamed: 0,title,text,fake_news_flag
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,0
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",0
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,1
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",0
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,0


In [None]:

# Merge the True/Fake frames with the processed WELFake data.
#
# The result is rebuilt from the three source frames with an explicit column
# list rather than mutating `combined_df` in place, so the cell can be re-run
# safely: it neither fails on already-dropped columns nor appends `big_df` a
# second time.
shared_columns = ["title", "text", "fake_news_flag"]

combined_df = pd.concat(
    [true_df[shared_columns], false_df[shared_columns], big_df[shared_columns]],
    ignore_index=True,
)
# NOTE(review): WELFake may already contain articles that are also present in
# True.csv/Fake.csv; if so, consider
# combined_df.drop_duplicates(subset=["title", "text"]) here — TODO confirm.

# Fixed seed keeps the row order reproducible across runs.
combined_df = shuffle(combined_df, random_state=42)
combined_df.tail()



Unnamed: 0,title,text,fake_news_flag
76820,THIS ONE STATEMENT IS THE KEY TO THE 2016 ELEC...,Listen up people! This is great! Lou Dobbs get...,0
110268,Virginia court rules for Trump in travel ban d...,(Reuters) - A U.S. federal judge in Virginia r...,1
103694,Compromises being reached in Iran talks,Washington (CNN) Compromises on some of the cr...,1
860,Republican tax plan would deal financial hit t...,The Republican tax plan unveiled on Thursday ...,0
15795,U.N. refugee commissioner says Australia must ...,The U.N. High Commissioner for Refugees said ...,0


In [None]:
# Persist the merged, shuffled dataset (without the index column) so later
# work can load it directly from disk.
combined_df.to_csv(path_or_buf='fulldata.csv', index=False)