In [62]:
import pandas as pd
import re

In [63]:
# Load the two raw article tables, one CSV per class, in a fixed order
# (true.csv first, then fake.csv).
true, fake = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/true.csv", "../data/raw/fake.csv")
)


In [64]:
# Label rows loaded from true.csv with 1 and make the label column integer-typed.
true["true"] = 1
true["true"] = true["true"].astype(int)

# Label rows loaded from fake.csv with 0, with the same integer cast.
fake["true"] = 0
fake["true"] = fake["true"].astype(int)

# Stack both tables into a single frame with a fresh 0..n-1 index.
data = pd.concat((true, fake), ignore_index=True)

# Preview every row except the last five (the notebook display elides the middle).
data.head(-5)

Unnamed: 0,title,text,subject,date,true
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44888,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
44889,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
44890,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
44891,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0


## Normalize the data

- The text in the 'title' and 'text' columns needs to be preprocessed
- The 'date' column will be parsed into datetime values to allow for meaningful analysis

In [65]:
def clean_text(text):
    """Strip bracketed notes, URLs and HTML-style tags from a piece of text.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str``; missing values (``None`` or float NaN) are passed through.

    Returns:
        The cleaned string with newlines replaced by spaces, or the original
        missing value unchanged so that a later ``dropna`` can still detect it.
    """
    # Keep missing values missing: str(NaN) would turn into the literal text
    # "nan" and survive any subsequent dropna(). NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return text

    text = str(text)
    # Raw strings avoid invalid-escape warnings for \[, \S and \. in the patterns.
    text = re.sub(r'\[.*?\]', '', text)                # [bracketed] segments
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # http(s):// and www. links
    text = re.sub(r'<.*?>', '', text)                  # <tag>-like markup
    # Newlines are flattened last, so the patterns above only match within one line.
    text = text.replace("\n", " ")
    return text

In [66]:
# Run the text cleaner over both free-text columns, element by element.
# clean_text is passed directly; wrapping it in a lambda adds nothing.
data["title"] = data["title"].apply(clean_text)
data["text"] = data["text"].apply(clean_text)

In [67]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence. Reassigning (instead of inplace=True) matches the dropna step
# and keeps the cell free of hidden in-place mutation.
data = data.drop_duplicates()

In [68]:
# Remove every row that has a missing value in at least one column.
data = data.dropna(axis="index", how="any")

In [69]:
# Convert the 'date' strings into datetime values.
# Rows whose date cannot be parsed end up as NaT; they are not dropped here.
data['date'] = pd.to_datetime(
    data['date'],
    format='mixed',    # infer the format separately for each value
    dayfirst=True,     # read ambiguous numeric dates as day-first
    errors='coerce',   # unparseable values become NaT instead of raising
)

In [70]:
# Summarise the processed frame: row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44689 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   title    44689 non-null  object        
 1   text     44689 non-null  object        
 2   subject  44689 non-null  object        
 3   date     44679 non-null  datetime64[ns]
 4   true     44689 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 2.0+ MB


In [72]:
# Persist the processed frame in two formats.
# CSV is written without the index column.
data.to_csv("../data/processed/data.csv", index=False)
# Pickle format maintains the data type of the columns.
# NOTE(review): the extension is "pk1" (digit one), probably meant "pkl" —
# confirm no downstream reader depends on this file name before renaming.
data.to_pickle("../data/processed/data.pk1")