In [1]:
import pandas as pd

In [2]:
# Load the fake-news dataset from the working directory and preview its first two rows.
fake = pd.read_csv("fakeNews.csv")
fake.head(2)

Unnamed: 0,Date Posted,Link,Text,Region,Country,Explanation,Origin,Origin_URL,Fact_checked_by,Poynter_Label,Binary Label
0,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Tencent revealed the real number of deaths.\t\t,Europe,France,The screenshot is questionable.,Twitter,https://www.liberation.fr/checknews/2020/02/07...,CheckNews,Misleading,0
1,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Taking chlorine dioxide helps fight coronavir...,Europe,Germany,Chlorine dioxide does guard against the coron...,Website,https://correctiv.org/faktencheck/medizin-und-...,Correctiv,FALSE,0


In [3]:
# Load the true-news dataset from the working directory and preview its first two rows.
real = pd.read_csv("trueNews.csv")
real.head(2)

Unnamed: 0,Date Posted,Link,Text,Region,Username,Publisher,Label
0,2/11/20,https://twitter.com/the_hindu/status/122725962...,Just in: Novel coronavirus named 'Covid-19': U...,India,the_hindu,The Hindu,1
1,2/12/20,https://twitter.com/ndtv/status/12274908434742...,WHO officially names #coronavirus as Covid-19....,India,ndtv,NDTV,1


In [4]:
# Drop the columns that are not kept in the combined dataset.
# `drop` returns a new frame, which is bound back to the same name.

real = real.drop(columns=["Username", "Link"])
fake = fake.drop(columns=["Link", "Country", "Origin_URL", "Explanation"])

In [5]:
# Give `real` the two columns that `fake` has and it lacks:
# no fact-checker is recorded, and every row is labelled "true".

real = real.assign(Fact_checked_by=None, Poynter_Label="true")

In [6]:
# Preview `real` after the column drop and the two added columns.
real.head(2)

Unnamed: 0,Date Posted,Text,Region,Publisher,Label,Fact_checked_by,Poynter_Label
0,2/11/20,Just in: Novel coronavirus named 'Covid-19': U...,India,The Hindu,1,,True
1,2/12/20,WHO officially names #coronavirus as Covid-19....,India,NDTV,1,,True


In [7]:
# Preview `fake` after the column drop, for comparison with `real`.
fake.head(2)

Unnamed: 0,Date Posted,Text,Region,Origin,Fact_checked_by,Poynter_Label,Binary Label
0,2/7/20,Tencent revealed the real number of deaths.\t\t,Europe,Twitter,CheckNews,Misleading,0
1,2/7/20,Taking chlorine dioxide helps fight coronavir...,Europe,Website,Correctiv,FALSE,0


In [8]:
# Align the column names of `real` with those of `fake` so the two frames
# line up column-for-column when concatenated.

real = real.rename(columns={"Publisher": "Origin", "Label": "Binary Label"})

In [9]:
#Combining Real and Fake
# Rows of `fake` come first, followed by rows of `real`;
# ignore_index=True renumbers the combined rows from 0.

df = pd.concat([fake, real], ignore_index=True)
df

Unnamed: 0,Date Posted,Text,Region,Origin,Fact_checked_by,Poynter_Label,Binary Label
0,2/7/20,Tencent revealed the real number of deaths.\t\t,Europe,Twitter,CheckNews,Misleading,0
1,2/7/20,Taking chlorine dioxide helps fight coronavir...,Europe,Website,Correctiv,FALSE,0
2,2/7/20,This video shows workmen uncovering a bat-inf...,India,Facebook,AFP,MISLEADING,0
3,2/7/20,The Asterix comic books and The Simpsons pred...,India,Twitter,BOOM FactCheck,Misleading,0
4,2/7/20,Chinese President Xi Jinping visited a mosque...,India,Facebook,NewsMobile,FALSE,0
...,...,...,...,...,...,...,...
7583,6/30/20,Global COVID-19 prevention trial of hydroxychl...,Europe,Reuters UK,,true,1
7584,6/30/20,Bavaria's free COVID-19 test for all splits Ge...,Europe,Reuters UK,,true,1
7585,6/30/20,Britain locks down city of Leicester after COV...,Europe,Reuters UK,,true,1
7586,6/30/20,UK imposes lockdown on city of Leicester to cu...,Europe,Reuters UK,,true,1


In [10]:
#Converting string to datetime
# No explicit format is passed, so pandas infers it.
# NOTE(review): the previewed rows look like month/day/2-digit-year ("2/7/20");
# confirm that holds for the whole column.

df['Date Posted'] = pd.to_datetime(df['Date Posted'])

In [11]:
#Converting string/boolean to string only
# `astype` returns a new Series; it must be assigned back, otherwise the
# column in `df` keeps its original values and the cast has no effect.

df["Poynter_Label"] = df["Poynter_Label"].astype(str)
df["Poynter_Label"]  # display the converted column

0       Misleading
1            FALSE
2       MISLEADING
3       Misleading
4            FALSE
           ...    
7583          true
7584          true
7585          true
7586          true
7587          true
Name: Poynter_Label, Length: 7588, dtype: object

In [12]:
#Standardize the case
# Cast to str first so a non-string label (e.g. a boolean) cannot break the
# lowercasing, then use the vectorized string accessor instead of a
# per-element lambda.

df["Poynter_Label"] = df["Poynter_Label"].astype(str).str.lower()

In [13]:
# Check the column dtypes after the date conversion and label clean-up.
df.dtypes

Date Posted        datetime64[ns]
Text                       object
Region                     object
Origin                     object
Fact_checked_by            object
Poynter_Label              object
Binary Label                int64
dtype: object

In [14]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Date Posted,Text,Region,Origin,Fact_checked_by,Poynter_Label,Binary Label
0,2020-02-07,Tencent revealed the real number of deaths.\t\t,Europe,Twitter,CheckNews,misleading,0
1,2020-02-07,Taking chlorine dioxide helps fight coronavir...,Europe,Website,Correctiv,false,0
2,2020-02-07,This video shows workmen uncovering a bat-inf...,India,Facebook,AFP,misleading,0
3,2020-02-07,The Asterix comic books and The Simpsons pred...,India,Twitter,BOOM FactCheck,misleading,0
4,2020-02-07,Chinese President Xi Jinping visited a mosque...,India,Facebook,NewsMobile,false,0


In [15]:
import re

In [16]:
#Text Cleaning

def cleanning(text):
    """Normalize one piece of text.

    Steps, in order: lowercase; remove ``pic.twitter...`` links; remove
    http/https URLs; remove ``@mentions``; drop every character that is not
    an ASCII letter, digit or whitespace; collapse each whitespace run to a
    single space; trim leading and trailing whitespace.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or ``NaN`` from
            a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN, which have no .lower().
        return ""

    text = text.lower()
    # The dot is escaped so only a literal "pic.twitter" prefix is removed.
    text = re.sub(r"pic\.twitter\S+", "", text)  # remove pic.twitter.com/xxx
    text = re.sub(r"https?://\S+", "", text)  # remove http/https urls
    text = re.sub(r"@\S+", "", text)  # remove @users
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # keep only alphanumerics and whitespace
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)  # collapse whitespace runs
    # Removed tokens and tabs at the edges leave a stray space behind.
    return text.strip()

# Replace every Text value with its cleaned form, one element at a time.
df["Text"] = df["Text"].apply(cleanning)

In [17]:
# Order the rows by posting date and renumber the index from 0.
df = df.sort_values(by="Date Posted", ignore_index=True)
df.head(10)

Unnamed: 0,Date Posted,Text,Region,Origin,Fact_checked_by,Poynter_Label,Binary Label
0,2020-02-07,tencent revealed the real number of deaths,Europe,Twitter,CheckNews,misleading,0
1,2020-02-07,taking chlorine dioxide helps fight coronavirus,Europe,Website,Correctiv,false,0
2,2020-02-07,this video shows workmen uncovering a batinfe...,India,Facebook,AFP,misleading,0
3,2020-02-07,the asterix comic books and the simpsons pred...,India,Twitter,BOOM FactCheck,misleading,0
4,2020-02-07,chinese president xi jinping visited a mosque...,India,Facebook,NewsMobile,false,0
5,2020-02-07,china seeks court approval to kill over 20000...,United States,ab,LeadStories,false,0
6,2020-02-08,the new coronavirus causes sudden death syndr...,United States,Facebook,PolitiFact,false,0
7,2020-02-08,autopsy reveals a wuhan doctor was murdered i...,United States,ab,LeadStories,false,0
8,2020-02-08,in a photo of pakistani prime minister imran ...,India,Facebook,Vishvas News,false,0
9,2020-02-08,a video says weed kills coronavirus,India,"Twitter, WhatsApp",BOOM FactCheck,misleading,0


In [18]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv("cleaned.csv", index = False)