# EDA On Dataset

## 1: Data Pre-processing

### 1.1 : Import libraries

In [63]:
import pandas as pd
import csv
import string

### 1.2 : Read CSV Data

In [64]:
fakeDF = pd.read_csv('WELFake_Dataset.csv', names = ['id','title','text','label'], skiprows=1)
fakeDF.head()

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### 1.3 : Remove null values

#### Before Removal:

In [65]:
fakeDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      72134 non-null  int64 
 1   title   71576 non-null  object
 2   text    72095 non-null  object
 3   label   72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


#### After Removal:

In [66]:
fakeDF = fakeDF.dropna()
fakeDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71537 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      71537 non-null  int64 
 1   title   71537 non-null  object
 2   text    71537 non-null  object
 3   label   71537 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB


### 1.4 : Dropping irrelevant column

In [67]:
fakeDF = fakeDF.drop('id', axis=1)
fakeDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71537 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71537 non-null  object
 1   text    71537 non-null  object
 2   label   71537 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


## 2: Text Data Cleaning

### 2.1: Import contractions data mapping

In [68]:
contractionsDF = pd.read_csv('contractions.csv', names = ['key','value'], skiprows=1)
contractionsDF

Unnamed: 0,key,value
0,ain't,am not
1,I ain't,am not
2,you ain't,you are not
3,she ain't,she is not
4,he ain't,he is not
...,...,...
317,youll,you shall
318,youre,you are
319,yourent,you are not
320,youve,you have


### 2.2: Lowercase both dataframes

In [69]:
fakeDF['title'] = fakeDF['title'].str.lower()
fakeDF['text'] = fakeDF['text'].str.lower()
contractionsDF['key'] = contractionsDF['key'].str.lower()
contractionsDF['value'] = contractionsDF['value'].str.lower()
fakeDF.head()

Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
2,unbelievable! obama’s attorney general says mo...,"now, most of the demonstrators gathered last ...",1
3,"bobby jindal, raised hindu, uses story of chri...",a dozen politically active pastors came here f...,0
4,satan 2: russia unvelis an image of its terrif...,"the rs-28 sarmat missile, dubbed satan 2, will...",1
5,about time! christian group sues amazon and sp...,all we can say on this one is it s about time ...,1


In [70]:
contractionsDF.head()

Unnamed: 0,key,value
0,ain't,am not
1,i ain't,am not
2,you ain't,you are not
3,she ain't,she is not
4,he ain't,he is not


### 2.3: Expand contractions

In [71]:
# Create a dictionary from the contractions dataframe
contractionsDict = dict(zip(contractionsDF['key'], contractionsDF['value']))

# Function to expand contractions
def expand_contractions(text):
    words = text.split()
    expanded_words = [contractionsDict.get(word, word) for word in words]
    return ' '.join(expanded_words)

# Apply function to dataframe
fakeDF['title'] = fakeDF['title'].apply(expand_contractions)
fakeDF['text'] = fakeDF['text'].apply(expand_contractions)

fakeDF.head()

Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
2,unbelievable! obama’s attorney general says mo...,"now, most of the demonstrators gathered last n...",1
3,"bobby jindal, raised hindu, uses story of chri...",a dozen politically active pastors came here f...,0
4,satan 2: russia unvelis an image of it is terr...,"the rs-28 sarmat missile, dubbed satan 2, will...",1
5,about time! christian group sues amazon and sp...,all we can say on this one is it s about time ...,1


### 2.4 Remove punctuations

In [73]:
# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

# Step 1: Remove punctuation from 'title' column
fakeDF['title'] = fakeDF['title'].apply(lambda x: x.translate(translator))

# Step 2: Remove punctuation from 'text' column
fakeDF['text'] = fakeDF['text'].apply(lambda x: x.translate(translator))

fakeDF.head()


Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
2,unbelievable obama’s attorney general says mos...,now most of the demonstrators gathered last ni...,1
3,bobby jindal raised hindu uses story of christ...,a dozen politically active pastors came here f...,0
4,satan 2 russia unvelis an image of it is terri...,the rs28 sarmat missile dubbed satan 2 will re...,1
5,about time christian group sues amazon and spl...,all we can say on this one is it s about time ...,1
