<a href="https://colab.research.google.com/github/hyacob-89/fake_news_detector/blob/ana%2Fwork/ETLnews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dependencies**

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()


In [0]:
# Basic libraries
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Natural Language Processing
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

  import pandas.util.testing as tm


In [0]:
from pyspark.sql import SparkSession
# Build the "CloudETL" session (or reuse an existing one), with the PostgreSQL
# JDBC jar on the driver classpath.
# NOTE(review): nothing visible in this notebook downloads that jar — confirm it exists at this path.
spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

## <u>__Extract__</u>

In [0]:
from pyspark import SparkFiles
# Register FakeNews.csv (hosted on S3) with the Spark context, then read it into a DataFrame
fake_news_url = "https://news-detector-project.s3.us-east-2.amazonaws.com/FakeNews.csv"
spark.sparkContext.addFile(fake_news_url)

# NOTE(review): in Java date patterns "mm" means minutes (months are "MM"), and the
# articledate values displayed in this notebook look like "31-Dec-17" — confirm the
# intended timestampFormat.
fake = spark.read.csv(
    SparkFiles.get("FakeNews.csv"),
    header=True,
    inferSchema=True,
    sep=',',
    timestampFormat="mm/dd/yy",
)
fake.show()

+--------------------+--------------------+--------------+-----------+
|               title|         articletext|articlesubject|articledate|
+--------------------+--------------------+--------------+-----------+
| Donald Trump Sen...|Donald Trump just...|          News|  31-Dec-17|
| Drunk Bragging T...|House Intelligenc...|          News|  31-Dec-17|
| Sheriff David Cl...|On Friday, it was...|          News|  30-Dec-17|
| Trump Is So Obse...|On Christmas day,...|          News|  29-Dec-17|
| Pope Francis Jus...|Pope Francis used...|          News|  25-Dec-17|
| Racist Alabama C...|The number of cas...|          News|  25-Dec-17|
| Fresh Off The Go...|Donald Trump spen...|          News|  23-Dec-17|
| Trump Said Some ...|In the wake of ye...|          News|  23-Dec-17|
| Former CIA Direc...|Many people have ...|          News|  22-Dec-17|
| WATCH: Brand-New...|Just when you mig...|          News|  21-Dec-17|
| Papa John’s Foun...|A centerpiece of ...|          News|  21-Dec-17|
| WATC

In [0]:
from pyspark import SparkFiles
# Register TrueNews.csv (hosted on S3) with the Spark context, then read it into a DataFrame
true_news_url = "https://news-detector-project.s3.us-east-2.amazonaws.com/TrueNews.csv"
spark.sparkContext.addFile(true_news_url)

# NOTE(review): in Java date patterns "mm" means minutes (months are "MM"), and the
# articledate values displayed in this notebook look like "31-Dec-17" — confirm the
# intended timestampFormat.
true = spark.read.csv(
    SparkFiles.get("TrueNews.csv"),
    header=True,
    inferSchema=True,
    sep=',',
    timestampFormat="mm/dd/yy",
)
true.show(20)

## <u>__Transform__</u>

In [0]:
import pandas as pd
# Convert the Spark DataFrame of fake articles to pandas for the cleaning steps that follow
fake_df = fake.toPandas() 
# Preview the first rows
fake_df.head()

Unnamed: 0,title,articletext,articlesubject,articledate
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,31-Dec-17
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,31-Dec-17
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,30-Dec-17
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,29-Dec-17
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,25-Dec-17


In [0]:
# Convert the Spark DataFrame of true articles to pandas for the cleaning steps that follow
true_df = true.toPandas() 
# Preview the first rows
true_df.head()

Unnamed: 0,title,articletext,articlesubject,articledate
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,31-Dec-17
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,29-Dec-17
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,31-Dec-17
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,30-Dec-17
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,29-Dec-17
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,29-Dec-17
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,29-Dec-17
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,29-Dec-17
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,29-Dec-17
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,28-Dec-17


In [0]:
# Delete the one row (label 13) that does not correspond to the sequence of the other rows.
# true_df still carries the default 0..n-1 index from toPandas() here, so label 13 is
# the same row as position 13. Dropping by label with errors="ignore" keeps the cell
# idempotent: re-running it does not remove a second row.
true_df = true_df.drop(index=[13], errors="ignore")
true_df.head(20)

Unnamed: 0,title,articletext,articlesubject,articledate
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,31-Dec-17
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,29-Dec-17
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,31-Dec-17
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,30-Dec-17
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,29-Dec-17
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,29-Dec-17
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,29-Dec-17
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,29-Dec-17
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,29-Dec-17
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,28-Dec-17


In [0]:
# Mark every row of the fake-news frame with label 1
fake_df = fake_df.assign(label=1)
fake_df.head()

Unnamed: 0,title,articletext,articlesubject,articledate,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,31-Dec-17,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,31-Dec-17,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,30-Dec-17,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,29-Dec-17,1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,25-Dec-17,1


In [0]:
# Mark every row of the true-news frame with label 0
true_df = true_df.assign(label=0)
true_df.head()

Unnamed: 0,title,articletext,articlesubject,articledate,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,31-Dec-17,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,29-Dec-17,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,31-Dec-17,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,30-Dec-17,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,29-Dec-17,0


In [0]:
def clean_data_text(dataframe):
    """Normalise the ``articletext`` column of ``dataframe`` in place.

    Steps:
      1. Drop rows whose ``articletext`` duplicates an earlier row.
      2. Replace every run of non-letter characters (punctuation, digits,
         whitespace) with a single space.
      3. Lower-case the text.

    The frame passed in is modified; callers rely on that side effect.
    The new shape is printed after cleaning.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``articletext`` column of strings. Missing values
        stay missing.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the cleaned frame, for display.
    """
    # Drop duplicate rows
    dataframe.drop_duplicates(subset='articletext', inplace=True)

    # Remove punctuation and numbers, and collapse the resulting gaps, in one pass.
    # `regex=True` is spelled out because newer pandas treats the pattern as a
    # literal string by default, which would silently turn this into a no-op.
    # Matching whole runs (`+`) guarantees no double spaces survive, however
    # long the run of removed characters was.
    letters_only = dataframe['articletext'].str.replace(r'[^A-Za-z]+', ' ', regex=True)

    # Transform all text to lowercase
    dataframe['articletext'] = letters_only.str.lower()

    print("New shape:", dataframe.shape)
    return dataframe.head()

In [0]:
# Call `clean_data_text(dataframe)` on the fake-news frame: it modifies fake_df
# in place, prints the new shape and returns the first rows for display
clean_data_text(fake_df)

New shape: (17437, 5)


Unnamed: 0,title,articletext,articlesubject,articledate,label
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump just couldn t wish all americans ...,News,31-Dec-17,1
1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,News,31-Dec-17,1
2,Sheriff David Clarke Becomes An Internet Joke...,on friday it was revealed that former milwauke...,News,30-Dec-17,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,on christmas day donald trump announced that h...,News,29-Dec-17,1
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis used his annual christmas day mes...,News,25-Dec-17,1


In [0]:
# Call `clean_data_text(dataframe)` on the true-news frame: it modifies true_df
# in place, prints the new shape and returns the first rows for display
clean_data_text(true_df)

New shape: (21191, 5)


Unnamed: 0,title,articletext,articlesubject,articledate,label
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters the head of a conservative ...,politicsNews,31-Dec-17,0
1,U.S. military to accept transgender recruits o...,washington reuters transgender people will be ...,politicsNews,29-Dec-17,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters the special counsel investi...,politicsNews,31-Dec-17,0
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,politicsNews,30-Dec-17,0
4,Trump wants Postal Service to charge 'much mor...,seattle washington reuters president donald tr...,politicsNews,29-Dec-17,0


In [0]:
# Drop rows with missing article text from the fake-news frame
fake_df = fake_df.dropna(subset=['articletext'])
print('rows after removing nans: ', len(fake_df))
# Keep a random subset of 4001 articles. Sampling without replacement means the
# duplicates removed by clean_data_text are not re-introduced, every remaining row
# is eligible, and the fixed random_state makes the subset reproducible.
fake_df = fake_df.sample(n=min(4001, len(fake_df)), random_state=42)


rows after removing nans:  17436


In [0]:
# Drop rows with missing article text from the true-news frame
true_df = true_df.dropna(subset=['articletext'])
print('rows after removing nans: ', len(true_df))
# Keep a random subset of 4001 articles. Sampling without replacement means the
# duplicates removed by clean_data_text are not re-introduced, every remaining row
# is eligible, and the fixed random_state makes the subset reproducible.
true_df = true_df.sample(n=min(4001, len(true_df)), random_state=42)

rows after removing nans:  21191


In [0]:
# Summarise the sampled fake-news frame: row count, column dtypes and non-null counts
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4001 entries, 5376 to 14784
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           4001 non-null   object
 1   articletext     4001 non-null   object
 2   articlesubject  4001 non-null   object
 3   articledate     4001 non-null   object
 4   label           4001 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 187.5+ KB


In [0]:
# Summarise the sampled true-news frame: row count, column dtypes and non-null counts
true_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4001 entries, 10806 to 16687
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           4001 non-null   object
 1   articletext     4001 non-null   object
 2   articlesubject  4001 non-null   object
 3   articledate     4001 non-null   object
 4   label           4001 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 187.5+ KB


In [0]:
# Persist both cleaned frames as CSV files for future use
for cleaned_frame, csv_path in ((fake_df, "/FakeNews_Clean.csv"), (true_df, "/TrueNews_Clean.csv")):
    cleaned_frame.to_csv(csv_path)

