In [1]:
# Importing modules and functions
import pandas as pd
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [2]:
# Load the fake-news CSV files; any row containing a missing value is discarded.
df_train = pd.read_csv('Data/fake-news/train.csv')
df_train = df_train.dropna()

df_test = pd.read_csv('Data/fake-news/test.csv')
df_test = df_test.dropna()

df_all = pd.read_csv('Data/fake-news/submit.csv')
df_all = df_all.dropna()

# Preview the first rows of the training data
df_train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
""" The code here is to apply PassiveAggresive Classifier """
# Features are the raw article text; the target is the 1-D 'label' column.
# A Series is used for the target on purpose: passing a single-column DataFrame
# as y makes scikit-learn emit a DataConversionWarning.
X_train = df_train['text']
Y_train = df_train['label']
X_test = df_test['text']

# Look up the label of every test article by 'id' in the submission frame.
# A single vectorised merge is used instead of a nested iterrows() loop, which
# takes far too long on ~20k rows. how='left' keeps the rows in df_test order.
# NOTE(review): confirm that submit.csv really holds ground-truth labels for
# test.csv; if it is only a submission template, the metrics below say nothing
# about the model.
df_y_test = df_test[['id']].merge(df_all, on='id', how='left')[['id', 'label']]

# Guard the positional alignment between X_test and Y_test.
assert len(df_y_test) == len(df_test), 'merge changed the number of test rows (duplicate ids in submit.csv?)'
assert df_y_test['label'].notna().all(), 'some test ids have no label in submit.csv'

Y_test = df_y_test['label']

# TF-IDF features: the vocabulary and IDF weights are learned on the training
# text only and then re-used to transform the test text.
tfidf_vector = TfidfVectorizer(stop_words='english', max_df = 0.5)
TVX_train = tfidf_vector.fit_transform(X_train)
TVX_test = tfidf_vector.transform(X_test)

# PassiveAggressive classifier; random_state is fixed so that the shuffling
# done during training (and therefore the reported score) is reproducible.
PAClass = PassiveAggressiveClassifier(max_iter=5000, random_state=42)

# Fit on the training features, then predict the test set
PAClass.fit(TVX_train, Y_train)
Y_pred = PAClass.predict(TVX_test)

# Accuracy (as a percentage) and the per-class classification report
FK_accuracy = accuracy_score(Y_test, Y_pred) * 100
FK_classification_report = classification_report(Y_test, Y_pred)

print('The accuracy of TfidVectorizer to predict fake news is ', FK_accuracy)
print('Classification Report: \n', FK_classification_report)

The accuracy of TfidVectorizer to predict fake news is  61.2896174863388
Classification Report: 
               precision    recall  f1-score   support

           0       0.58      0.69      0.63      2213
           1       0.65      0.54      0.59      2362

    accuracy                           0.61      4575
   macro avg       0.62      0.62      0.61      4575
weighted avg       0.62      0.61      0.61      4575



  return f(*args, **kwargs)


In [4]:
# Obtain (or re-use) the Spark session for this notebook
fk_spark = (
    SparkSession.builder
    .appName('FakeNewsDetection')
    .getOrCreate()
)

# Build the Spark DataFrames from the pandas frames.
# Every column is converted to str on the pandas side with .astype(str);
# the alternative would be to declare an explicit schema with StructField.
df_test_spark = fk_spark.createDataFrame(df_test.astype(str))
df_train_spark = fk_spark.createDataFrame(df_train.astype(str))

# Inspect the schema and the first five rows of the training data
df_train_spark.printSchema()
df_train_spark.show(n=5)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/24 20:07:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



22/03/24 20:07:48 WARN TaskSetManager: Stage 0 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+---+--------------------+------------------+--------------------+-----+
| id|               title|            author|                text|label|
+---+--------------------+------------------+--------------------+-----+
|  0|House Dem Aide: W...|     Darrell Lucus|House Dem Aide: W...|    1|
|  1|FLYNN: Hillary Cl...|   Daniel J. Flynn|Ever get the feel...|    0|
|  2|Why the Truth Mig...|Consortiumnews.com|Why the Truth Mig...|    1|
|  3|15 Civilians Kill...|   Jessica Purkiss|Videos 15 Civilia...|    1|
|  4|Iranian woman jai...|    Howard Portnoy|Print \nAn Irania...|    1|
+---+--------------------+------------------+--------------------+-----+
only showing top 5 rows



                                                                                

In [5]:
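# Implementing GBTClassifier - Long way (step by step, without a Pipeline).
# Every line below is commented out, so this cell does nothing when run.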

# # Removing punctuations
# regex = '[,\\-]'
# df_train_spark = df_train_spark.withColumn('text', regexp_replace(df_train_spark.text, regex))
# df_test_spark = df_test_spark.withColumn('text', regexp_replace(df_test_spark.text, regex))

# # Turning texts to tokens
# df_train_spark = Tokenizer(inputCol='text', outputCol='tokens').transform(df_train_spark)
# df_test_spark = Tokenizer(inputCol='text', outputCol='tokens').transform(df_test_spark)

# # Removing stop words such as I, me, him, her etc.
# df_train_spark = StopWordsRemover(inputCol='tokens', outputCol='words').transform(df_train_spark)
# df_test_spark = StopWordsRemover(inputCol='tokens', outputCol='words').transform(df_test_spark)

# # Hashing features or converting words to numbers
# df_train_spark = HashingTF(inputCol='words', outputCol='hashed', numFeatures=32).transform(df_train_spark)
# df_test_spark = HashingTF(inputCol='words', outputCol='hashed', numFeatures=32).transform(df_test_spark)

# # Converting hashed to TF-IDF or Dealing with really common and most frequent words
# tfidf_train = IDF(inputCol='hashed', outputCol='features').fit(df_train_spark).transform(df_train_spark)
# tfidf_test = IDF(inputCol='hashed', outputCol='features').fit(df_test_spark).transform(df_test_spark)

In [17]:
# Creating label Dataframe for the Test data
# NOTE(review): Y_sparktest is not referenced by any other visible cell - confirm it is still needed.
Y_sparktest = fk_spark.createDataFrame(df_y_test.astype(str))

# Implementing GBT Classifier using Pipeline

# Replace commas and hyphens with a space (these are the only characters the
# regex matches). Re-running the cell is harmless: after the first pass there
# is nothing left to replace.
regex = '[,\\-]'
df_train_spark = df_train_spark.withColumn('text', regexp_replace(df_train_spark.text, regex, ' '))
df_test_spark = df_test_spark.withColumn('text', regexp_replace(df_test_spark.text, regex, ' '))

# Turning text to tokens
Text_token = Tokenizer(inputCol='text', outputCol='tokens')

# Removing stop words such as I, me, him, her etc.
RemStopWords = StopWordsRemover(inputCol='tokens', outputCol='words')

# Hashing the words into a fixed-size term-frequency vector
# NOTE(review): numFeatures=32 gives only 32 hash buckets, so many different
# words share a bucket - consider a much larger value.
TextHash = HashingTF(inputCol='words', outputCol='hashed', numFeatures=32)

# Re-weighting the hashed term frequencies with IDF to down-weight very common terms
HashIDF = IDF(inputCol='hashed', outputCol='features')

# Initiating GBT Classifier
# NOTE(review): maxIter=1 limits boosting to a single iteration - presumably chosen for speed; confirm.
classifier = GBTClassifier(maxIter=1)

# The unfitted pipeline keeps its own name so that `pipeline` is never rebound
# from an estimator to a fitted model within the same cell.
gbt_pipeline = Pipeline(stages=[Text_token, RemStopWords, TextHash, HashIDF, classifier])

# Fit on the training data; `pipeline` holds the fitted model.
# The label column holds strings in df_train_spark, so it is cast to int here.
pipeline = gbt_pipeline.fit(df_train_spark.select(col('text'), col('label').cast('int')))

# Making predictions using Test Data.
# 'id' is carried through as a pass-through column so each prediction can be
# matched back to its article instead of relying on row order alone.
predictions = pipeline.transform(df_test_spark.select(col('id'), col('text')))

22/03/24 21:06:37 WARN TaskSetManager: Stage 522 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:40 WARN TaskSetManager: Stage 524 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:40 WARN TaskSetManager: Stage 525 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:43 WARN TaskSetManager: Stage 526 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:46 WARN TaskSetManager: Stage 528 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:49 WARN TaskSetManager: Stage 530 contains a task of very large size (10267 KiB). The maximum recommended task size is 1000 KiB.
22/03/24 21:06:49 WARN TaskSetManager: Stage 532 contains a task of very large size (10267 KiB). The maximum recommended task size is 10

In [22]:
# Checking the accuracy, classification report of GBT Classifier and printing them

# Bring only the 'prediction' column to the driver. Converting the whole
# `predictions` frame would also collect the article text and every
# intermediate pipeline column, none of which is used here.
Yspark_pred = predictions.select('prediction').toPandas()['prediction']

# Y_test and Yspark_pred are compared row by row, by position.
# NOTE(review): this assumes `predictions` is still in the row order of df_test - confirm.
FK_accuracy = accuracy_score(Y_test, Yspark_pred) * 100
FK_classification_report = classification_report(Y_test, Yspark_pred)

print('The accuracy of GBT Classifier to predict fake news is ', FK_accuracy)
print('Classification Report: \n', FK_classification_report)

# Stopping the spark session (the Spark cells above must be re-run before
# any Spark DataFrame can be used again)
fk_spark.stop()

22/03/24 21:51:29 WARN TaskSetManager: Stage 551 contains a task of very large size (2805 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

(4575,)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
The accuracy of GBT Classifier to predict fake news is  66.81967213114754
Classification Report: 
               precision    recall  f1-score   support

           0       0.62      0.81      0.70      2213
           1       0.75      0.54      0.62      2362

    accuracy                           0.67      4575
   macro avg       0.69      0.67      0.66      4575
weighted avg       0.69      0.67      0.66      4575

