<a href="https://colab.research.google.com/github/hanaallouene/FakeNewsDetection/blob/master/FakeNewsDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/hanaallouene/FakeNewsDetection.git

Cloning into 'FakeNewsDetection'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 9 (delta 1), reused 7 (delta 1), pack-reused 0[K
Unpacking objects: 100% (9/9), 11.34 MiB | 6.57 MiB/s, done.


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Load the dataset
news_df = pd.read_csv('/content/FakeNewsDetection/news.csv')

In [45]:
news_df.shape


(6335, 4)

In [None]:
news_df.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'content'], dtype='object')

In [None]:
news_df.dtypes

Unnamed: 0     int64
title         object
text          object
label         object
content       object
dtype: object

In [None]:
news_df.drop_duplicates(subset=news_df.columns, keep='first', inplace=True)
news_df.shape

(6335, 5)

In [None]:
news_df['label'].value_counts()

REAL    3172
FAKE    3163
Name: label, dtype: int64

In [None]:
# change the value of the 'age' and 'gender' columns for the row where name is 'Bob'
news_df.loc[news_df['Unnamed: 0'] == 8476 , ['title', 'text','label']] = ['Hana ALLOUANE', 'GI2S1','REAL']


In [None]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,Hana ALLOUANE,GI2S1,REAL
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Get a list of English stopwords
stop_words = set(stopwords.words('english'))

In [None]:
news_df['content'] = news_df['title'] + ' ' + news_df['text'] + ' '+ news_df['label']

In [None]:
# Split the data into features (X) and target (y)
X = news_df.drop(["label"], axis=1)
y = news_df["label"]

In [None]:
port_stem = PorterStemmer()

In [None]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z\']',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content =stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content


In [None]:
news_df['content'] = news_df['content'].apply(stemming)


In [None]:
X=news_df['content'].values
y= news_df['label'].values

In [None]:

vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X =vectorizer.transform(X)


In [None]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print('Confusion Matrix:\n', conf_matrix)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Confusion Matrix:
 [[605  33]
 [ 49 580]]
Accuracy: 0.9352801894238358
Precision: 0.9355465186068558
Recall: 0.9352801894238358
F1-score: 0.9352640564986686
