In [11]:
# import kagglehub
# path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")
# print(path)

### Import packages

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from tqdm import tqdm

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     遠端主機已強制關閉一個現存的連線。>


False

In [14]:
# stopword in English
print(stopwords.words('English'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Date preprocessing

In [15]:
# loading dataset
df = pd.read_csv(r'C:\NSYSU\analytics_project\ML\data\WELFake_Dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [16]:
df = df.drop(columns='Unnamed: 0')
df.shape

(72134, 3)

In [17]:
# missing values in the dataset
df.isnull().sum()

title    558
text      39
label      0
dtype: int64

In [18]:
df[(df['title'].isnull()) & (df['text'].isnull())].shape

(0, 3)

In [19]:
# Avoid NaN affecting model prediction → replace missing text with ""
df = df.fillna('')

In [20]:
# merge title and text column
df['content'] = df['title'] + ' ' + df['text']
print(df['content'])

0        LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1           Did they post their votes for Hillary already?
2        UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3        Bobby Jindal, raised Hindu, uses story of Chri...
4        SATAN 2: Russia unvelis an image of its terrif...
                               ...                        
72129    Russians steal research on Trump in hack of U....
72130     WATCH: Giuliani Demands That Democrats Apolog...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    Trump tussle gives unpopular Mexican leader mu...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: content, Length: 72134, dtype: object


In [21]:
X = df.drop(columns='label', axis = 1)
Y = df['label']

#### Stemming

In [37]:
port_stem = PorterStemmer()
stop = set(stopwords.words('english'))
pattern = re.compile('[^a-zA-Z]')

In [38]:
def stemming(content):
    stemmed_content = pattern.sub(' ',content).lower().split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stop]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [39]:
tqdm.pandas()
df['content'] = df['content'].progress_apply(stemming)

100%|██████████| 72134/72134 [07:14<00:00, 166.05it/s]


In [None]:
# df.to_csv('fake_news_data.csv')

In [24]:
df = pd.read_csv(r'C:\NSYSU\analytics_project\ML\practice\fake_news_data.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,law enforc high alert follow threat cop white ...
1,1,,Did they post their votes for Hillary already?,1,post vote hillari alreadi
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,unbeliev obama attorney gener say charlott rio...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,bobbi jindal rais hindu use stori christian co...
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,satan russia unv imag terrifi new supernuk wes...
...,...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,russian steal research trump hack u democrat p...
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,watch giuliani demand democrat apolog trump ra...
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrant refus leav train refuge camp hungari m...
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,trump tussl give unpopular mexican leader much...


In [25]:
X = df['content']
Y = df['label']
print(X.shape, Y.shape)

(72134,) (72134,)


In [26]:
Y.value_counts()

label
1    37106
0    35028
Name: count, dtype: int64

In [27]:
X = X.fillna("")

In [28]:
# convert the textual data to numerical data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [18]:
# print(X)

### traning and test data 

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state= 24)

In [31]:
model = LogisticRegression()
model.fit(X_train, Y_train)

### Evaluation

In [33]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
X_train_accuracy = accuracy_score(X_train_prediction, Y_train)

In [34]:
print(X_train_accuracy)

0.9617377441211639


In [35]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
X_test_accuracy = accuracy_score(X_test_prediction, Y_test)

In [36]:
print(X_test_accuracy)

0.9459347057600332


### Prediction

In [37]:
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

if(prediction[0] == 0):
    print("The news is Real")
else:
    print("The news is Fake")

[1]
The news is Fake


In [41]:
print(Y_test[0])

1
