# Connect with Kagglehub

In [None]:
!pip install kagglehub



In [None]:
from google.colab import files

# files.upload() returns a dict mapping each file name to its raw bytes. Left as
# the last expression of the cell, that dict is echoed into the saved notebook,
# which exposes the Kaggle API key. Binding it to a name keeps it out of the output.
uploaded = files.upload()  # upload kaggle.json from your computer

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"eeiyayi","key":"<REDACTED>"}'}

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle (created if it does not exist yet)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: the file is readable and writable by its owner only
!chmod 600 ~/.kaggle/kaggle.json


# Import Libraries

In [None]:
import kagglehub
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Access CSV and Load into Dataframe

In [None]:
# Fetch the latest version of the dataset and report where it was stored
path = kagglehub.dataset_download("rajatkumar30/fake-news")
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/fake-news


In [None]:
# pandas is already imported as pd in the "Import Libraries" cell, so it is not
# re-imported here. Read news.csv from the downloaded dataset directory.
df = pd.read_csv(path + "/news.csv")

In [None]:
# Preview the first five rows of the loaded frame
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Data Preprocessing

In [None]:
def explore_data(df, label_col=None):
  """Print a plain-text overview of a DataFrame.

  Four sections are printed in order: missing-value counts per column
  (largest first), column dtypes, `describe()` statistics, and the value
  counts of the label column.

  Args:
    df: The pandas DataFrame to summarise.
    label_col: Name of the column holding the class labels. When None
      (the default) the last column of `df` is used, which matches the
      previous behaviour. Pass the name explicitly once extra columns have
      been appended after the label.

  Returns:
    None. Everything is written to stdout.
  """
  # Resolve the label column up front so the fallback is visible in one place
  if label_col is None:
    label_col = df.columns[-1]

  # Checking for missing values
  print("Missing values per column:")
  print(df.isnull().sum().sort_values(ascending=False))
  # Displaying data types
  print("**********")
  print("\nData types of columns:")
  print(df.dtypes)
  # Displaying descriptive statistics
  print("**********")
  print("\nDescriptive statistics:")
  print(df.describe())
  # Check for even distribution of classes
  print("**********")
  print("\nClass distribution:")
  print(df[label_col].value_counts())

In [None]:
# Print the missing-value, dtype, descriptive-statistics and class-balance summary for the raw frame
explore_data(df)

Missing values per column:
Unnamed: 0    0
title         0
text          0
label         0
dtype: int64
**********

Data types of columns:
Unnamed: 0     int64
title         object
text          object
label         object
dtype: object
**********

Descriptive statistics:
         Unnamed: 0
count   6335.000000
mean    5280.415627
std     3038.503953
min        2.000000
25%     2674.500000
50%     5271.000000
75%     7901.000000
max    10557.000000
**********

Class distribution:
label
REAL    3171
FAKE    3164
Name: count, dtype: int64


In [None]:
# Encode the target as integers: FAKE -> 1, REAL -> 0.
# The column is addressed by name instead of by position (df.columns[-1]) so this
# cell still targets the label after later cells append columns to df.
# Series.map is used instead of Series.replace to avoid the pandas FutureWarning
# about implicit downcasting.
LABEL_CODES = {"FAKE": 1, "REAL": 0}
# Values that are already encoded pass through unchanged, so the cell can be re-run.
# astype(int) raises if any value other than FAKE/REAL/1/0 is present.
df["label"] = df["label"].map(lambda value: LABEL_CODES.get(value, value)).astype(int)
df.head()

  df[df.columns[-1]] = df[df.columns[-1]].replace({"FAKE": 1, "REAL": 0})


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


## Stemming

In [None]:
# Download the NLTK stop-word corpus, then list its English entries
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
print(english_stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Anything that is not an ASCII letter is treated as a word separator
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Filled on first use so the stemmer and stop-word list are built only once
_STEM_RESOURCES = {}


def _stem_resources():
    """Return the shared (stemmer, stop-word set) pair, creating it on first call."""
    if not _STEM_RESOURCES:
        _STEM_RESOURCES['stemmer'] = PorterStemmer()
        # A frozenset gives constant-time membership tests instead of a list scan
        _STEM_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _STEM_RESOURCES['stemmer'], _STEM_RESOURCES['stop_words']


def stemming(content):
    """Normalise a piece of text into a space-joined string of word stems.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are dropped, and each remaining
    word is reduced to its Porter stem.

    The previous version constructed a new PorterStemmer and reloaded the
    stop-word list for every single word; both are now created once and
    reused, which yields the same output with far less work per call.

    Args:
        content: The text to process (a str).

    Returns:
        The stemmed words joined by single spaces; an empty string when no
        words remain.
    """
    stemmer, stop_words = _stem_resources()
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [None]:
# Run stemming() on every title and keep the result in a new 'stemmed_title' column
df['stemmed_title'] = df['title'].apply(stemming)
# Display the frame with the added column
df

Unnamed: 0.1,Unnamed: 0,title,text,label,stemmed_title
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,smell hillari fear
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,watch exact moment paul ryan commit polit suic...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,kerri go pari gestur sympathi
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,berni support twitter erupt anger dnc tri warn
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,battl new york primari matter
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0,state depart say find email clinton specialist
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1,p pb stand plutocrat pentagon
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1,anti trump protest tool oligarchi inform
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0,ethiopia obama seek progress peac secur east a...


In [None]:
# Separate features and label: X holds the stemmed titles, Y the encoded 'label' column
X = df['stemmed_title'].values
Y = df['label'].values

In [None]:
# Show the feature array and the label array
print(X)
print(Y)

['smell hillari fear'
 'watch exact moment paul ryan commit polit suicid trump ralli video'
 'kerri go pari gestur sympathi' ...
 'anti trump protest tool oligarchi inform'
 'ethiopia obama seek progress peac secur east africa'
 'jeb bush suddenli attack trump matter']
[1 1 0 ... 1 0 0]


## Vectorizing text into numeric features

In [None]:
# Turn the stemmed titles into a sparse TF-IDF feature matrix.
# The text is read from df rather than from X: the earlier form
# `X = vectorizer.fit_transform(X)` overwrote its own input, so running this
# cell a second time passed a sparse matrix back into the vectorizer and failed.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the vocabulary and IDF weights also see the test titles. Consider
# fitting on the training split only (e.g. via a scikit-learn Pipeline).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['stemmed_title'].values)


In [None]:
# Show the vectorised features (X is now the matrix returned by fit_transform)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 45882 stored elements and shape (6335, 6884)>
  Coords	Values
  (0, 5631)	0.7649346276717717
  (0, 2817)	0.31703295849951846
  (0, 2230)	0.5606827254461214
  (1, 6671)	0.27855534550881683
  (1, 2101)	0.44595444447238664
  (1, 3939)	0.3279355637571237
  (1, 4462)	0.26757273402295384
  (1, 5263)	0.29163264316148546
  (1, 1212)	0.3538286538491304
  (1, 4630)	0.24403179029804972
  (1, 5933)	0.3501812302002013
  (1, 6315)	0.13561903129801023
  (1, 4905)	0.29382045650093463
  (1, 6571)	0.21736822482437437
  (2, 3335)	0.41740393299177314
  (2, 2576)	0.3357697230253524
  (2, 4428)	0.37615749278439464
  (2, 2536)	0.5345737509664292
  (2, 6015)	0.5345737509664292
  (3, 561)	0.296931281656552
  (3, 5956)	0.2780361491213491
  (3, 6359)	0.3719244073533966
  (3, 2048)	0.44483942568507917
  (3, 215)	0.4191687920830454
  (3, 1764)	0.35074544989300727
  :	:
  (6330, 5730)	0.6009118834242939
  (6331, 5796)	0.38445288563338503
  (6331, 4493)	0

# Training Model

In [None]:
# Hold out 20% of the rows for testing; stratify on Y with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Compare class proportions across the full data and both splits to confirm
# that stratification was applied
distribution_checks = (
    ("Original distribution:", Y),
    ("\nTrain distribution:", Y_train),
    ("\nTest distribution:", Y_test),
)
for heading, labels in distribution_checks:
    print(heading)
    print(pd.Series(labels).value_counts(normalize=True))

Original distribution:
0    0.500552
1    0.499448
Name: proportion, dtype: float64

Train distribution:
0    0.500592
1    0.499408
Name: proportion, dtype: float64

Test distribution:
0    0.500395
1    0.499605
Name: proportion, dtype: float64


In [None]:
# Train a logistic regression model (scikit-learn defaults) on the training split
model = LogisticRegression()
model.fit(X_train, Y_train)

# Evaluate the model
# Accuracy on training data
X_train_prediction = model.predict(X_train)
accuracy_score_training = accuracy_score(Y_train, X_train_prediction)

# Accuracy on test data
X_test_prediction = model.predict(X_test)
accuracy_score_test = accuracy_score(Y_test, X_test_prediction)

# Print both scores so training and test accuracy can be compared
print('Accuracy on Training data:', accuracy_score_training)
print('Accuracy on Test data:', accuracy_score_test)

Accuracy on Training data: 0.9104183109707972
Accuracy on Test data: 0.8200473559589582


# Implementing Prediction

In [None]:
def fake_news_check(input, true_label):
  """Print the model's verdict for one sample next to its true label.

  Uses the notebook-level `model` to predict on `input` and reads the first
  element of the prediction: 0 is reported as genuine news, anything else
  as fake.

  Args:
    input: Features passed straight to `model.predict`.
    true_label: Ground-truth class; 0 is shown as REAL, any other value as FAKE.

  Returns:
    None. Two lines are written to stdout.
  """
  predicted_class = model.predict(input)[0]

  if predicted_class == 0:
    verdict = 'Prediction: News Ok!'
  else:
    verdict = 'Prediction: Alert! Fake News!'
  print(verdict)

  actual = 'REAL' if true_label == 0 else 'FAKE'
  print(f"True label was: {actual}")

In [None]:
# Spot-check a single held-out sample: index 400 of the test split and its true label
fake_news_check(X_test[400],Y_test[400])

Prediction: News Ok!
True label was: REAL
