train_test_split: Uses 80% of the dataset for training and 20% for testing.
TfidVectorizer: Used to represent the articles in numeric features.
TF: Term frequency, number of times a word is repeated
IDF: Inverse Document Frequency, tells about some important terms by comparing them  with other documents.
LinerSVC: Linear Support Vector Classifier, the actual model used. Best for text data.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [2]:
file_path = 'IFND.csv'

# Specify the encoding (e.g., 'utf-8', 'utf-16', 'iso-8859-1', etc.)
encoding = 'iso-8859-1'

try:
    data = pd.read_csv(file_path, encoding=encoding)
    # Display the first few rows of the DataFrame
    data.head()
except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: {e}")

In [3]:
data['fake'] = data['Label'].apply( lambda x: 0 if x == "TRUE" else 1)

In [4]:
data = data.drop("Label", axis=1)

In [5]:
fake_count = (data['fake'] == 1).sum()  # Assuming 'FAKE' represents fake articles
real_count = (data['fake'] == 0).sum()  # Assuming 'REAL' represents real articles

print(f"Number of Fake Articles: {fake_count}")
print(f"Number of Real Articles: {real_count}")

Number of Fake Articles: 18914
Number of Real Articles: 37800


In [6]:
X = data['Statement'] + data['Web'] + data['Category']

In [7]:
y = data['fake']

In [8]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2)

In [9]:
vectorizer = TfidfVectorizer(stop_words = "english", max_df = 0.7)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [10]:
print("X_train_vectorized shape:", X_train_vectorized.shape)
print("y_train shape:", y_train.shape)

X_train_vectorized shape: (45371, 50823)
y_train shape: (45371,)


In [11]:
len(X_train)

45371

In [13]:
len(X_test)

11343

In [14]:
clf = LinearSVC(dual=False)
clf.fit(X_train_vectorized, y_train)

In [15]:
clf.score(X_test_vectorized, y_test)

0.9629727585294896

In [16]:
clf.score(X_train_vectorized, y_train)

0.9967600449626414

In [17]:
X_test.iloc[10]

'Mumbai Police file charge sheet in fake TRP scamTRIBUNEINDIAVIOLENCE'

In [19]:
with open("my_Text.txt", "w", encoding = "utf-8") as f:
        f.write(X_test.iloc[10])

In [23]:
with open("my_Text.txt", "r", encoding = "utf-8") as f:
        text = f.read()

In [24]:
vectorized_text = vectorizer.transform([text])

In [25]:
prediction = clf.predict(vectorized_text)
if prediction == 0:
    print("The new article is predicted as real.")
else:
    print("The new article is predicted as fake.")

The new article is predicted as real.


In [26]:
y_test.iloc[10]

0