In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('fake_and_real_news.csv')

In [3]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [4]:
df.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [5]:
df.shape

(9900, 2)

In [6]:
def map_category(category):
    category_map = {
        'Fake': 0,
        'Real': 1,
    }
    return category_map.get(category)
df['label'] = df['label'].apply(map_category)

In [7]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1
3,Court Forces Ohio To Allow Millions Of Illega...,0
4,Democrats say Trump agrees to work on immigrat...,1


In [8]:
total_word_count = 0
for content in df['Text']:
    total_word_count += sum(1 for _ in content.split())
print(total_word_count)

4188085


In [9]:
ps = PorterStemmer()

In [10]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content


In [11]:
# so after pre-proccessing around 2M stopwords were removed 
total_word_count = 0
for content in df['Text']:
    total_word_count += sum(1 for _ in content.split())
print(total_word_count)

4188085


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_squared_error, r2_score, mean_absolute_error


In [13]:
X = df['Text']
Y = df['label']

In [14]:
X_train , X_test , Y_train , Y_test  = train_test_split(X, Y, test_size = 0.2 , stratify = Y , random_state = 42)

In [15]:
vc = TfidfVectorizer()
X_train = vc.fit_transform(X_train)
X_test = vc.transform(X_test)

In [16]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.svm import SVC

In [17]:
model = SVC()
model.fit(X_train, Y_train)

# Make predictions
Y_pred_cls = model.predict(X_test)

accuracy_cls = accuracy_score(Y_test, Y_pred_cls)

f1_cls = f1_score(Y_test, Y_pred_cls, average='weighted')

report_cls = classification_report(Y_test, Y_pred_cls)
    
# Print the results for classification
print(f"Model: SVM")
print(f"Accuracy: {accuracy_cls:.4f}")
print(f"F1-Score: {f1_cls:.4f}")    
print(f"Classification Report:\n{report_cls}")
print("="*50)

Model: SVM
Accuracy: 0.9949
F1-Score: 0.9949
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.99      0.99       980

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [18]:
def val_to_category(val):
    category_map = {
        0:'Fake',
        1:'Real'
     }
    return category_map.get(val,-1)

In [19]:
def make_predictions(text):
    text = stemming(text)
    text = vc.transform([text])
    val = model.predict(text)
    val = val_to_category(int(val[0]))
    print("News category is : ",val)

In [20]:
make_predictions('kolhi have 50 centuries')

News category is :  Real


In [21]:
make_predictions('google is no longer the best search engine')

News category is :  Real


In [22]:
make_predictions("modi is not pm of india")

News category is :  Real


In [23]:
import pandas as pd

# Load your dataset
dataset_path = r"C:\Users\kalyan nagu\OneDrive\Desktop\hello\fake_and_real_news.csv"
df = pd.read_csv(dataset_path)

# Display first few rows
print(df.head())


                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real


In [24]:
print(df.columns)


Index(['Text', 'label'], dtype='object')


In [25]:
import pandas as pd

# Load dataset
dataset_path = r"C:\Users\kalyan nagu\OneDrive\Desktop\hello\fake_and_real_news.csv"
df = pd.read_csv(dataset_path)

# Print the column names
print(df.columns)


Index(['Text', 'label'], dtype='object')


In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pandas as pd

# Load dataset
dataset_path = r"C:\Users\kalyan nagu\OneDrive\Desktop\hello\fake_and_real_news.csv"
df = pd.read_csv(dataset_path)

# Fix column names
df.columns = df.columns.str.strip()

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Text']).toarray()  # Use correct column name
y = df['label'].map({'Fake': 0, 'Real': 1})  # Convert labels to numeric

# Train SVM Model
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X, y)

# Save Model & Vectorizer
with open("svm_model.pkl", "wb") as model_file:
    pickle.dump(svm_model, model_file)

with open("vectorizer.pkl", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

print("✅ Model saved successfully!")


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Function to clean text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = word_tokenize(text)  # Tokenization
    text = [word for word in text if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(text)

# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to C:\Users\kalyan
[nltk_data]     nagu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\kalyan
[nltk_data]     nagu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: 'text'