In [27]:
# Problem statement: classify news articles as fake or real from their text (binary `label` column).

In [28]:
# importing the libraries 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline
import sqlite3

In [29]:
# List the tables stored in the SQLite database.

conn = sqlite3.connect('classification.db')
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()  # list of 1-tuples, one per table name
finally:
    # Always release the handle; the loading cell further down opens its own
    # connection, so this one would otherwise stay open.
    conn.close()
print(tables)


[('fraud_detection',), ('Heart_disease',), ('Metro_Train_Maintenance',), ('forest_cov_type',), ('fake_news_classification',), ('drug_reviews_sentiment_analysis',), ('Telecom_microservices_loan',), ('malware_detection',), ('fault_detection_manufacturing',)]


In [30]:
# Load the fake_news_classification table into a DataFrame.

conn = sqlite3.connect("classification.db")  # open the connection
try:
    # Read the whole table into pandas.
    df = pd.read_sql('SELECT* FROM fake_news_classification', conn)
finally:
    # Close the connection even when the query raises.
    conn.close()


In [31]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,label
0,0,26610,ZeroHedge.com ALL CONTENT ON 'SGTREPORT.COM' A...,0
1,1,8124,As soon as news broke of notorious hatemonger ...,0
2,2,24978,"When I was reading the news this morning, I di...",0
3,3,29057,"NAIROBI, Kenya — A suicide bomb rocked the ...",1
4,4,33774,Email \nIt’s not exactly breaking news that on...,0


In [32]:
# List the column names of the loaded frame.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'text', 'label'], dtype='object')

In [33]:
# Remove the two leftover 'Unnamed' columns, which are treated as insignificant.
# errors="ignore" skips labels that are already gone, so re-running this cell
# no longer raises KeyError.
df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], inplace=True, errors="ignore")

In [34]:
# Preview the frame again after the 'Unnamed' columns were dropped.
df.head()

Unnamed: 0,text,label
0,ZeroHedge.com ALL CONTENT ON 'SGTREPORT.COM' A...,0
1,As soon as news broke of notorious hatemonger ...,0
2,"When I was reading the news this morning, I di...",0
3,"NAIROBI, Kenya — A suicide bomb rocked the ...",1
4,Email \nIt’s not exactly breaking news that on...,0


In [35]:
# Count the missing (null) values in each column.
df.isnull().sum()

text     29
label     0
dtype: int64

In [36]:
# Drop the rows whose 'text' is missing (NaN). This runs on the full dataset;
# no train/test split has been made at this point.
df.dropna(subset=["text"], inplace=True)

In [37]:
# Count the distinct values in each column.
df.nunique()

text     68604
label        2
dtype: int64

In [38]:
import re

# Delete every character that is neither a word character (\w) nor whitespace (\s).
df['text'] = df['text'].str.replace(pat=r'[^\w\s]', repl='', regex=True)

In [39]:
import string

# Translation table built once instead of on every call; it deletes every
# character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation; every other character
        (letters, digits, whitespace, non-ASCII symbols) is left unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Run remove_punctuation on every row. The earlier regex step already removed
# all non-word, non-space characters, so in practice this only strips the
# underscores that \w let through.
df['text'] = df['text'].map(remove_punctuation)

In [40]:
# Convert all text to lower case.
df['text'] = df['text'].str.lower()

In [41]:
# Remove every run of non-space characters that begins at "http" or "www".
# Punctuation has already been stripped by the earlier cells, so a URL is a
# single glued token by the time this runs.
df['text'] = df['text'].str.replace(pat=r'http\S+|www\S+', repl='', regex=True)

In [42]:
# Collapse each whitespace run into a single space, then trim both ends.
df['text'] = (
    df['text']
    .str.replace(pat=r'\s+', repl=' ', regex=True)
    .str.strip()
)

In [43]:
# Preview the cleaned text column.
df[['text']].head()

Unnamed: 0,text
0,zerohedgecom all content on sgtreportcom as we...
1,as soon as news broke of notorious hatemonger ...
2,when i was reading the news this morning i dis...
3,nairobi kenya a suicide bomb rocked the somali...
4,email its not exactly breaking news that only ...


In [44]:
# Confirm which columns remain after cleaning.
df.columns

Index(['text', 'label'], dtype='object')

In [45]:
# Preview the cleaned frame, including the labels.
df.head()

Unnamed: 0,text,label
0,zerohedgecom all content on sgtreportcom as we...,0
1,as soon as news broke of notorious hatemonger ...,0
2,when i was reading the news this morning i dis...,0
3,nairobi kenya a suicide bomb rocked the somali...,1
4,email its not exactly breaking news that only ...,0


In [46]:
# Class balance: normalize=True reports the share of rows per label rather than raw counts.
df['label'].value_counts(normalize=True)

label
0    0.555327
1    0.444673
Name: proportion, dtype: float64

In [47]:
# Features are the cleaned article text; the target is the binary label.
X, y = df['text'], df['label']

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold out 20% of the rows for evaluation; stratify=y keeps the label ratio
# the same in both parts, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features (unigrams and bigrams, English stop words removed) feeding a
# logistic-regression classifier. The step names "tfidf" and "clf" are reused
# as prefixes by the grid-search parameter names.
pipe = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                stop_words="english", ngram_range=(1, 2), max_df=0.9, min_df=5
            ),
        ),
        ("clf", LogisticRegression(max_iter=1000, n_jobs=-1)),
    ]
)

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.95      0.97      0.96      8729
           1       0.96      0.93      0.94      6989

    accuracy                           0.95     15718
   macro avg       0.95      0.95      0.95     15718
weighted avg       0.95      0.95      0.95     15718



In [49]:
from sklearn.model_selection import GridSearchCV

# Search grid; each key is "<pipeline step>__<parameter>".
params = dict(
    tfidf__max_df=[0.7, 0.9],
    tfidf__min_df=[3, 5],
    clf__C=[0.5, 1, 2],
)

# 3-fold cross-validated search over the grid on the training split, scored by F1.
gs = GridSearchCV(
    estimator=pipe, param_grid=params, scoring='f1', cv=3, n_jobs=-1
).fit(X_train, y_train)

print(gs.best_params_)



{'clf__C': 2, 'tfidf__max_df': 0.7, 'tfidf__min_df': 5}


In [50]:
import joblib

# Persist the tuned pipeline. GridSearchCV refits the best parameter
# combination on the whole training split by default (refit=True), so
# best_estimator_ is ready to use. Saving `pipe` instead would throw the
# search result away.
joblib.dump(gs.best_estimator_, "fake_news_model.pkl")

['fake_news_model.pkl']

In [60]:
# One example headline used as a quick check of the fitted pipeline.
sample = [
    "Breaking: Scientists confirm earth is flat according to secret nasa report",
]

# predict() takes a collection of documents and returns one label per document.
pipe.predict(sample)

array([0])

In [64]:
def _clean_text(text):
    """Normalise one raw string with the same steps, in the same order, as the training text."""
    text = re.sub(r'[^\w\s]', '', text)  # drop non-word, non-space characters
    text = text.translate(str.maketrans('', '', string.punctuation))  # leftover underscores
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # glued URL tokens
    return re.sub(r'\s+', ' ', text).strip()  # collapse whitespace


def predict_news(text, model=None):
    """Classify a single piece of news text as fake or real.

    The training column was cleaned outside the pipeline, so the raw input is
    cleaned here in the same way before it reaches the model. Otherwise tokens
    such as "zerohedge.com" would be split differently at inference time than
    the "zerohedgecom" seen during training.

    Args:
        text: Raw news text (a single string).
        model: Optional fitted estimator exposing ``predict``; defaults to the
            notebook-level ``pipe``. Pass e.g. a tuned or reloaded pipeline.

    Returns:
        "FAKE NEWS" when the predicted label is 0, otherwise "REAL NEWS".

    NOTE(review): the 0 = fake / 1 = real mapping is taken from the original
    function; confirm it against the dataset's label definition.
    """
    if model is None:
        model = pipe  # resolved at call time so the notebook-level pipeline is used
    pred = model.predict([_clean_text(text)])[0]
    return "FAKE NEWS" if pred == 0 else "REAL NEWS"

In [65]:
# Try the helper on a short example headline.
predict_news("NASA confirms earth is flat")

'FAKE NEWS'