In [1]:
import pandas as pd
import numpy as np

In [2]:
true_ds=pd.read_csv('True.csv')
fake_ds=pd.read_csv('Fake.csv')

In [3]:
true_ds['label']='Real'
fake_ds['label']='Fake'

In [4]:
df=pd.concat([true_ds,fake_ds])
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",Real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",Real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",Real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Real


In [5]:
df.shape

(44898, 5)

In [6]:
df.drop(['title','subject','date'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,Real
1,WASHINGTON (Reuters) - Transgender people will...,Real
2,WASHINGTON (Reuters) - The special counsel inv...,Real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real


In [8]:
df['text'] = df['text'].str.split('-').str[1].str.strip()


In [9]:
df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,Real
1,Transgender people will be allowed for the fir...,Real
2,The special counsel investigation of links bet...,Real
3,Trump campaign adviser George Papadopoulos tol...,Real
4,President Donald Trump called on the U.S. Post...,Real


In [10]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,23481
Real,21417


In [11]:
df['label']=df['label'].map({'Real':0,'Fake':1})
df.head()

Unnamed: 0,text,label
0,The head of a conservative Republican faction ...,0
1,Transgender people will be allowed for the fir...,0
2,The special counsel investigation of links bet...,0
3,Trump campaign adviser George Papadopoulos tol...,0
4,President Donald Trump called on the U.S. Post...,0


In [12]:
df['text'] = df['text'].fillna('')

In [13]:
import spacy
nlp=spacy.load('en_core_web_lg')

In [14]:
df['vector']=df['text'].apply(lambda x : nlp(x).vector)

In [15]:
df.head()

Unnamed: 0,text,label,vector
0,The head of a conservative Republican faction ...,0,"[-2.436314, 1.1404501, -2.9648302, 1.1125128, ..."
1,Transgender people will be allowed for the fir...,0,"[-1.9311131, 1.3226557, -2.2779288, 0.6602417,..."
2,The special counsel investigation of links bet...,0,"[-0.775355, 1.0861657, -2.4667153, 0.5953654, ..."
3,Trump campaign adviser George Papadopoulos tol...,0,"[-1.8743728, 0.7379386, -1.4033372, 1.3280536,..."
4,President Donald Trump called on the U.S. Post...,0,"[-1.7937076, 0.3475672, -1.3643197, 0.84292394..."


In [16]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.vector.values,df.label,test_size=0.2,random_state=5,stratify=df['label'])

In [17]:
x_train.shape

(35918,)

In [18]:
x_test.shape

(8980,)

In [19]:
x_train_2d=np.stack(x_train)
x_test_2d=np.stack(x_test)

In [20]:
x_train_2d

array([[-1.16274   ,  3.2530763 , -2.742282  , ..., -0.17773712,
        -2.5050895 ,  1.8259157 ],
       [-2.0341513 ,  0.89350665, -3.004121  , ...,  1.8320181 ,
        -2.2079134 ,  1.9760752 ],
       [-1.1109338 ,  0.86902267, -3.5973928 , ..., -0.992824  ,
        -3.5579016 ,  1.0012761 ],
       ...,
       [-0.40839   , -1.1712    ,  1.5625    , ..., -4.5445    ,
         2.7932    ,  0.18958   ],
       [-2.1898785 ,  0.11179533, -1.9661164 , ..., -1.9949869 ,
        -1.7981565 ,  0.61359584],
       [-2.1253862 , -0.2021987 , -1.2128632 , ..., -1.5192801 ,
        -2.0395575 ,  1.166933  ]], dtype=float32)

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()
scaled_x_train=scaler.fit_transform(x_train_2d)
scaled_x_test=scaler.transform(x_test_2d)
clf=MultinomialNB()
clf.fit(scaled_x_train,y_train)

In [22]:
from sklearn.metrics import classification_report
y_pred=clf.predict(scaled_x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.56      0.67      4284
           1       0.69      0.89      0.78      4696

    accuracy                           0.74      8980
   macro avg       0.76      0.73      0.73      8980
weighted avg       0.76      0.74      0.73      8980



In [23]:
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier(n_neighbors=5,metric='euclidean')
clf.fit(x_train_2d,y_train)
y_pred=clf.predict(x_test_2d)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      4284
           1       0.94      0.92      0.93      4696

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [24]:
import spacy
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# Load spaCy model and the trained classifier
nlp = spacy.load('en_core_web_lg')  # Load the pre-trained spaCy model
# The scaler and classifier should be passed to the function, not initialized here
# scaler = MinMaxScaler()  # Initialize the scaler (used during training) # remove this line
# clf = KNeighborsClassifier(n_neighbors=5, metric='euclidean')  # KNN model (trained earlier) # remove this line

# Function to process input text and make predictions
def predict_news_category(text, clf, nlp, scaler):  # Pass scaler and clf as arguments
    # 1. Preprocess the text (similar to the preprocessing steps in training)
    cleaned_text = text.split('-')[1].strip() if '-' in text else text.strip()  # Clean text

    # 2. Convert text to word vectors using spaCy
    text_vector = nlp(cleaned_text).vector

    # 3. Scale the vector (same scaling as in training)
    scaled_text_vector = scaler.transform([text_vector])  # Use the same scaler fitted on the training data

    # 4. Predict using the KNN model
    prediction = clf.predict(scaled_text_vector)

    # Map prediction to label
    label = "Real" if prediction == 0 else "Fake"

    return label

# Example of using the function with a sample text
sample_text = " Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing"


# Load the trained classifier from the notebook
# Note, this assumes the variables x_train_2d, y_train, clf, and scaler from previous cells are available in the notebook's scope.
# Replace with the actual trained scaler and classifier objects
# scaler is fitted in cell 21
# clf is fitted in cell 23
# Example of using the function with a sample text
prediction = predict_news_category(sample_text, clf, nlp, scaler)

print(f"The prediction for the sample text is: {prediction}")

The prediction for the sample text is: Fake
