## About this Dataset

1. **id**: Unique ID for each news article  
2. **title**: The title of the article  
3. **author**: Author of the news article  
4. **text**: The text of the article (could be incomplete)  
5. **label**: A label that marks whether the news article is fake or real  
   - `0`: Real News  
   - `1`: Fake News  


Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("dataset/train.csv")

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
y=df['label']

In [5]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [6]:
df.shape

(20800, 5)

In [7]:
print(df['label'].value_counts())

label
1    10413
0    10387
Name: count, dtype: int64


In [8]:
# Set style
sns.set_style("whitegrid")

# Create the plot on an explicit figure/axes pair so every later call
# targets this Axes rather than pyplot's implicit "current" one.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df['label'], palette="coolwarm", ax=ax)

# Add value labels on top of bars. Bar heights may come back as floats, so
# format them as whole numbers to avoid labels such as "10413.0".
for p in ax.patches:
    ax.annotate(f'{p.get_height():.0f}',
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom', fontsize=12, fontweight='bold', color='black')

# Labels and title
ax.set_xlabel("Label", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_title("Distribution of Real (0) and Fake (1) News", fontsize=14)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Real", "Fake"], fontsize=12)

plt.show()

NameError: name 'sns' is not defined

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer

In [None]:
df.isnull().sum()

In [None]:
df=df.dropna()

In [None]:
messages=df.copy()

In [None]:
# Renumber the rows 0..n-1 after the NaN rows were dropped, so label-based
# lookups like messages['title'][i] line up with range(len(messages)).
# Reassigning with drop=True (instead of inplace=True) keeps this cell safe to
# re-run and avoids adding a stray 'index' column to the frame.
messages = messages.reset_index(drop=True)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps=PorterStemmer()

# Load the stop-word list once, as a set: the original re-read the list and
# scanned it linearly for every single token, which dominates the run time.
english_stopwords = set(stopwords.words('english'))

# Build one cleaned, stemmed string per article title.
corpus=[]
for title in messages['title']:
    # Keep letters only, lower-case, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stemmed = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [None]:
# Preview a few cleaned titles instead of dumping the entire list into the
# notebook output.
corpus[:5]

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer
# Bag-of-words over unigrams, bigrams and trigrams, with the vocabulary capped
# at 5000 features.
cv=CountVectorizer(max_features=5000,ngram_range=(1,3))
# Fit the vocabulary on the cleaned titles and convert the resulting count
# matrix to a dense NumPy array (one row per title, one column per feature).
X=cv.fit_transform(corpus).toarray()

In [None]:
X.shape

In [None]:
y=messages['label']

In [None]:
#Train Test split
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=0)

In [None]:
cv.get_feature_names_out()[:20]

In [None]:
cv.get_params()

In [None]:
count_df=pd.DataFrame(X_train,columns=cv.get_feature_names_out())

In [None]:
count_df.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line status message and plot a confusion matrix as a heatmap.

    Args:
        cm: 2-D confusion matrix (array-like). Rows are drawn on the
            "True label" axis and columns on the "Predicted label" axis.
        classes: Tick labels, in the same order as the rows/columns of `cm`.
        normalize: If True, each row is divided by its sum before plotting,
            so both the colours and the cell text show per-row fractions.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used for the heatmap.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heatmap colours and the cell text
    # describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row sum; leave that row at
        # 0 instead of dividing by zero and producing NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts print as-is; fractions are rounded to two decimals so
    # the annotations stay readable.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Use white text on dark cells and black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()  # This will display the plot

# Sample data for testing
# A small three-class toy example used only to check that the plotting helper
# renders; it is unrelated to the news dataset.
y_true = [0, 1, 2, 2, 0, 1, 1]  # True labels
y_pred = [0, 0, 2, 2, 0, 2, 1]  # Predicted labels
classes = ['Class 0', 'Class 1', 'Class 2']  # Class names

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Call the plot function
plot_confusion_matrix(cm, classes, normalize=False)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier created with the library's default
# hyper-parameters; it is fitted in a later cell.
logistic_clf = LogisticRegression()


In [None]:
from sklearn import metrics
import numpy as np
import itertools

In [None]:
# Fit on the training split and evaluate on the held-out split.
logistic_clf.fit(X_train, Y_train)
pred = logistic_clf.predict(X_test)
score = metrics.accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)
# Pin the row/column order explicitly: index 0 is label 0 (real news) and
# index 1 is label 1 (fake news).
cm = metrics.confusion_matrix(Y_test, pred, labels=[0, 1])
# Tick labels must follow that same order, so REAL comes first. The previous
# ['FAKE', 'REAL'] order labelled the two classes the wrong way round.
plot_confusion_matrix(cm, classes=['REAL', 'FAKE'])

### Passive Aggressive Classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Use max_iter instead of n_iter
# random_state fixes the order in which training samples are shuffled, so the
# fitted model and its reported accuracy are the same on every run.
linear_clf = PassiveAggressiveClassifier(max_iter=50, random_state=0)


In [None]:
# Fit on the training split and evaluate on the held-out split.
linear_clf.fit(X_train, Y_train)
pred = linear_clf.predict(X_test)
score = metrics.accuracy_score(Y_test, pred)
print("accuracy:   %0.3f" % score)
# Pin the row/column order explicitly: index 0 is label 0 (real news) and
# index 1 is label 1 (fake news).
cm = metrics.confusion_matrix(Y_test, pred, labels=[0, 1])
# Tick labels must follow that same order, so the REAL class comes first. The
# previous order labelled the two classes the wrong way round.
plot_confusion_matrix(cm, classes=['REAL Data', 'FAKE Data'])

### Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Initialize the Decision Tree classifier. A fixed random_state makes the
# fitted tree, and therefore the reported accuracy, reproducible across runs.
dt_clf = DecisionTreeClassifier(random_state=0)

# Train the model
dt_clf.fit(X_train, Y_train)

# Make predictions
pred = dt_clf.predict(X_test)

# Calculate accuracy
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy:   %0.3f" % score)

# Compute confusion matrix with an explicit label order: index 0 is label 0
# (real news) and index 1 is label 1 (fake news).
cm = metrics.confusion_matrix(Y_test, pred, labels=[0, 1])

# Use your existing function to plot confusion matrix. Tick labels must follow
# the label order above, so the REAL class comes first; the previous order
# labelled the two classes the wrong way round.
plot_confusion_matrix(cm, classes=['REAL Data', 'FAKE Data'])


In [None]:
import numpy as np
from scipy.special import expit  # For the sigmoid function

def predict_news(news_text: str, threshold: float = 0.7) -> str:
    """
    Classify a piece of news text as real or fake by averaging three models.

    The text is first normalised exactly like the training corpus (letters
    only, lower-cased, stop words removed, Porter-stemmed) and then vectorised
    with the fitted `cv`. Each model contributes a score for the FAKE class
    (label 1):

    - Passive-Aggressive: the sigmoid of its decision-function value. This is
      a squashed margin, not a calibrated probability.
    - Logistic Regression and Decision Tree: `predict_proba` for label 1.

    The per-model scores are printed, and their mean is compared with
    `threshold` to produce the final label.

    Args:
        news_text (str): The text of the news to classify.
        threshold (float): The averaged FAKE score must exceed this value for
            the text to be classified as FAKE.

    Returns:
        str: "FAKE" if the averaged score is above `threshold`, else "REAL".
    """
    # Apply the same cleaning used to build the training corpus. The
    # vectorizer's vocabulary consists of stemmed, stop-word-free tokens, so
    # raw text would miss most of the features the models were trained on.
    english_stopwords = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', news_text).lower().split()
    cleaned_text = ' '.join(ps.stem(tok) for tok in tokens if tok not in english_stopwords)
    vectorized_text = cv.transform([cleaned_text])

    # Passive-Aggressive has no predict_proba; squash its signed margin into
    # (0, 1) so it can be averaged with the other two scores.
    pa_decision_values = linear_clf.decision_function(vectorized_text)[0]
    pa_probabilities = float(expit(pa_decision_values))

    # Column 1 of predict_proba is the score for label 1 (FAKE).
    logistic_probabilities = float(logistic_clf.predict_proba(vectorized_text)[0][1])
    dt_probabilities = float(dt_clf.predict_proba(vectorized_text)[0][1])

    # Average the scores from all three models for the final decision
    avg_probability = (pa_probabilities + logistic_probabilities + dt_probabilities) / 3
    final_prediction_label = 'FAKE' if avg_probability > threshold else 'REAL'

    # Class labels and corresponding scores
    classes = ['REAL', 'FAKE']
    class_probabilities_pa = [1 - pa_probabilities, pa_probabilities]
    class_probabilities_logistic = [1 - logistic_probabilities, logistic_probabilities]
    class_probabilities_dt = [1 - dt_probabilities, dt_probabilities]

    # Display scores from all models
    print(f"Passive-Aggressive Classifier Probabilities: {dict(zip(classes, class_probabilities_pa))}")
    print(f"Logistic Regression Probabilities: {dict(zip(classes, class_probabilities_logistic))}")
    print(f"Decision Tree Probabilities: {dict(zip(classes, class_probabilities_dt))}")

    # Return the final prediction based on the averaged scores
    return final_prediction_label


In [None]:
# Classify the same headline whose ground-truth label is printed, so the
# prediction can be compared directly against it. The original printed row 0
# but classified a different hard-coded headline.
sample_title = df['title'].iloc[0]
print(sample_title)
print(df['label'].iloc[0])
predict_news(sample_title)

In [None]:
import joblib

# Output file name -> fitted object to persist. Written in this order:
# Passive-Aggressive classifier, Logistic Regression model, Decision Tree
# classifier, and finally the fitted vectorizer.
artifacts = {
    'linear_clf.pkl': linear_clf,
    'logistic_clf.pkl': logistic_clf,
    'decision_tree.pkl': dt_clf,
    'vectorizer.pkl': cv,
}

# Serialize every artifact to its file in the current working directory.
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("All models and vectorizer saved successfully!")
