In [5]:
import pandas as pd
import numpy as np

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Text Preprocessing Libraries
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from wordcloud import WordCloud, STOPWORDS

#Model Building libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split,GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [6]:
#Load the dataset from local
df=pd.read_csv('news.csv')
df.head()

ParserError: Error tokenizing data. C error: EOF inside string starting at row 225

In [None]:
# Check for missing values
print(df.isnull().sum())

# Check the distribution of the target variable
print(df['label'].value_counts())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Plot the distribution of the labels
plt.figure(figsize=(8, 6))
# Pass the entire DataFrame and specify the column for x
sns.countplot(x='label', data=df)  # Changed line
plt.title('Distribution of Real vs Fake News')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()


We can see that there are no missing values in any of the feature and the output variable is well balanced.


In [None]:
# Replace 'FAKE' with 0 and 'REAL' with 1 in the 'LABEL' column
df['LABEL'] = df['label'].replace({'FAKE': 0, 'REAL': 1})
df.head()

In [None]:
df=df.drop('label',axis=1)
df.head()

In [None]:
df['text'][0]

We will combine the title and text here:

1. The title often summarizes the main idea or the most important aspect of the article, while the text provides detailed information. Combining both can give a more complete picture of the content.

2. **Augmenting Short Articles:** For very short articles, the title can provide additional context and information that might not be present in the body text alone.

3. **Use in the project:** Fake news articles often have sensational titles that may not match the content. Combining the title and text helps the model to detect such inconsistencies.


In [None]:
# combine title and text together
df['original'] = df['title'] + ' ' + df['text']
df.head()

In [None]:
df['original'][0]

### **Stopwords**

1. **Stop words are a set of commonly used words in a language. Examples of stop words in English are “a,” “the,” “is,” “are,” etc.**

2. We have to remove the words which carry little to no useful information at all from our dataset.

3. This can be done by maintaining a list of stop words (which can be manually or automatically curated) and preventing all words from your stop word list from being analyzed.

4. Stopwords helps us reduce noise, increase effieciency and enhance model performance by reducing dimensionality.

In tasks such as document similarity or clustering, stopwords can introduce noise that affects the accuracy of similarity measures. Removing stopwords leads to more accurate and meaningful similarity calculations.

In [None]:
# download stopwords
nltk.download("stopwords")

In [None]:
# Obtain additional stopwords from nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extend the stopwords list with domain-specific stopwords
extra_stopwords = [
    'breaking', 'report', 'latest', 'update', 'exclusive', 'headline',
    'according', 'sources', 'allegedly', 'reported', 'confirm', 'statement',
    'yesterday', 'today', 'tomorrow', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'new york', 'washington',
    'said', 'added', 'told', 'commented', 'mentioned',
    'what', 'how', 'when', 'where', 'who'
]

stop_words.extend(extra_stopwords)
stop_words

### **Tokenization**

The simple_preprocess function from Gensim:

1. Converts the text to lowercase.
2. Tokenizes the text into individual words.
3. Removes punctuation.
4. Optionally removes words that are too short or too long (though the latter is not explicitly configured here).

In [None]:
# Remove stopwords and remove words with 2 or less characters
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):     #tokenization
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 and token not in stop_words:
            result.append(token)

    return result

In [None]:
# Apply the function to the dataframe
df['clean'] = df['original'].apply(preprocess)

In [None]:
df.head()

**List of Words:**

1. By generating a list of all words, we can perform frequency analysis to identify commonly occurring words that might not be informative for our classification task. These frequent but non-discriminative words can be added to our stopwords list to improve the quality of your text data.

2. We can do the frequency of word calculations and everything with either CountVectorizer or TFIDF Transformer.



In [None]:
# Obtain the total words present in the dataset
list_of_words = []
for i in df.clean:
    for j in i:
        list_of_words.append(j)

In [None]:
list_of_words


In [None]:
len(list_of_words)

In [None]:
# Obtain the total number of unique words
total_words = len(list(set(list_of_words)))
total_words


We should now convert lists of cleaned words into single strings in a new column 'clean_joined' in our DataFrame.

1. This transformation is often necessary for further text processing tasks
like vectorization or modeling, where each document (in this case, a news article) needs to be represented as a single string of text. This process ensures that your data is in a format suitable for subsequent NLP tasks.

In [None]:
#Join the words into a string
# Apply the following lambda function to each element in the 'clean' column of the dataframe 'df':
# - For each element 'x' in the 'clean' column, join the list of words in 'x' into a single string with spaces in between.
# - Assign the resulting string to a new column called 'clean_joined' in the dataframe 'df'.
df['clean_joined'] = df['clean'].apply(lambda x: " ".join(x))

# Display the first few rows of the dataframe to verify the new column has been added and populated correctly.
df.head()

In [None]:
df['text'][0]

In [None]:
df['clean_joined'][0]

In [None]:
#create a new column in the DataFrame called 'length'
df['length'] = df['original'].apply(len)
df.head()

## **Data Visualisation**

### **Word Count Distribution**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your DataFrame with a 'text' column and a 'label' column
df['word_count'] = df['clean_joined'].apply(lambda x: len(x.split()))

# Plotting word count distribution
plt.figure(figsize=(10, 6))
df[df['LABEL'] == 0]['word_count'].hist(alpha=0.5, color='blue', bins=50, label='1')
df[df['LABEL'] == 1]['word_count'].hist(alpha=0.5, color='red', bins=50, label='0')
plt.legend()
plt.title('Word Count Distribution in Real vs Fake News')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()


### **Explanation:**

**Distribution Characteristics:**

1. Most articles have a word count less than 1000.
2. Real news articles (blue) are generally more frequent across all word count ranges.
3. Fake news articles (red) are present but less frequent compared to real news articles.

**Insights:**

**1. High Concentration of Short Articles:** Both fake and real news have a high concentration of articles with low word counts (0-500 words). This indicates that shorter articles are more common.

**2. Longer Articles:** Real news has more instances of longer articles (up to around 4000-5000 words), whereas fake news articles tend to have shorter word counts and fewer long articles.

**3. Overlap:** There is significant overlap in the word count distribution of fake and real news, suggesting that word count alone may not be a sufficient distinguishing feature but could still provide useful insights when combined with other features. Therefore, it is essential to use more sophisticated features (like TF-IDF, word embeddings, etc.) in our classifier.

### **Word Cloud Analysis**

In [None]:
from wordcloud import WordCloud

# Word cloud for fake news
fake_news_text = " ".join(df[df['LABEL'] == 1]['clean_joined'])
wordcloud_fake = WordCloud(width=800, height=400, background_color='black').generate(fake_news_text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_fake, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Fake News')
plt.show()

# Word cloud for real news
real_news_text = " ".join(df[df['LABEL'] == 0]['clean_joined'])
wordcloud_real = WordCloud(width=800, height=400, background_color='white').generate(real_news_text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_real, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Real News')
plt.show()


### **Analysis of the Word Cloud for fake news**

**Prominent Words:**

1. Clinton and Trump are the most prominent words, indicating that these names appear frequently in fake news articles. This suggests that fake news often targets or involves prominent political figures.
2. Other significant words include year, people, president, candidate, state, and campaign.

**Political Focus:**

1. Many of the words are related to politics, such as democrat, republican, government, election, voter, and political. This indicates a heavy focus on political topics in fake news content.

**Common Topics:**

1. Words like country, vote, support, issue, debate, attack, and campaign suggest that common topics in fake news include political campaigns, elections, and various political issues.

**Context and Implications:**

1. The prevalence of words such as obama, bush, rubio, and cruz indicates that fake news articles may discuss a range of political figures and not just the most current or prominent ones.
2. Words like fact, says, likely, and issue might be used in attempts to lend credibility to the fake news articles, making them appear more legitimate or factual to readers.


### **Analysis of the Word Cloud for Real News**

**Prominent Words:**

1. Similar to the fake news word cloud, year, people, time, trump, and clinton are prominent, indicating these are common topics in real news articles as well.

**Political Focus:**

1. Words such as government, election, president, russia, american, state, and political indicate that real news articles also focus heavily on political topics.

**Common Topics:**

1. Terms like russia, world, problem, issue, military, public, and power suggest that real news covers a wide range of global and national issues, including international relations, public affairs, and governance.

**Comparison with Fake News:**

1. Both word clouds have a significant overlap in terms of the most frequent words, such as clinton, trump, people, year, time, and government. This indicates that both fake and real news articles discuss similar high-profile topics.

**The fake news word cloud seems to have a heavier focus on individual political figures and sensational terms, while the real news word cloud includes broader topics like world, problem, issue, and public.**

## **Count Vectorizer**

The primary purpose of Count Vectorizer is to convert a collection of text documents into a matrix of token counts, which can then be used as input for machine learning algorithms.

When you use the Count Vectorizer, it tokenizes each document, builds a vocabulary of all the tokens (words) present across all documents, and then counts the occurrences of each token in each document.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer().fit(df.clean_joined)
print(len(cv.vocabulary_))

In [None]:
example = df['original'][0]
example

In [None]:
vec = cv.transform([example])
print(vec)
print(vec.shape)

In [None]:
# Create a reverse dictionary to map indices to tokens
reverse_vocabulary = {index: token for token, index in cv.vocabulary_.items()}

# Find the token with index 111
token_with_index_111 = reverse_vocabulary[111]
print(token_with_index_111)

In [None]:
# New variable to hold the transformed versions of the news.
news_vec = cv.transform(df.clean_joined)
print(news_vec)

Note: Each row corresponds to a document, each column corresponds to a token from the vocabulary, and the values represent the count of the token in the respective document.

## **TFIDF Vectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['clean_joined'])
X[:5].toarray()

In [None]:
y=df['LABEL']
y.head()

## **Model Building**

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

In [None]:
# Models
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier()
}

In [None]:
# Train and evaluate each model
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    results[model_name] = evaluate_model(model, X_test, y_test)

In [None]:
# Print the evaluation results
for model_name, (accuracy, precision, recall, f1) in results.items():
    print(f"{model_name}:")
    print(f"  Accuracy: {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")
    print(f"  F1 Score: {f1:.2f}")
    print()

In [None]:
# Define the XGBoost model
XGB_model = XGBClassifier()
XGB_model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

## **STREAMLIT APP**

Streamlit is an open-source Python framework for data scientists and AI/ML engineers to deliver dynamic data apps with only a few lines of code. Build and deploy powerful data apps in minutes

In [None]:
!pip install streamlit
!pip install pyngrok

In [None]:
!pip install streamlit --quiet
!npm install -g localtunnel

In [None]:
import joblib
# Save the model and vectorizer
joblib.dump(XGB_model, 'xgb_news_classifier_model.pkl')


In [None]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

In [None]:
%%writefile app.py
import streamlit as st
import joblib
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords
import gensim

# Load the saved XGBoost model and vectorizer
model = joblib.load('xgb_news_classifier_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Preprocessing function using Gensim
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):  # Tokenization
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

# Streamlit app
st.title("Fake News Classifier")

st.write("Enter the news title and text to classify it as real or fake.")

# Input text from the user
title = st.text_input("Title")
text = st.text_area("Text")

# Combine the title and text
if st.button("Classify"):
    if title and text:
        original_text = title + ' ' + text
        clean_text = preprocess(original_text)
        clean_joined = ' '.join(clean_text)

        # Transform the input text using the TF-IDF vectorizer
        text_vector = vectorizer.transform([clean_joined])

        # Predict the label
        prediction = model.predict(text_vector)[0]

        # Display the result
        if prediction == 1:
            st.success("This news is Real.")
        else:
            st.error("This news is Fake.")
    else:
        st.warning("Please enter both title and text.")


In [None]:
!wget -q -O - ipv4.icanhazip.com

In [None]:
! streamlit run app.py & npx localtunnel --port 8501