<a href="https://colab.research.google.com/github/heddaenger/FakeNews/blob/main/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from typing import Callable # Extra: read about type hints: https://docs.python.org/3/library/typing.html
import warnings


# needs to run before importing pandas. Silences a harmless pandas FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
import string
from scipy.sparse import csr_matrix
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

# disable in-cell scientific notation
pd.options.display.float_format = '{:.0f}'.format

# set plotly as the plotting backend for pandas
pd.options.plotting.backend = "plotly"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Load data
class CSVLoader:
    """ Load a CSV file into a pandas DataFrame."""

    @staticmethod
    def fetch(path : str, **kwargs) -> pd.DataFrame:
        return pd.read_csv(path, **kwargs)

    def load(self, path : str, prep_func : Callable | None, **kwargs) -> pd.DataFrame:
        data = self.fetch(path, **kwargs)
        if isinstance(prep_func, Callable):
            return prep_func(data)
        return data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def preprocess_emotions(df: pd.DataFrame) -> pd.DataFrame:
    """ A data preparation function for the fake news dataset."""
    return (
        df
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .astype({"text": 'string', "label": 'category'})
    )

dataf = CSVLoader().load(
    path='/content/drive/MyDrive/train.csv',
    prep_func=preprocess_emotions
)

In [5]:
dataf.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print An Iranian woman has been sentenced to ...,1


In [6]:
dataf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      20800 non-null  int64   
 1   title   20242 non-null  object  
 2   author  18843 non-null  object  
 3   text    20761 non-null  string  
 4   label   20800 non-null  category
dtypes: category(1), int64(1), object(2), string(1)
memory usage: 670.6+ KB


In [7]:
dataf.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
dataf.duplicated().sum()

0

In [9]:
df = dataf.dropna()

In [10]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [11]:
# And example of a violin plot
df.assign(text_len=lambda d: d.text.str.len())[["text_len", "label"]].plot(
    kind='violin',
    y='text_len',
    color='label',
    title='Text Length Distribution',
    template='simple_white',
    labels=dict(text_len="Text Length"),
    box=True
)

In [12]:
#Looking at the overall distribution of emotions
label_distribution = df['label'].value_counts().reset_index()
label_distribution.columns = ['label', 'count']

# Plot the distribution
fig = px.bar(label_distribution, x='label', y='count', title='Overall Distribution of Fake news')
fig.show()

In [13]:
#Function to extract top n keywords for each emotion
def top_keywords_ngrams(df, n=None):
    results = {}
    vec = CountVectorizer(ngram_range=(1, 1)).fit(df['text'])
    bag_of_words = vec.transform(df['text'])
    sum_words = bag_of_words.sum(axis=0)

    # For each emotion
    for label in df['label'].unique():
        # Filter by label
        label_data = df[df['label'] == label]
        label_bow = vec.transform(label_data['text'])
        label_sum_words = label_bow.sum(axis=0)
        words_freq = [(word, label_sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        top_words = words_freq[:n]
        results[label] = top_words

    return results

top_n = 10  # Number of top words/phrases to extract
top_keywords_by_label = top_keywords_ngrams(df, n=top_n)

# Converting the results to a DataFrame for easier plotting
top_keywords_df = pd.DataFrame([(label, word, freq) for label, keywords in top_keywords_by_label.items() for word, freq in keywords], columns=['Label', 'Word', 'Frequency'])

# Plotting
fig = px.bar(top_keywords_df, x='Word', y='Frequency', color='Label', title='Top Keywords by Label', facet_row='Label', height=1200)
fig.show()

In [14]:
spacy.prefer_gpu() # only if you have yur GPU enabled.
nlp = spacy.load("en_core_web_sm", exclude=["tok2vec", "tagger", "parser", "senter", "attribute_ruler", "lemmatizer", "ner"])
doc = nlp("text goes here")

In [15]:
df.loc[:, "tokens"] = df["text"].apply(lambda d: [t.text.lower() for t in nlp(d)])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
df.head()

Unnamed: 0,id,title,author,text,label,tokens
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,"[house, dem, aide, :, we, did, n’t, even, see,..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"[ever, get, the, feeling, your, life, circles,..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,"[why, the, truth, might, get, you, fired, octo..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,"[videos, 15, civilians, killed, in, single, us..."
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print An Iranian woman has been sentenced to ...,1,"[print, \n, an, iranian, woman, has, been, sen..."


In [17]:
# Get English stopwords from nltk
nltk_stopwords = nltk_stopwords.words('english')

# Get a list of punctuation characters, but remove '#' from the list
punctuation_characters = list(string.punctuation)
punctuation_characters.remove('#')

# Define your custom stopwords, including double quotes
your_custom_stopwords = ["'s", "...", '"', "-", "``", "''", "mr.", "\u2014", ]  # Add more custom stopwords as needed

# Combine all stopwords into one list
my_stopwords = set(punctuation_characters + your_custom_stopwords + nltk_stopwords)

# Sample text to remove stopwords from
sample_text = "This is an example of a sentence with 's and other stopwords..."

# Tokenize the sample text. For simplicity, we split on whitespace.
# In practice, you'd use a proper tokenizer.
tokens = sample_text.split()

# Remove stopwords from the token list
filtered_tokens = [token for token in tokens if token.lower() not in my_stopwords]

# Join the tokens back into a string
filtered_text = ' '.join(filtered_tokens)

print(filtered_text)

example sentence stopwords...


In [18]:
#nltk_stopwords = nltk_stopwords.words("english")


#punctuation_characters = list(string.punctuation)
#punctuation_characters.remove("#")

#YOUR_CUSTOM_STOPWORDS = ["'s", ",", ".", "-", "--", "´s",]
#YOUR_CUSTOM_STOPWORDS = punctuation_characters + YOUR_CUSTOM_STOPWORDS
#my_stopwords = YOUR_CUSTOM_STOPWORDS + nltk_stopwords

In [19]:
tfidf = TfidfVectorizer(
    tokenizer=lambda tokens: [token for token in tokens if token.lower() not in my_stopwords],
    lowercase=False,  # No lowercase conversion by TfidfVectorizer since we handle it in the tokenizer
    max_df=.9,  # Adjust based on your dataset
    min_df=10  # Adjust based on your dataset
)

tfidf_matrix = tfidf.fit_transform(df["tokens"])


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [20]:
tfidf_matrix

<18285x31179 sparse matrix of type '<class 'numpy.float64'>'
	with 5201265 stored elements in Compressed Sparse Row format>

In [21]:
from collections import Counter

# Convert the tfidf_matrix to a dense format
dense_tfidf = tfidf_matrix.todense()

# Get feature names to use as DataFrame column headers
feature_names = tfidf.get_feature_names_out()

# Create a DataFrame with the dense TF-IDF data
df_tfidf = pd.DataFrame(dense_tfidf, columns=feature_names)

# Add the label column to this DataFrame
df_tfidf['label'] = df['label'].values

# Now, sum the TF-IDF scores for each term by label
top_tfidf_by_label = df_tfidf.groupby('label').sum().transpose()

# Find the top n terms for each label based on their summed TF-IDF scores
top_n = 10
top_terms = {}
for label in top_tfidf_by_label.columns:
    top_terms[label] = top_tfidf_by_label[label].sort_values(ascending=False).head(top_n)

# Prepare the data for plotting
plot_data = []
for label, terms in top_terms.items():
    for term in terms.index:
        plot_data.append((label, term, terms[term]))

# Convert to a DataFrame
plot_df = pd.DataFrame(plot_data, columns=['Label', 'Word', 'TF-IDF'])

# Plot using Plotly Express
fig = px.bar(plot_df, x='Word', y='TF-IDF', color='Label', title='Top TF-IDF Terms by Label', facet_row='Label', height=1200)
fig.show()