In [36]:
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import re

In [37]:
from nltk.stem import WordNetLemmatizer

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [40]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dominikakokoryk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Modelowanie (funkcje)

In [41]:
def preprocess_text(message):
    message = str(message)
    tokens = message.split()
    processed_tokens = []
    for word in tokens:
        # zmiana na małe litery
        word_lower = word.lower()
        # pomijamy stopwords
        if word_lower in stop_words:
            continue
        # usuwamy znaki interpunkcyjne i nawiasy
        word_clean = re.sub(r'[^a-zA-Z0-9]', '', word_lower)
        if word_clean:  # jeśli coś zostało po oczyszczeniu
            processed_tokens.append(word_clean)
    return ' '.join(processed_tokens)

In [42]:
def stopwords_removal(tokens):
    filtered_text = [word for word in tokens if word not in stop_words]
    return filtered_text

In [43]:
def lemmatizing(tokens, lemmatizer):
    lemmas = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    # tagged = pos_tag(tokens)
    # lemmas = [lemmatizer.lemmatize(token, pos='v' if tag.startswith('V') else 'n') for token, tag in tagged]
    return lemmas

In [44]:
def process_logistic_regression(df):
    # column text - origin text
    # column fake - 0 for true, 1 for fake news
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['tokenized_text'] = df['processed_text'].apply(lambda document: nltk.word_tokenize(document))
    df['filtered_token'] = df['tokenized_text'].apply(stopwords_removal)
    lemmatizer = WordNetLemmatizer()
    df['lemmatized_tokens'] = df['filtered_token'].apply(lambda x: lemmatizing(x, lemmatizer))
    corpus = []
    for tokens in df['lemmatized_tokens']:
        document = ' '.join([word for word in tokens])
        corpus.append(document)
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(corpus)

    y = df['fake']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    return (model, tfidf)

In [45]:
import pandas as pd

## Import data

In [None]:
# import kagglehub
# import shutil

# # Download latest version
# path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")

# print("Path to dataset files:", path)
# shutil.copytree(path, 'data/')

Path to dataset files: /Users/dominikakokoryk/.cache/kagglehub/datasets/saurabhshahane/fake-news-classification/versions/77


'data/'

## Clean data

In [7]:
df_welfake = pd.read_csv("data/WELFake_Dataset.csv")
df_welfake.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
df_welfake.rename(columns={'label': 'fake'}, inplace=True)
df_welfake['fake'] = (df_welfake['fake'] + 1) % 2
df_welfake.head()

Unnamed: 0.1,Unnamed: 0,title,text,fake
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,0
1,1,,Did they post their votes for Hillary already?,0
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",0
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,1
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",0


In [9]:
df_welfake.dtypes

Unnamed: 0     int64
title         object
text          object
fake           int64
dtype: object

In [10]:
df_welfake.isna().sum()

Unnamed: 0      0
title         558
text           39
fake            0
dtype: int64

In [13]:
df_welfake['combined_text'] = (
    df_welfake['title'].fillna('').astype(str).str.strip()
    + ' '
    + df_welfake['text'].fillna('').astype(str).str.strip()
).str.replace(r'\s+', ' ', regex=True).str.strip()

# usuń wiersze, gdzie po sklejeniu nadal jest pusto
df_welfake = df_welfake[df_welfake['combined_text'] != ""]


In [14]:
df_welfake.isna().sum()

Unnamed: 0         0
title            558
text              39
fake               0
combined_text      0
dtype: int64

In [26]:
mask = df_welfake['title'].str.len() < 30
df_welfake[mask]

Unnamed: 0.1,Unnamed: 0,title,text,fake,combined_text
50,50,"American dream, revisited",by Pepe Escobar for the Strategic Culture Foun...,0,"American dream, revisited by Pepe Escobar for ..."
68,68,Michael Moore Owes Me $4.99,"28, 2016 | Reviews Michael Moore in New Yo...",0,"Michael Moore Owes Me $4.99 28, 2016 | Reviews..."
244,244,WORLD WAR 3 IS COMING,source Add To The Conversation Using Facebook ...,0,WORLD WAR 3 IS COMING source Add To The Conver...
266,266,Mrs. Weiner,Feds get a warrant to start search for classif...,0,Mrs. Weiner Feds get a warrant to start search...
505,505,Our Landfill Economy,Leave a reply Charles Hugh Smith – Corresponde...,0,Our Landfill Economy Leave a reply Charles Hug...
...,...,...,...,...,...
71530,71530,Voters Repudiate Clinton,Scott \nIt’s really amazing to see how little ...,0,Voters Repudiate Clinton Scott It’s really ama...
71587,71587,OnPolitics | 's politics blog,Who has Trump appointed to his cabinet so far?...,1,OnPolitics | 's politics blog Who has Trump ap...
71763,71763,Televisión: lo más visto ayer,TVE \nLa 1 de Televisión Española redondeó aye...,0,Televisión: lo más visto ayer TVE La 1 de Tele...
71853,71853,Кто любит Стивена Сигала?,"Политика \nПомните первые видеосалоны 1990-х, ...",0,Кто любит Стивена Сигала? Политика Помните пер...


In [27]:
df_welfake['text'] = df_welfake['combined_text']

In [29]:
df_welfake.isna().sum()

Unnamed: 0         0
title            558
text               0
fake               0
combined_text      0
dtype: int64

In [32]:
df_welfake[['text', 'fake']].to_csv('../data_for_modeling/WELFake_clean.csv', index=False)

In [None]:
mask = (df_welfake['text'].str.strip() == "")
df_welfake[mask]

In [46]:
(model, tfidf) = process_logistic_regression(df_welfake)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      7421
           1       0.96      0.95      0.95      7006

    accuracy                           0.95     14427
   macro avg       0.95      0.95      0.95     14427
weighted avg       0.95      0.95      0.95     14427



In [56]:
# lokalnie mam czyste dane z eda dla isot, bez tego nie pojdzie
df_isot = pd.read_csv('clean_isot.csv')

In [49]:
df_isot.sample(10)

Unnamed: 0,title,text,fake
12615,Scotland's Sturgeon says staying in EU customs...,EDINBURGH (Reuters) - Scotland s First Ministe...,0
33504,HILLARY BREAKS OUT Into Bizarre Fake Laughter ...,Remember when Hillary read the word sigh ins...,1
43368,PHILIPPINES: 2016 Washington’s Fury as Philipp...,Duterte takes clear lead in Philippine electi...,1
216,U.S. Treasury tax study slammed as 'fake math'...,WASHINGTON (Reuters) - The U.S. Treasury Depar...,0
15877,Serbia's president eyes early parliamentary vote,BELGRADE (Reuters) - Serbian President Aleksan...,0
38872,WATCH: West Virginia Governor Announces He’s L...,West Virginia Governor Jim Justice gave Trump ...,1
30909,THE CHIEF IS HERE! JOHN KELLY Surprises White ...,This is great! Gen. John Kelly surprised the W...,1
14205,Cambodia's opposition gives up posts after ban,PHNOM PENH (Reuters) - Elected officials of Ca...,0
19776,Lack of clear UK stance making Brexit talks to...,BERLIN (Reuters) - French Prime Minister Edoua...,0
30202,Oregon Right-Wing Terrorist Makes CHILLING Co...,If anyone questions whether or not the Bundy-l...,1


## Predict

In [50]:
def predict_dataset(df, model, tfidf):
    df['processed_text'] = df['text'].apply(preprocess_text)
    df['tokenized_text'] = df['processed_text'].apply(lambda document: nltk.word_tokenize(document))
    df['filtered_token'] = df['tokenized_text'].apply(stopwords_removal)
    lemmatizer = WordNetLemmatizer()
    df['lemmatized_tokens'] = df['filtered_token'].apply(lambda x: lemmatizing(x, lemmatizer))
    corpus = []
    for tokens in df['lemmatized_tokens']:
        document = ' '.join([word for word in tokens])
        corpus.append(document)
    # tfidf = TfidfVectorizer()
    X = tfidf.transform(corpus)

    y = df['fake']
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # model = LogisticRegression()
    # model.fit(X_train, y_train)
    y_pred = model.predict(X)
    print(classification_report(y, y_pred))

In [51]:
predict_dataset(df_isot, model, tfidf)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     21210
           1       0.03      0.03      0.03     22753

    accuracy                           0.02     43963
   macro avg       0.02      0.02      0.02     43963
weighted avg       0.02      0.02      0.02     43963



In [54]:
df_liar = pd.read_csv('../data_for_modeling/LIAR_for_modeling.csv')
df_liar.head()

Unnamed: 0,text,fake
0,Says the Annies List political group supports ...,1
1,"Hillary Clinton agrees with John McCain ""by vo...",0
2,Health care reform legislation is likely to ma...,1
3,The Chicago Bears have had more starting quart...,0
4,Jim Dunnam has not lived in the district he re...,1


In [55]:
predict_dataset(df_liar, model, tfidf)

              precision    recall  f1-score   support

           0       0.44      0.84      0.58      4507
           1       0.56      0.16      0.25      5657

    accuracy                           0.46     10164
   macro avg       0.50      0.50      0.42     10164
weighted avg       0.51      0.46      0.40     10164

