In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder


# Data

In [47]:
# Read the two input CSVs from the working directory (Fake.csv first, then True.csv).
fake, true = (pd.read_csv(csv_name) for csv_name in ("Fake.csv", "True.csv"))

In [100]:
# Tag each source frame with its class (1 for `true`, 0 for `fake`), then stack
# the two frames row-wise into one frame with a fresh integer index.
true['label'], fake['label'] = 1, 0
combined = pd.concat((true, fake), axis=0, ignore_index=True)
# Row count per `subject` value.
combined['subject'].value_counts()

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64

# Data Cleaning

### Normalizing Text

In [None]:
def remove_news_agency_name(text, ignore_case=False):
    """Delete news-agency names from ``text``.

    Names are matched as whole words only, so "AP" is removed from
    "(AP) -" but a word that merely contains those letters ("APPLE",
    "SNAP") is left intact.

    Parameters
    ----------
    text : str
        Text to scrub.
    ignore_case : bool, default False
        When True, names are matched regardless of letter case. Useful
        when the text has already been lower-cased.

    Returns
    -------
    str
        ``text`` with every matched name replaced by the empty string.
        Surrounding whitespace and punctuation are not touched.
    """
    agencies = (
        "Reuters", "AP", "New York Times", "Washington Post",
        "Business Insider", "Atlantic", "Fox News", "National Review",
        "Talking Points Memo", "Buzzfeed News", "Guardian", "NPR", "Vox",
        "CNN", "BBC", "Bloomberg", "Daily Mail",
    )
    # \b on both sides keeps short names such as "AP" from matching inside longer words.
    pattern = r"\b(?:" + "|".join(agencies) + r")\b"
    return re.sub(pattern, "", text, flags=re.IGNORECASE if ignore_case else 0)

In [49]:
# Lower-case each text, then replace every run of characters that are neither
# word characters nor whitespace with a single space.
combined['text'] = combined['text'].apply(
    lambda doc: re.sub(r'[^\w\s]+', ' ', doc.lower())
)

### Removing Stopwords

In [50]:
nltk.download('stopwords')
# Set membership keeps the per-word lookup cheap.
stop_words = set(stopwords.words('english'))


def remove_stopwords(txt):
    """Return ``txt`` with every whitespace-separated word found in ``stop_words`` removed.

    The surviving words are re-joined with single spaces.
    """
    return ' '.join(word for word in txt.split() if word not in stop_words)


combined['text'] = combined['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /Users/megsr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenizing

In [51]:
nltk.download('punkt')
# After this step each entry of `text` is the token sequence returned by word_tokenize.
combined['text'] = combined['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /Users/megsr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Feature Extraction

### CountVectorizer

In [52]:
# The tokenised texts are joined back into space-separated strings for the vectorizer.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split done in the modelling cells.
txt = combined['text'].map(' '.join)
count_vec = CountVectorizer(
    analyzer='word',
    stop_words="english",
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
    max_features=None,
)
count_train = count_vec.fit(txt)  # fit() returns the vectorizer itself
bag_of_words = count_train.transform(txt)
features = count_train.get_feature_names_out()

### TfidfVectorizer

In [53]:
# Space-joined token strings, weighted with unsmoothed IDF and no row normalisation.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split done in the modelling cells.
txt = combined['text'].map(' '.join)
tf = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    sublinear_tf=False,
)
txt_fitted = tf.fit(txt)  # fit() returns the vectorizer itself
txt_transformed = tf.transform(txt)

# Modeling

### Logistic Regression w/ CountVectorizer

In [54]:
# 80/20 hold-out split of the count features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, combined['label'], test_size=0.2, random_state=42
)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# RMSE is taken by hand: the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases, while sqrt(MSE) works everywhere.
error_value = np.sqrt(mean_squared_error(y_test, y_pred))
print('error value:', error_value)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

error value: 0.06919840105262935
accuracy: 0.9952115812917595


### Logistic Regression w/ TfidfVectorizer

In [55]:
# 80/20 hold-out split of the TF-IDF features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    txt_transformed, combined['label'], test_size=0.2, random_state=42
)
# This rebinds `model`; any later cell that reads `model` gets this TF-IDF classifier.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# RMSE is taken by hand: the `squared=False` keyword of mean_squared_error is
# deprecated and removed in newer scikit-learn releases, while sqrt(MSE) works everywhere.
error_value = np.sqrt(mean_squared_error(y_test, y_pred))
print('error value:', error_value)
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

error value: 0.0768245042570597
accuracy: 0.994097995545657


### XGB w/ CountVectorizer

In [56]:
# 80/20 hold-out split of the count features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words,
    combined['label'],
    test_size=0.2,
    random_state=42,
)
# Gradient-boosted trees with library defaults.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, xgb_pred)
print('accuracy:', accuracy)

accuracy: 0.9967706013363029


### XGB w/ TfidfVectorizer

In [57]:
# 80/20 hold-out split of the TF-IDF features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    txt_transformed,
    combined['label'],
    test_size=0.2,
    random_state=42,
)
# Gradient-boosted trees with library defaults.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, xgb_pred)
print('accuracy:', accuracy)

accuracy: 0.9974387527839643


In [58]:
# Random forest on the TF-IDF features, same 80/20 split and seed as the other models.
X_train, X_test, y_train, y_test = train_test_split(
    txt_transformed, combined['label'], test_size=0.2, random_state=42
)
# The forest is seeded so the reported accuracy, and the predictions later
# made with `ran`, are reproducible from run to run.
ran = RandomForestClassifier(random_state=42)
ran.fit(X_train, y_train)
ran_pred = ran.predict(X_test)
accuracy = accuracy_score(y_test, ran_pred)
print('accuracy:', accuracy)

accuracy: 0.9933184855233853


# Topic Summarization

In [71]:
# Row subsets by class: label 0 -> fake, label 1 -> true.
combined_fake = combined[combined['label'].eq(0)]
combined_true = combined[combined['label'].eq(1)]

In [60]:
# Token frequencies over the fake subset. Counter.update() tallies every
# element of each token sequence, replacing the manual dict bookkeeping.
fake_d = Counter()
for line in combined_fake.text:
    fake_d.update(line)
# Five most frequent tokens with their counts.
fake_d.most_common(5)

[('trump', 79300),
 ('said', 33763),
 ('president', 27715),
 ('people', 26570),
 ('one', 24531)]

In [61]:
# Token frequencies over the true subset. Counter.update() tallies every
# element of each token sequence, replacing the manual dict bookkeeping.
true_d = Counter()
for line in combined_true.text:
    true_d.update(line)
# Five most frequent tokens with their counts.
true_d.most_common(5)

[('said', 99062),
 ('trump', 54700),
 ('u', 44570),
 ('would', 31605),
 ('reuters', 28976)]

# Testing

In [62]:
# External evaluation table; the following cells read its `Verdict` and `Claim` columns.
testing = pd.read_csv(filepath_or_buffer="testing_dataset_v1.csv")

In [63]:
def labelling(col):
    """Map a verdict value to a binary label.

    Returns 0 when ``col`` is exactly one of the listed verdicts
    (case-sensitive match) and 1 for any other value.
    """
    negative_verdicts = (
        'Inaccurate', 'Misleading', 'Incorrect', 'Unsupported',
        'Flawed_Reasoning', 'Imprecise', 'Lacks_Context',
    )
    return int(col not in negative_verdicts)

# Derive the binary `Label` column from each row's `Verdict`.
testing["Label"] = testing['Verdict'].apply(labelling)

In [64]:
# Apply the same normalisation steps used on the training text: lower-case,
# replace punctuation runs with a space, drop stopwords, tokenise and re-join.
# `remove_stopwords` and `stop_words` come from the "Removing Stopwords" cell
# and the nltk resources are already downloaded by the earlier cells, so they
# are reused here instead of being redefined and re-downloaded.
testing['Claim'] = testing['Claim'].apply(
    lambda claim: re.sub(r'[^\w\s]+', ' ', claim.lower())
)
testing['Claim'] = testing['Claim'].apply(remove_stopwords)
# Joined back into a string because the fitted vectorizers take raw strings.
testing['Claim'] = testing['Claim'].apply(lambda claim: ' '.join(word_tokenize(claim)))

[nltk_data] Downloading package stopwords to /Users/megsr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/megsr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [65]:
# Project the cleaned claims onto the vocabulary of the already-fitted TF-IDF vectorizer.
# NOTE(review): this rebinds `txt_transformed`, which previously held the training
# matrix; re-running an earlier modelling cell after this one would use the wrong data.
txt_transformed = tf.transform(testing['Claim'])
# Count-vectorized view of the same claims; the result is only displayed, not stored.
count_vec.transform(testing['Claim'])

<176x121659 sparse matrix of type '<class 'numpy.int64'>'
	with 1946 stored elements in Compressed Sparse Row format>

In [66]:
# Predict labels for the TF-IDF matrix currently bound to `txt_transformed`.
# NOTE(review): `model` is whichever estimator was last assigned to that name;
# when the notebook is run top to bottom that is the TF-IDF logistic regression.
log = model.predict(txt_transformed)
# Display the predicted labels.
log

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [67]:
# Class balance of the derived labels, for comparison with the predictions.
testing['Label'].value_counts()

Label
0    161
1     15
Name: count, dtype: int64

In [68]:
# The first CSV column becomes the row index.
api = pd.read_csv("all_sources_resDf.csv", index_col=0)
api

Unnamed: 0,source_name,source_id,publish_date,title,text
0,CBC News,cbc-news,2024-01-25T17:08:14Z,Are heat pumps a climate solution in Canada's ...,Our planet is changing. So is our journalism. ...
1,CBC News,cbc-news,2024-02-02T17:13:17Z,"Greta Thunberg, 4 others acquitted on London c...",A judge on Friday acquitted climate activist G...
2,CBC News,cbc-news,2024-02-01T18:33:06Z,How effective a climate solution is removing C...,Our planet is changing. So is our journalism. ...
3,CBC News,cbc-news,2024-01-16T22:39:54Z,YouTube earns millions a year channels that pr...,YouTube is making millions of dollars a year f...
4,CBC News,cbc-news,2024-01-17T09:00:00Z,Climate change threatens northern Ontario's wi...,Tyler Tyance is all too familiar with the long...
...,...,...,...,...,...
3,Global News,,2024-02-02T16:14:43Z,Are microplastics harmful? Health Canada funds...,Health Canada is funding new research that wil...
0,Calgary Herald,,2024-02-08T00:55:51Z,Varcoe: Report warns of $600B hit to Canada ec...,'They are big numbers. And we were trying to m...
1,Calgary Herald,,2024-01-26T13:50:50Z,Varcoe: Danielle Smith wants to 'double down' ...,There are several major obstacles to significa...
0,Ctvnews.ca,,2024-01-27T03:57:36Z,Amazon: MacKenzie Scott sold US$10B in shares ...,MacKenzie Scott continues to sell billions of ...


In [69]:
# Apply the same normalisation steps used on the training text: lower-case,
# replace punctuation runs with a space, drop stopwords, tokenise and re-join.
# `remove_stopwords` and `stop_words` come from the "Removing Stopwords" cell
# and the nltk resources are already downloaded by the earlier cells, so they
# are reused here instead of being redefined and re-downloaded.
# str() guards against non-string cells; note that a missing value therefore
# becomes the literal text 'nan'.
api['text'] = api['text'].apply(
    lambda doc: re.sub(r'[^\w\s]+', ' ', str(doc).lower())
)
api['text'] = api['text'].apply(remove_stopwords)
# Joined back into a string because the fitted vectorizers take raw strings.
api['text'] = api['text'].apply(lambda doc: ' '.join(word_tokenize(doc)))
api['text']

[nltk_data] Downloading package stopwords to /Users/megsr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/megsr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    planet changing journalism weekly newsletter p...
1    judge friday acquitted climate activist greta ...
2    planet changing journalism weekly newsletter p...
3    youtube making millions dollars year advertisi...
4    tyler tyance familiar long days cold nights da...
                           ...                        
3    health canada funding new research look potent...
0    big numbers trying make case ottawa alberta we...
1    several major obstacles significantly hike out...
0    mackenzie scott continues sell billions dollar...
0    jordan peterson failed attempt ontario court s...
Name: text, Length: 229, dtype: object

In [81]:
# Vectorize the cleaned api texts with the already-fitted TF-IDF vectorizer.
# NOTE(review): rebinds the shared name `txt_transformed` again.
txt_transformed = tf.transform(api['text'])
# Labels predicted by the random forest held in `ran`.
arr = ran.predict(txt_transformed)
# Plain list copy of the predictions; the next cell stores it as a DataFrame column.
blah = list(arr)

In [89]:
# Attach the random-forest predictions positionally (a list carries no index to align on).
api['random_forest_results'] = blah
# Show only the rows predicted as label 1.
api[api['random_forest_results'].eq(1)]

Unnamed: 0,source_name,source_id,publish_date,title,text,random_forest_results
14,CBC News,cbc-news,2024-02-08T12:55:13Z,World temperatures go a full year above 1.5 C ...,world experienced hottest january record conti...,1
20,CBC News,cbc-news,2024-02-09T14:49:00Z,Candidates backed by imprisoned former PM Imra...,independent candidates backed imprisoned forme...,1
22,CBC News,cbc-news,2024-01-29T20:41:30Z,Top Liberal ministers duck questions about rep...,polls suggesting liberal party support still f...,1
82,CBC News,cbc-news,2024-01-25T04:35:00Z,Vancouver council approves policy statement fo...,indigenous led proposal build mixed use develo...,1
14,CBC News,cbc-news,2024-02-08T12:55:13Z,World temperatures go a full year above 1.5 C ...,world experienced hottest january record conti...,1
20,CBC News,cbc-news,2024-02-09T14:49:00Z,Candidates backed by imprisoned former PM Imra...,independent candidates backed imprisoned forme...,1
22,CBC News,cbc-news,2024-01-29T20:41:30Z,Top Liberal ministers duck questions about rep...,polls suggesting liberal party support still f...,1
82,CBC News,cbc-news,2024-01-25T04:35:00Z,Vancouver council approves policy statement fo...,indigenous led proposal build mixed use develo...,1


In [92]:
# Number of rows per source.
api['source_name'].value_counts()

source_name
CBC News              200
Financial Post         19
Global News             4
Calgary Herald          2
The Globe And Mail      1
Ctvnews.ca              1
National Post           1
Name: count, dtype: int64

In [94]:
# Re-vectorize the cleaned claims with the already-fitted TF-IDF vectorizer.
# NOTE(review): rebinds the shared name `txt_transformed` once more.
txt_transformed = tf.transform(testing['Claim'])
# Labels predicted for the claims by the random forest held in `ran`.
arr = ran.predict(txt_transformed)
# Display the predicted labels.
arr

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])