In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 8.9 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy

# Data

In [220]:
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Data Cleaning

### Removing Pattern in True

In [221]:
true['text'] = true.text.apply(lambda x:x.split("-",1)[1:])
true['text'] = true.text.apply(lambda x:str(x)[1:-1])

### Removing News Agency Names

In [28]:
def remove_news_agency_name(text):
    return re.sub(r"Reuters|AP|New York Times|Washington Post|Business Insider|Atlantic|Fox News|National Review|Talking Points Memo|Buzzfeed News|Guardian|NPR|Vox|CNN|BBC|Bloomberg|Daily Mail|21st Century Wire|21st Century|Al Jazeera|21WIRE|ABC News", "", text)

In [223]:
true['text'] = true.text.apply(lambda x: remove_news_agency_name(x))
fake['text'] = fake.text.apply(lambda x: remove_news_agency_name(x))

### Combining Data

In [224]:
true['label']=1
fake['label']=0
combined = pd.concat([true, fake], ignore_index=True, axis=0)

### Normalizing Text

In [225]:
combined['text'] = combined.text.apply(lambda x:x.lower())
combined['text'] = combined.text.apply(lambda x:re.sub(r'[^\w\s]+', ' ', x))

### Removing Stopwords

In [226]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(txt):
    no_stopwords = [word for word in txt.split() if word not in stop_words]
    return ' '.join(no_stopwords)
combined['text'] = combined['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /Users/megsr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tokenizing

In [227]:
nltk.download('punkt')
combined['text'] = combined.text.apply(lambda x:word_tokenize(x))

[nltk_data] Downloading package punkt to /Users/megsr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Feature Extraction

### TfidfVectorize

In [17]:
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word',max_features=200)
txt = combined.text.apply(lambda x: ' '.join(x))
txt_fitted = tf.fit(txt)
txt_transformed = txt_fitted.transform(txt)

NameError: name 'TfidfVectorizer' is not defined

# Modeling

### Random Forest w/ TfidfVectorize

X_train, X_test, y_train, y_test = train_test_split(txt_transformed,combined['label'], test_size=0.2, random_state=42)
ran = RandomForestClassifier()
ran.fit(X_train, y_train)
ran_pred = ran.predict(X_test)
accuracy = accuracy_score(y_test, ran_pred)
print('accuracy:', accuracy)

In [230]:
X_train, X_test, y_train, y_test = train_test_split(txt_transformed,combined['label'], test_size=0.2, random_state=42)
vc = VotingClassifier([('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()), ('rf', RandomForestClassifier()), ('svc', SVC()), ('ada', AdaBoostClassifier()), ('gb', GradientBoostingClassifier())])
vc.fit(X_train, y_train)
vc_pred = vc.predict(X_test)
accuracy = accuracy_score(y_test, vc_pred)
print('accuracy:', accuracy)

accuracy: 0.9638084632516704


# Testing

## Manually Collected Dataset

In [26]:
manual_dataset = pd.read_csv("manual_dataset.csv", index_col=0)
manual_dataset

Unnamed: 0_level_0,Real/ Fake,News Source,Link,Date,Topic/ Keyword (if applicable),Title,Text
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Real,The Globe and Mail,https://www.theglobeandmail.com/business/comme...,2/19/2024,Renewable energy,"With Alberta renewables ban, business common s...","Investment creates jobs, expands tax revenue, ..."
1,Real,CBC News,https://www.cbc.ca/news/canada/calgary/oil-ind...,2/22/24,Oil,Some sectors of oil industry 'dragging their h...,Prime Minister Justin Trudeau and Alberta Prem...
2,Real,CBC News,https://www.cbc.ca/news/canada/thunder-bay/cat...,03/04/24,Mining,"Demanding a proper say, Cat Lake First Nation'...",Cat Lake First Nation has been granted an inju...
3,Real,CBC News,https://www.cbc.ca/news/politics/guilbeault-no...,02/14/24,Climate,Feds will stop investing in 'large' road proje...,Environment Minister Steven Guilbeault said Mo...
4,Real,Global News,https://globalnews.ca/news/10292216/climate-ch...,02/13/2024,Climate,Floods and fires: Climate change brings health...,Communities across British Columbia needs to p...
5,Real,CBC News,https://www.cbc.ca/news/canada/toronto/ontario...,02/08/24,Climate,New mineral processing plants touted as 'missi...,A Canadian mining company will announce plans ...
6,Real,Financial Post,https://financialpost.com/pmn/business-pmn/inf...,03/04/24,Climate,Infernos Rage From Texas to Australia as Fire ...,(Bloomberg) — After a year of deadly wildfires...
7,Real,CBC News,https://www.cbc.ca/news/canada/saskatchewan/sa...,03/05/24,Climate,Sask. farmer 'breathing a sigh of relief' afte...,A Saskatchewan farmer says she is thrilled wit...
8,Real,CBC News,https://www.cbc.ca/news/canada/british-columbi...,03/03/24,Wildfire,Inaccurate government data may be stoking wild...,Wildfire fighting and forest management decisi...
9,Real,Calgary Herald,https://calgaryherald.com/opinion/columnists/v...,2/14/2024,Oil,Varcoe: Smith says Alberta won't sterilize 'pr...,Alberta remained a powerful magnet for investm...


### Preprocessing

In [29]:
manual_dataset['Text'] = manual_dataset.Text.apply(lambda x: remove_news_agency_name(str(x)))

In [30]:
manual_dataset['Text'] = manual_dataset.Text.apply(lambda x:str(x))
manual_dataset['Text'] = manual_dataset.Text.apply(lambda x:x.lower())
manual_dataset['Text'] = manual_dataset.Text.apply(lambda x:re.sub(r'[^\w\s]+', ' ', x))
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(txt):
    no_stopwords = [word for word in txt.split() if word not in stop_words]
    return ' '.join(no_stopwords)
manual_dataset['Text'] = manual_dataset['Text'].apply(remove_stopwords)
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelpawlik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachelpawlik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Vectorizing

In [16]:
txt_transformed = tf.transform(manual_dataset["Text"])
arr = vc.predict(txt_transformed)
arr_list = list(arr)

NameError: name 'tf' is not defined

In [235]:
manual_dataset['random_forest_results']=arr_list
manual_dataset

Unnamed: 0_level_0,Real/ Fake,News Source,Link,Date,Topic/ Keyword (if applicable),Title,Text,random_forest_results
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Real,The Globe and Mail,https://www.theglobeandmail.com/business/comme...,2/19/2024,Renewable energy,"With Alberta renewables ban, business common s...",investment creates jobs expands tax revenue pr...,1
1,Real,CBC News,https://www.cbc.ca/news/canada/calgary/oil-ind...,2/22/24,Oil,Some sectors of oil industry 'dragging their h...,prime minister justin trudeau alberta premier ...,1
2,Real,CBC News,https://www.cbc.ca/news/canada/thunder-bay/cat...,03/04/24,Mining,"Demanding a proper say, Cat Lake First Nation'...",cat lake first nation granted injunction pausi...,1
3,Real,CBC News,https://www.cbc.ca/news/politics/guilbeault-no...,02/14/24,Climate,Feds will stop investing in 'large' road proje...,environment minister steven guilbeault said mo...,1
4,Real,Global News,https://globalnews.ca/news/10292216/climate-ch...,02/13/2024,Climate,Floods and fires: Climate change brings health...,communities across british columbia needs prep...,1
5,Real,CBC News,https://www.cbc.ca/news/canada/toronto/ontario...,02/08/24,Climate,New mineral processing plants touted as 'missi...,canadian mining company announce plans thursda...,1
6,Real,Financial Post,https://financialpost.com/pmn/business-pmn/inf...,03/04/24,Climate,Infernos Rage From Texas to Australia as Fire ...,year deadly wildfires around globe world burni...,1
7,Real,CBC News,https://www.cbc.ca/news/canada/saskatchewan/sa...,03/05/24,Climate,Sask. farmer 'breathing a sigh of relief' afte...,saskatchewan farmer says thrilled snowfall wee...,1
8,Real,CBC News,https://www.cbc.ca/news/canada/british-columbi...,03/03/24,Wildfire,Inaccurate government data may be stoking wild...,wildfire fighting forest management decisions ...,0
9,Real,Calgary Herald,https://calgaryherald.com/opinion/columnists/v...,2/14/2024,Oil,Varcoe: Smith says Alberta won't sterilize 'pr...,alberta remained powerful magnet investment ca...,1


In [236]:
combined_fake = combined[combined['label']==0]
combined_true = combined[combined['label']==1]

### Most Common Word Comparison in Training Data - Real vs Fake ** (dont use bc might get confused w topic summarization on manual dataset)

In [237]:
fake_d = {}
for line in combined_fake.text:
    for word in line:
        if word in fake_d:
                fake_d[word] = fake_d[word] + 1
        else:
            fake_d[word] = 1
Counter(fake_d).most_common(5)

[('trump', 79307),
 ('said', 33763),
 ('president', 27721),
 ('people', 26570),
 ('one', 24531)]

In [238]:
true_d = {}
for line in combined_true.text:
    for word in line:
        if word in true_d:
                true_d[word] = true_d[word] + 1
        else:
            true_d[word] = 1
Counter(true_d).most_common(5)

[('said', 99020),
 ('trump', 54546),
 ('u', 44437),
 ('would', 31595),
 ('president', 28067)]

### Word Length Comparison in Training Dataset - Real vs Fake ** (dont use for same reason as above)**

In [239]:
fake_word_length = combined_fake.text.apply(lambda x:len(x))
fake_word_length.median()

206.0

In [240]:
real_word_length = combined_true.text.apply(lambda x:len(x))
real_word_length.median()

217.0