<a href="https://colab.research.google.com/github/jacobbarkow/w207-final-project-barkow-laface-meehan-skokowski/blob/main/exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.utils import shuffle

import nltk

np.random.seed(0)

In [2]:
fake_raw = pd.read_csv('Fake.csv')
fake_raw['label'] = 'fake'

true_raw = pd.read_csv('True.csv')
true_raw['label'] = 'true'

FileNotFoundError: ignored

In [None]:
# sizes of fake and true datasets are similar
# not sure if we should only include some

print(len(fake_raw), len(true_raw))

23481 21417


In [None]:
full_set = fake_raw.append(true_raw)
print(len(full_set))

44898


In [None]:
full_set = shuffle(full_set)

In [None]:
train_data = full_set[:35000]
test_data = full_set[35000:44000]
dev_data = full_set[44000:]

train_text, train_title, train_subject, train_labels = train_data['text'], train_data['title'], train_data['subject'], train_data['label']
test_text, test_title, test_subject, test_labels = test_data['text'], test_data['title'], test_data['subject'], test_data['label']
dev_text, dev_title, dev_subject, dev_labels = dev_data['text'], dev_data['title'], dev_data['subject'], dev_data['label']

## Testing Simple Models - Article Text

In [None]:
cv_train = CountVectorizer()
train_text_cv = cv_train.fit_transform(train_text)
test_text_cv = cv_train.transform(test_text)

print(np.shape(train_text_cv))

(35000, 109908)


In [None]:
# testing MNB - scoring and F1

mnb = MultinomialNB()
mnb.fit(train_text_cv, train_labels)
print(mnb.score(test_text_cv, test_labels))

test_predicted = mnb.predict(test_text_cv)
print(metrics.f1_score(test_labels, test_predicted, average="weighted"))

0.9561111111111111
0.9561222784815994


In [None]:
# testing logistic regression - scoring and F1

logreg = LogisticRegression(solver="liblinear", multi_class="auto")
logreg.fit(train_text_cv, train_labels)
print(logreg.score(test_text_cv, test_labels))

test_predicted = logreg.predict(test_text_cv)
print(metrics.f1_score(test_labels, test_predicted, average="weighted"))

0.9954444444444445
0.9954446065891727


In [None]:
# what are the most impactful words

words = cv_train.get_feature_names()
sort_array = np.argsort(logreg.coef_[0])

for i in range(1, 11):
    print(words[sort_array[-i]], logreg.coef_[0][sort_array[-i]])

print('\n')

for i in range(1, 11):
    print(words[sort_array[i]], logreg.coef_[0][sort_array[i]])

reuters 7.338512318833623
washington 0.8629743127193463
said 0.6452391953924397
thursday 0.5813205711920263
bit 0.5575759061988066
wednesday 0.5420015904083523
republican 0.49823569781835364
london 0.4962741722213087
nov 0.45878413846247895
market 0.4556959269399991


read -1.0822895191660005
com -0.8633270158201595
image -0.8283507778846004
us -0.7843088077024256
featured -0.7679771244470786
just -0.7489551937639962
pic -0.7233833383553663
watch -0.6865570990968554
hillary -0.663001758422078
gop -0.6490196057874936


## Testing Simple Models - Title

In [None]:
cv_train_ti = CountVectorizer()
train_text_cv_ti = cv_train_ti.fit_transform(train_title)
test_text_cv_ti = cv_train_ti.transform(test_title)

print(np.shape(train_text_cv_ti))

(35000, 19396)


In [None]:
logreg_ti = LogisticRegression(solver="liblinear", multi_class="auto")
logreg_ti.fit(train_text_cv_ti, train_labels)
print(logreg_ti.score(test_text_cv_ti, test_labels))

test_predicted_ti = logreg_ti.predict(test_text_cv_ti)
print(metrics.f1_score(test_labels, test_predicted_ti, average="weighted"))

0.9642222222222222
0.964236476700352


In [None]:
# what are the most impactful words

words_ti = cv_train_ti.get_feature_names()
sort_array_ti = np.argsort(logreg_ti.coef_[0])

for i in range(1, 11):
    print(words_ti[sort_array_ti[-i]], logreg_ti.coef_[0][sort_array_ti[-i]])

print('\n')

for i in range(1, 11):
    print(words_ti[sort_array_ti[i]], logreg_ti.coef_[0][sort_array_ti[i]])

factbox 3.7412698794712864
says 2.394252343680438
urges 2.1814005941256713
britain 2.009552975341399
exclusive 1.859107491076504
reuters 1.79963428940857
zimbabwe 1.7975690130034685
talks 1.7922251955768105
seek 1.7544050159810252
myanmar 1.6999705984351985


breaking -4.189631714266171
gop -3.8160122784548824
us -3.6443869525272645
watch -3.360705254269749
boiler -3.325122726909894
hillary -3.1573989920598398
just -3.146966871823763
racist -3.0878399583326885
bernie -2.851391104879254
details -2.4687539359854536


## Testing Simple Models - Subject Text

In [None]:
cv_train_subj = CountVectorizer()
train_text_cv_subj = cv_train_subj.fit_transform(train_subject)
test_text_cv_subj = cv_train_subj.transform(test_subject)

print(np.shape(train_text_cv_subj))

(35000, 9)


In [None]:
# testing MNB - scoring and F1

mnb_subj = MultinomialNB()
mnb_subj.fit(train_text_cv_subj, train_labels)
print(mnb_subj.score(test_text_cv_subj, test_labels))

test_predicted_subj = mnb_subj.predict(test_text_cv_subj)
print(metrics.f1_score(test_labels, test_predicted_subj, average="weighted"))

1.0
1.0


In [None]:
# testing logistic regression - scoring and F1

logreg_subj = LogisticRegression(solver="liblinear", multi_class="auto")
logreg_subj.fit(train_text_cv_subj, train_labels)
print(logreg_subj.score(test_text_cv_subj, test_labels))

test_predicted_subj = logreg_subj.predict(test_text_cv_subj)
print(metrics.f1_score(test_labels, test_predicted_subj, average="weighted"))

1.0
1.0


### Subject field is actually useless because subjects can already uniquely identify if it's real or fake

## How successful is the model with just the word "reuters"?

In [None]:
cv_train_r = CountVectorizer(vocabulary=['reuters'])
train_text_cv_r = cv_train_r.fit_transform(train_text)
test_text_cv_r = cv_train_r.transform(test_text)

print(np.shape(train_text_cv_r))

(35000, 1)


In [None]:
logreg_r = LogisticRegression(solver="liblinear", multi_class="auto")
logreg_r.fit(train_text_cv_r, train_labels)
print(logreg_r.score(test_text_cv_r, test_labels))

test_predicted_r = logreg_r.predict(test_text_cv_r)
print(metrics.f1_score(test_labels, test_predicted_r, average="weighted"))

0.9917777777777778
0.9917799006805953
