In [52]:
# Iporting Dependences
import matplotlib
!pip install textblob  
from textblob import TextBlob
import re
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import os
from time import strftime
import pickle


In [12]:
# Set up random seed, for reproductability of randomness
np.random.seed(18)

In [25]:
# Import dataset
df1 = pd.read_csv('df1_clean.csv')
df1.tail(5)

Unnamed: 0,articletext,label
44924,BRUSSELS (Reuters) - NATO allies on Tuesday we...,0
44925,"LONDON (Reuters) - LexisNexis, a provider of l...",0
44926,MINSK (Reuters) - In the shadow of disused Sov...,0
44927,MOSCOW (Reuters) - Vatican Secretary of State ...,0
44928,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,0


In [26]:
df1 = df1[['articletext', 'label']]
df1.head

<bound method NDFrame.head of                                              articletext  label
0      Donald Trump just couldn t wish all Americans ...      1
1      House Intelligence Committee Chairman Devin Nu...      1
2      On Friday, it was revealed that former Milwauk...      1
3      On Christmas day, Donald Trump announced that ...      1
4      Pope Francis used his annual Christmas Day mes...      1
...                                                  ...    ...
44924  BRUSSELS (Reuters) - NATO allies on Tuesday we...      0
44925  LONDON (Reuters) - LexisNexis, a provider of l...      0
44926  MINSK (Reuters) - In the shadow of disused Sov...      0
44927  MOSCOW (Reuters) - Vatican Secretary of State ...      0
44928  JAKARTA (Reuters) - Indonesia will buy 11 Sukh...      0

[44929 rows x 2 columns]>

In [27]:
# Getting rid of empty lines
df1 = df1[df1.articletext.isna() == False]
length_df1 = len(df1)

In [28]:
# Build sublist of original df1, contains # lines picked at random, out of 20671 possible
random_indexes = list(np.random.choice(length_df1 - 2, 3000, replace=False))
df1 = df1.iloc[random_indexes]

In [50]:
# import nltk
# nltk.download('vader_lexicon')

In [29]:
# Function dissects text i, attributes polarity scores, positive/negative/neutral, polarity or not, and subject

def sentiment_analyzer(dataframe):
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(i) for i in dataframe.articletext]
    compounds = np.array([i['compound'] for i in scores], dtype='float32')
    abs_compounds = np.array([np.sqrt(i ** 2) for i in compounds], dtype='float32')
    negs = np.array([i['neg'] for i in scores], dtype='float32')
    poss = np.array([i['pos'] for i in scores], dtype='float32')
    neus = np.array([i['neu'] for i in scores], dtype='float32')
    sent = dataframe['articletext'].apply(lambda x: TextBlob(x).sentiment)
    pol = np.array([s[0] for s in sent], dtype='float32')
    abs_pol = np.array([np.sqrt(i ** 2) for i in pol], dtype='float32')
    subj = np.array([s[1] for s in sent], dtype='float32')

    return compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj
    compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj = sentiment_analyzer(df1)


In [30]:
# Adding columns to df1, matching them with newly created variables
compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj = sentiment_analyzer(df1)

df1['compounds'] = compounds
df1['abs_compounds'] = abs_compounds
df1['negs'] = negs
df1['neus'] = neus
df1['poss'] = poss
df1['pol'] = pol
df1['abs_pol'] = abs_pol
df1['subj'] = subj
df1.head()

Unnamed: 0,articletext,label,compounds,abs_compounds,negs,neus,poss,pol,abs_pol,subj
34295,NEW YORK (Reuters) - Financially troubled Puer...,0,-0.991,0.991,0.133,0.777,0.09,0.062536,0.062536,0.483074
39427,DUBLIN (Reuters) - The Irish government would ...,0,0.9791,0.9791,0.058,0.742,0.201,0.123843,0.123843,0.231019
42065,"MANCHESTER, England (Reuters) - Prime Minister...",0,0.9459,0.9459,0.035,0.841,0.124,-0.128083,0.128083,0.259238
35950,ANKARA/ISTANBUL (Reuters) - Turkey criticized ...,0,0.9815,0.9815,0.075,0.811,0.114,0.007934,0.007934,0.323331
20454,"What a role model for women and young girls, a...",1,0.9967,0.9967,0.043,0.799,0.158,0.105835,0.105835,0.453447


In [None]:
##########################
### Logistic Regression ##
##########################

In [31]:
#Set X (predictor) and y (target) variables
X = df1[['compounds', 'negs', 'neus', 'poss', 'pol', 'subj']]
y = df1['label']


In [32]:
# First classifier
lrxtrain, lrxtest, lrytrain, lrytest = train_test_split(X, y)
lr = LogisticRegression()
lr.fit(lrxtrain, lrytrain)
lrpreds = lr.predict(lrxtest)
accuracy = accuracy_score(lrytest, lrpreds)
f1 = f1_score(lrytest, lrpreds)



In [33]:
# First attempt gives accuracy and f1 score of (0.7013, 0.7083)
print(accuracy, f1)

0.7013333333333334 0.7083333333333331


In [None]:
### 2. Improving our classifier using CountVectorizer 
#####################################################


In [34]:
#Train/Test Split
x_values = df1[['articletext', 'compounds', 'abs_compounds', 'negs', 'neus', 'poss', 'pol', 'abs_pol', 'subj']]
y_values = df1['label']
xtrain, xtest, ytrain, ytest = train_test_split(x_values, y_values)

In [35]:
# Cleans article from numbers, capital letters, punctuation and spaces for better classifier results
def clean_article(article):
    art = re.sub("[^A-Za-z0-9' ]", '', str(article))
    art2 = re.sub("[( ' )(' )( ')]", ' ', str(art))
    art3 = re.sub("\s[A-Za-z]\s", ' ', str(art2))
    return art3.lower()

In [36]:
# Initiate the model
model = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=998, max_df=1.0, min_df=1, binary=False)


In [37]:
# Fit and transform the vectorizor 
test_data = model.transform(xtest.articletext)
training_data

<2250x998 sparse matrix of type '<class 'numpy.int64'>'
	with 164983 stored elements in Compressed Sparse Row format>

In [38]:
dftrain = pd.DataFrame(training_data.toarray())
dftrain.columns = model.get_feature_names()

In [39]:
dftest = pd.DataFrame(test_data.toarray())
dftest.columns = model.get_feature_names()

In [40]:
# Create the predictions for y training data
lr2 = LogisticRegression()
lr2.fit(dftrain, ytrain)
lr2_preds = lr2.predict(dftest)
accuracy = accuracy_score(ytest, lr2_preds)
f1 = f1_score(ytest, lr2_preds)





0.984375

In [41]:
# Second attempt gives accuracy and f1 score of (0.9893, 0.9843)
print(accuracy, f1)

0.984 0.984375


In [46]:
### Test classifier on real life articles

def classify_real_articles():
    path = r'Users⁩\bolboceanunicolai⁩\Desktop⁩\fake_news_detector⁩\⁨articles\\'
    directory = os.fsencode(path)

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):

            article_import = open(path + filename, 'r')

            real_article = article_import.read()

            # Transformations to fit classifier format
            real_article = [real_article]
            real_article = model.transform(real_article)
            real_article = pd.DataFrame(real_article.toarray())
            real_article.columns = model.get_feature_names()

            real_article_pred = lr2.predict(real_article)
            print(real_article_pred)
            if real_article_pred[0] == 0:
                print(filename + " is probably real")
            else:
                print(filename + " is probably fake")


In [47]:
pickle.dump(lr2, open("model.pkl", "wb"), protocol=2)
pickle.dump(clean_article, open("clean_article.pkl", 'wb'))
pickle.dump(model, open("bow2.pkl", 'wb'), protocol=2)
print('\n')



