### Get the data.

Download the dataset from https://surfdrive.surf.nl/files/index.php/s/bfNFkuUVoVtiyuk. This is a subset of the data from https://doi.org/10.7910/DVN/YHWTFC. 

In [1]:
from glob import glob

# Collect the text of every article stored under an Infowars directory
infowarsfiles = glob('articles/*/Infowars/*')
infowarsarticles = []

for filename in infowarsfiles:
    # Only the open() call can raise FileNotFoundError (e.g. a file removed
    # after the glob ran), so keep the try block limited to it.
    try:
        f = open(filename)
    except FileNotFoundError:
        print(f"File not found: {filename}")
        continue

    # The context manager closes the handle once the text has been read
    with f:
        infowarsarticles.append(f.read())

In [12]:
import pandas as pd
import os
import re

# Define a generator function to read the data
def read_data(listofoutlets):
    """Lazily yield every article of the requested outlets.

    Parameters
    ----------
    listofoutlets : iterable of str
        Outlet directory names to look for below ``articles/<any folder>/``.

    Yields
    ------
    tuple of (str, str, str)
        ``(label, file, text)``: the outlet name, the path of the article
        file, and the full text read from that file.
    """
    for label in listofoutlets:
        # glob() returns matches in arbitrary, filesystem-dependent order;
        # sorting makes the row order identical on every machine and run.
        for file in sorted(glob(f'articles/*/{label}/*')):
            # Explicit encoding so reading does not depend on the OS locale
            with open(file, encoding='utf-8') as f:
                yield label, file, f.read()

# Use the generator to read the articles and set the corresponding labels
data = list(read_data(['Infowars', 'BBC', 'The Guardian']))

# Create a DataFrame from the data; the 'data' column temporarily holds each file path
df = pd.DataFrame(data, columns=['outlets', 'data', 'article_text'])


def _date_from_path(path):
    """Return the YYYY-MM-DD date enclosed in '--' separators in `path`, or None if absent."""
    # Search once and reuse the match instead of running the regex twice per row
    match = re.search(r'--(\d{4}-\d{2}-\d{2})--', path)
    return match.group(1) if match else None


# Extract 'date' and 'article_title' from the file path stored in the 'data' column
df['date'] = df['data'].apply(_date_from_path)
# The title is whatever follows the last '--' in the file name
df['article_title'] = df['data'].apply(lambda x: os.path.basename(x).split('--')[-1])

# Drop the path column as it's no longer needed (reassign rather than mutate in place)
df = df.drop(columns=['data'])

# Report outcome
print(f"Number of documents: {len(df)}")
print(f"Number of unique outlets: {df['outlets'].nunique()}")

df.head(10)

Number of documents: 2908
Number of unique outlets: 3


Unnamed: 0,outlets,article_text,date,article_title
0,Infowars,A high school in Vermonts capital raised a Bla...,2018-02-02,Black Lives Matter Flag Flies Over Vermont Sch...
1,Infowars,A Monmouth Poll released on Wednesday that sho...,2018-02-02,Democratic Generic Congressional Ballot Advant...
2,Infowars,"Thursday on Fox News Channels The Story, Rep. ...",2018-02-02,GOP Rep Gaetz Responds to Comey Tweet I Dont K...
3,Infowars,Hillary Clinton has passed over a million doll...,2018-02-02,Hillary Has Sent Over 1 Million From Onward To...
4,Infowars,The Department of Immigration and Customs Enfo...,2018-02-02,ICE Raids 77 Northern California Businesses
5,Infowars,President Donald Trumps approval rating is now...,2018-02-02,Trump Approval Rating Almost Tied With Obama A...
6,Infowars,President Trump took to Twitter on Thursday ni...,2018-02-02,Trump Democrats just arent calling on DACA
7,Infowars,Donald Trump has lashed out at the leadership ...,2018-02-02,Trump Leadership of FBI Justice Dept politiciz...
8,Infowars,The classified Republican memo that has become...,2018-02-02,Washington braces for polarizing Nunes memo
9,Infowars,President Donald Trump celebrated the Friday r...,2018-02-03,Collusion Is Dead Trump Celebrates Release of ...


In [13]:
from glob import glob  # allow to search for files and directories that match a specified pattern.

# Define Function to retrieve news articles from specified news outlets
def read_data(listofoutlets):
    """Read all articles of the requested outlets into parallel lists.

    Parameters
    ----------
    listofoutlets : iterable of str
        Outlet directory names to look for below ``articles/<any folder>/``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(texts, labels)``: ``texts[i]`` is the content of one article file
        and ``labels[i]`` is the outlet name it was found under.
    """
    texts = []
    labels = []

    for label in listofoutlets:
        # glob() returns matches in arbitrary, filesystem-dependent order.
        # Sorting keeps the order fixed so a seeded train/test split of the
        # result is actually reproducible.
        for file in sorted(glob(f'articles/*/{label}/*')):
            try:
                # Explicit encoding so reading does not depend on the OS locale
                with open(file, encoding='utf-8') as f:
                    text = f.read()
            except FileNotFoundError:
                # Still best-effort, but report the skipped file instead of
                # silently dropping it
                print(f"File not found: {file}")
                continue

            # Append text and label together so the two lists stay aligned
            texts.append(text)
            labels.append(label)
    return texts, labels

# Load the corpus: X holds the article texts, y the matching outlet labels
X, y = read_data(
    listofoutlets=['Infowars', 'BBC', 'The Guardian'],  # choose your own news outlets
)

### Split Data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles as a test set; the fixed seed keeps the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Vectorize the data

The goal is to train a classifier that predicts whether an article comes from a fake news source (e.g., `Infowars`) or a quality news outlet (e.g., `BBC`). In other words, I want to predict `source` based on linguistic variations in the articles.

To arrive at a model that does just that, I transform 'text' into 'features' by defining different vectorizers, with the following options:
- `count` vs. `tfidf` vectorizers
- with/ without pruning
- with/ without stopword removal

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer 
from sklearn.model_selection import train_test_split

In [17]:
# TF-IDF vectorizer: Treebank tokenisation, English stop-word removal and frequency pruning
vectorizer_TFIDF = TfidfVectorizer(
    tokenizer=TreebankWordTokenizer().tokenize,
    stop_words='english',
    max_df=.75,  # ignore terms that occur in more than 75% of the documents
    min_df=2,    # ignore terms that occur in fewer than 2 documents
)

In [18]:
# Count vectorizer: raw term counts with the same tokenisation, stop-word removal and pruning
vectorizer_count = CountVectorizer(
    tokenizer=TreebankWordTokenizer().tokenize,
    stop_words='english',
    max_df=.75,  # ignore terms that occur in more than 75% of the documents
    min_df=2,    # ignore terms that occur in fewer than 2 documents
)

**Why do we call `.fit_transform` on the train set, but only `.transform` on the test set?**

- fit_transform on Training Data: Learns the vocabulary and IDF values from the training data and transforms the training data into a TF-IDF matrix.
- transform on Test Data: Uses the learned vocabulary and IDF values from the training data to transform the test data into a TF-IDF matrix, ensuring consistency and preventing data leakage.


In [24]:
# Training data: each vectorizer learns its vocabulary here and converts the texts in one step
X_TFIDFfeatures_train = vectorizer_TFIDF.fit_transform(X_train)
X_countfeatures_train = vectorizer_count.fit_transform(X_train)

# Test data: only transformed, reusing what was learned from the training data
X_TFIDFfeatures_test = vectorizer_TFIDF.transform(X_test)
X_countfeatures_test = vectorizer_count.transform(X_test)



In [21]:
# View the sparse TF-IDF training matrix as a dense table
import pandas as pd

# The TF-IDF training matrix is named X_TFIDFfeatures_train; the previous
# reference to X_features_train raised a NameError on a fresh kernel.
# NOTE: .toarray() materialises the whole document-term matrix in memory.
# Fine for a few thousand articles; slice the sparse matrix first on larger corpora.
X_features_train_df = pd.DataFrame(X_TFIDFfeatures_train.toarray(),
                                   columns=vectorizer_TFIDF.get_feature_names_out())

# Peek at ten of the vocabulary columns
X_features_train_df.iloc[:, 10:20]

Unnamed: 0,'backside,'bad,'blink,'blue,'but,'curious,'d,'do,'excuse,'free
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Fit a Classifier - Compare Metrics

In [28]:
from sklearn.naive_bayes import MultinomialNB #import the Multinomial Naive Bayes model classifier model
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#### Multinomial Naive Bayes Classifier

In [23]:
# Multinomial Naive Bayes evaluated on the TfidfVectorizer features
model = MultinomialNB().fit(X_TFIDFfeatures_train, y_train)  # fit() returns the fitted estimator

# Predict the outlet of every held-out article
y_pred = model.predict(X_TFIDFfeatures_test)

print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy : 0.865979381443299
              precision    recall  f1-score   support

         BBC       0.93      0.96      0.95       202
    Infowars       0.93      0.73      0.82       221
The Guardian       0.74      0.93      0.83       159

    accuracy                           0.87       582
   macro avg       0.87      0.87      0.86       582
weighted avg       0.88      0.87      0.87       582



In [25]:
# Multinomial Naive Bayes evaluated on the CountVectorizer features
model = MultinomialNB().fit(X_countfeatures_train, y_train)  # fit() returns the fitted estimator

# Predict the outlet of every held-out article
y_pred = model.predict(X_countfeatures_test)

print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy : 0.8848797250859106
              precision    recall  f1-score   support

         BBC       0.97      0.96      0.96       202
    Infowars       0.93      0.78      0.85       221
The Guardian       0.76      0.94      0.84       159

    accuracy                           0.88       582
   macro avg       0.89      0.89      0.88       582
weighted avg       0.90      0.88      0.89       582



#### Logistic Regression

In [29]:
# Logistic regression evaluated on the TfidfVectorizer features
model = LogisticRegression().fit(X_TFIDFfeatures_train, y_train)  # fit() returns the fitted estimator

# Predict the outlet of every held-out article
y_pred = model.predict(X_TFIDFfeatures_test)

print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy : 0.929553264604811
              precision    recall  f1-score   support

         BBC       1.00      0.91      0.95       202
    Infowars       0.88      0.95      0.91       221
The Guardian       0.92      0.93      0.93       159

    accuracy                           0.93       582
   macro avg       0.93      0.93      0.93       582
weighted avg       0.93      0.93      0.93       582



In [30]:
# Logistic regression evaluated on the CountVectorizer features
model = LogisticRegression().fit(X_countfeatures_train, y_train)  # fit() returns the fitted estimator

# Predict the outlet of every held-out article
y_pred = model.predict(X_countfeatures_test)

print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy : 0.9432989690721649
              precision    recall  f1-score   support

         BBC       0.96      0.98      0.97       202
    Infowars       0.93      0.93      0.93       221
The Guardian       0.94      0.92      0.93       159

    accuracy                           0.94       582
   macro avg       0.94      0.94      0.94       582
weighted avg       0.94      0.94      0.94       582

