In [93]:
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier

import transformers as ppb # pytorch-pretrained-bert
import torch

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\madsv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
import pipeline as pp
import models as ml

import importlib

<module 'pipeline' from 'c:\\Users\\madsv\\Documents\\Documents\\University\\DataScience\\FakeNews\\src\\pipeline.py'>

In [3]:
# The number of rows to train the model
BATCH_SIZE = 1000000

In [266]:
importlib.reload(pp)

def Clean_data(file):
    stopwords_lst = stopwords.words('english') + ["<NUM>","<DATE>","<URL>","<EMAIL>"]
    cleaned_data = pp.apply_pipeline(file, [
            (pp.binary_labels(), 'type'),
            (pp.Clean_data(), 'content'),
            (pp.Tokenizer(), "content"),
            (pp.Remove_stopwords(stopwords_lst), "content"),
            (pp.Stem(), "content"),
        ], 
        get_batch=True, 
        batch_size=BATCH_SIZE)

    return cleaned_data


cleaned_data = Clean_data("../datasets/sample/dataset.csv")

In [267]:
importlib.reload(pp)

def Get_unique_words(cleaned_data):
    unique_words = pp.Generate_unique_word_list()
    pp.apply_pipeline_pd(cleaned_data, [
        (unique_words, "content")
    ])
    return unique_words.get_unique_words()

unique_words = Get_unique_words(cleaned_data)
print(unique_words)

['aa', 'aaa', 'aaaaaaaaaand', 'aaaaaand', 'aaaaah', 'aaacc', 'aaamok', 'aad', 'aadhar', 'aadvantag', 'aaf', 'aafca', 'aafia', 'aagc', 'aak', 'aaliyah', 'aamar', 'aamon', 'aan', 'aanbevelingen', 'aanegativea', 'aangenomen', 'aanleid', 'aap', 'aapl', 'aar', 'aard', 'aaron', 'aaronovich', 'aaronovitch', 'aaronsmorri', 'aarp', 'aastablea', 'aat', 'aaup', 'aaway', 'aaxa', 'ab', 'aba', 'ababa', 'aback', 'abacu', 'abada', 'abadi', 'abajo', 'abal', 'abalon', 'abanazar', 'abanderar', 'abandon', 'abandono', 'abarrel', 'abasolo', 'abassador', 'abat', 'abaya', 'abayomi', 'abayov', 'abb', 'abba', 'abbascontrol', 'abbasiya', 'abbass', 'abbau', 'abbey', 'abbi', 'abboccato', 'abbot', 'abbott', 'abbottabad', 'abbottadbad', 'abbrevi', 'abbv', 'abbvi', 'abc', 'abcg', 'abcnbc', 'abcnew', 'abd', 'abdal', 'abdalaziz', 'abdalhaleem', 'abdalla', 'abdallah', 'abdalmahmood', 'abdel', 'abdelaziz', 'abdelfatah', 'abdelfattah', 'abdelhakim', 'abdelhaleem', 'abdelhamid', 'abdellah', 'abdelmaj', 'abdelrahman', 'abde

In [259]:
print(hex(ord(unique_words[-5][0])))
print(unique_words[-5][0])

0x79
y


In [223]:
content = pp.apply_pipeline("../datasets/sample/dataset.csv", [], get_batch=True, batch_size=100)['content']
cleaned = pp.apply_pipeline("../datasets/sample/dataset.csv", [(pp.Clean_data(), 'content')], get_batch=True, batch_size=100)['content']
print(content[23])
print(cleaned[23])

Lawsuit Alleges Fox News Is Officially In the Tank For Donald Trump

% of readers think this story is Fact. Add your two cents.

Headline: Bitcoin & Blockchain Searches Exceed Trump! Blockchain Stocks Are Next!

Caricature by DonkeyHotey flic.kr/p/Ct4G4K

If you’ve ever wondered about the slavish bootlicking the Donald Trump gets across a broad spectrum of Fox News hosts (see here | here | here | here | here) the mystery has been solved. Months ago Roger Ailes agreed to do whatever he could to help Donald Trump.

We’ve had hints before. Back in January, Jeff Dunetz, writing at lidblog.com (best known on Twitter as @yidwithlid) reported on a conversation he had with Cheri Jacobus. This was just conversation that may or may not be accurately remembered. Quite honestly, I took it to be a bit overblown. The idea that Roger Ailes would turn Fox News into a Trump super PAC seemed far-fetched.

But there has always been the nagging doubts. For instance, Donald Trump has been acting like he is

In [271]:
importlib.reload(pp)

def Vectorize_content(cleaned_data, unique_words):
    vectors = pp.apply_pipeline_pd_tqdm(cleaned_data, [
        (pp.Create_word_vector(unique_words), "content")
    ])
    return vectors

vectors = Vectorize_content(cleaned_data, unique_words)
print(vectors["content"])

TfidfVectorizer performs the following operations:
* <strong>Tokenization:</strong> It breaks the text into individual words or tokens.
* <strong>Counting:</strong> It counts the number of occurrences of each token in each document.
* <strong>Normalization:</strong> It calculates the frequency of each token in each document by dividing the count by the total number of tokens in the document.
* <strong>Weighting:</strong> It applies the Tfidf weighting scheme to each token in each document. The Tfidf weight of a token in a document is proportional to its frequency in the document, but inversely proportional to its frequency in the corpus (i.e., the collection of all documents).

In [None]:
def Vectorize(file, to_csv_file_name=None):

    cleaned_data = pp.apply_pipeline(file, [
            (pp.binary_labels(), 'type'),
            (pp.Clean_data(), 'content')
        ], 
        get_batch=True, 
        batch_size=BATCH_SIZE)

    print("Shape of data: ", cleaned_data.shape)

    #TODO: how to avoid leaking information from the test set? 
    vect = TfidfVectorizer(stop_words='english', max_df=0.7)
    vect.fit(cleaned_data['content'])
    
    print("Vocabulary: ", len(vect.vocabulary_), " words")
    content_tfidf = vect.transform(cleaned_data['content'])
    print("Shape of vect: ", content_tfidf.shape)

    vectorized_data = {
        "X": content_tfidf,
        "y": cleaned_data['type']
    }

    if to_csv_file_name != None:
        content_tfidf_df = pd.DataFrame(content_tfidf.todense(),columns = vect.get_feature_names_out())
        content_tfidf_df.to_csv(to_csv_file_name, index=False)

    print("Done vectorizing data!")
    return vectorized_data

vectorized_data = Vectorize("../datasets/sample/dataset.csv")

In [28]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data["X"], vectorized_data["y"].astype(int), test_size=0.40, random_state=42)

In [34]:
def support_vector_classifier(X_train, X_test, y_train, y_test):
    # Define the classifier classes
    svc = SVC(kernel='linear')

    # Fit the model
    svc.fit(X_train, y_train)

    # Predict on the test set
    svc_pred = svc.predict(X_test)

    # Evaluate performance
    print("svc accuracy: " + str(accuracy_score(y_test, svc_pred)))

In [35]:
support_vector_classifier(X_train, X_test, y_train, y_test)

svc accuracy: 0.805


In [36]:
def k_neighbors_classifier(X_train, X_test, y_train, y_test):
    # Define the classifier classes
    k_nearest = KNeighborsClassifier(n_neighbors=15, weights='distance')

    # Fit the model
    k_nearest.fit(X_train, y_train)

    # Predict on the test set
    k_nearest_pred = k_nearest.predict(X_test)

    # Evaluate performance
    print("k_nearest accuracy:", accuracy_score(y_test, k_nearest_pred))


In [37]:
k_neighbors_classifier(X_train, X_test, y_train, y_test)

k_nearest accuracy: 0.75875


In [38]:
def passive_aggressive_classifier(X_train, X_test, y_train, y_test):
    # Define the classifier classes
    passive_aggressive = PassiveAggressiveClassifier()

    # Fit the model
    passive_aggressive.fit(X_train, y_train)

    # Predict on the test set
    passive_aggressive_pred = passive_aggressive.predict(X_test)

    # Evaluate performance
    print("passive_aggressive accuracy:", accuracy_score(y_test, passive_aggressive_pred))

In [39]:
passive_aggressive_classifier(X_train, X_test, y_train, y_test)

passive_aggressive accuracy: 0.79875


In [23]:
unique, counts = np.unique(y_train, return_counts=True)
total = sum(counts)
print("counts: ", dict(zip(unique, counts)))
print("count freq", dict(zip(unique, counts/total)))

counts:  {0: 2737, 1: 3263}
count freq {0: 0.45616666666666666, 1: 0.5438333333333333}


In [26]:
def distilBERT(file):
    content = pp.apply_pipeline(file, [], get_batch=True, batch_size=BATCH_SIZE)['content']

    #Tokenizer and model input
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = ppb.DistilBertTokenizer.from_pretrained(pretrained_weights)
    model = ppb.DistilBertModel.from_pretrained(pretrained_weights, from_tf=True)

    #Tokenize input
    tokenized = content.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    model.eval()

    #Pad input so that all sequences are of the same size:
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    padded = padded[:,:32]

    # Tell embedding model to disregard pad tokens
    attention_mask = np.where(padded != 0, 1, 0)
    
    device = torch.device("cpu")

    if torch.cuda.is_available():
      model = model.cuda()
      device = torch.device("cuda")

    # Convert input to a pytorch tensor
    input = torch.tensor(np.array(padded), device=device)
    attention_mask = torch.tensor(attention_mask, device=device)

    # Embed sequences (processing in batches to avoid memory problems)
    batch_size= 200
    embeddings = []

    for start_index in range(0, input.shape[0], batch_size):
      with torch.no_grad():
        # Call embedding model
        embedding = model(input[start_index:start_index+batch_size], 
                          attention_mask=attention_mask[start_index:start_index+batch_size])[0][:,0,:]
        embeddings.append(embedding)
    embeddings = torch.cat(embeddings)   # concatenate all batch outputs back into one tensor

    # Move embeddings back to numpy
    embeddings = embeddings.cpu().numpy()
    
    return embeddings
  
embeddings = distilBERT("../datasets/sample/dataset.csv")

All TF 2.0 model weights were used when initializing DistilBertModel.

All the weights of DistilBertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertModel for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (1771 > 512). Running this sequence through the model will result in indexing errors


In [None]:
labels = pp.apply_pipeline("../datasets/sample/train.csv", [], get_batch=True, batch_size=BATCH_SIZE)['type']
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

layers = [1,2,3,4,5]
layer_sizes = [2,5,8,11,14]
tuple_list = []

for layer_size in layer_sizes:
    for layer in layers:
        tuple_list.append((layer_size,) * layer)
        
inputs = {'hidden_layer_sizes': tuple_list}

# Define the classifier classes
MLP = MLPClassifier()


#Gridsearch
cross_val = GridSearchCV(MLP, inputs)
print("Best parameters: ", cross_val.best_params_")

# Fit the model
cross_val.fit(X_train, y_train)
