In [3]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy 
import json
import random
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import os

In [8]:
def rand_emot():
    e = ["(o_o)",":-)",":P",":D","x)","ᓚᘏᗢ","╯°□°）╯︵ ┻━┻",":)",
         "*<:-)","^_^","(⌐■_■)","¯\_(ツ)_/¯", "(T_T)",":o","OwO",
        "( ͡❛ ͜ʖ ͡❛)","(̶◉͛‿◉̶)","( ≖.≖)","(ㆆ_ㆆ)","ʕ•́ᴥ•̀ʔっ","( ◡́.◡̀)","(^◡^ )"]
    return random.choice(e)

def read_data(path): #input the path to the directory with data
    frames = []
    
    _, _, files = next(os.walk(path)) #create a list of all datafile names     
          
    for file in tqdm(files): #for every file in directory
        with open(path+"/"+file) as f: #read each file
            dataframe = pd.read_json(f) #convert file to dataframe
     
        frames.append(dataframe) #append each dataframe to list
    data = pd.concat(frames, sort=False) #make it one big dataframe
    
    return data
    

In [9]:
data = read_data("scraped")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=31.0), HTML(value='')))




In [10]:
l = data["Domain"].value_counts().index.tolist()
l

['politiken.dk',
 'ekstrabladet.dk',
 'computerworld.dk',
 'jv.dk',
 'berlingske.dk',
 'jyllands-posten.dk',
 'bold.dk',
 'dr.dk',
 'bt.dk',
 'stiften.dk',
 'soundvenue.com',
 'altinget.dk',
 'sn.dk',
 'ing.dk',
 'gaffa.dk',
 'tv2.dk',
 'finans.dk',
 'version2.dk',
 'journalisten.dk',
 'information.dk',
 'fyens.dk',
 'nordjyske.dk',
 'ekkofilm.dk',
 'seoghoer.dk',
 'tv2east.dk',
 'billedbladet.dk',
 'finanswatch.dk',
 'fagbladet3f.dk',
 'ejendomswatch.dk',
 'borsen.dk',
 'motormagasinet.dk',
 'helsingordagblad.dk',
 'fodevarewatch.dk',
 'jiyan.dk',
 'pov.international',
 'samvirke.dk',
 'ugeavisen.dk',
 'skalvilege.nu',
 'kulturmonitor.dk',
 'heleherlev.dk',
 'videnskab.dk',
 'mobilabonnement.dk',
 'bilsektionen.dk',
 'dsr.dk',
 'wood-supply.dk',
 'amtsavisen.dk',
 'turisme.nu',
 'herningfolkeblad.dk',
 'prosa.dk',
 'aktieraadet.dk',
 'alt.dk',
 'altomvarebiler.dk',
 'betxpert.com',
 'finansbureauet.dk',
 'tv2lorry.dk',
 'nordea.com',
 'dagensmedicin.dk',
 'itsfashionbaby.dk',
 'dfi.dk

In [11]:
frames = []

domains_of_interest = ['politiken.dk',
 'ekstrabladet.dk',
 'computerworld.dk',
 'finans.dk',
 'berlingske.dk',
 'fyens.dk',
 'dr.dk',
 'journalisten.dk',
 'tv2.dk',
 'jyllands-posten.dk',
 'information.dk',
 'jv.dk',
 'sn.dk',
 'bt.dk',
 'version2.dk']

for domain in domains_of_interest: 
    train = data.loc[(data["Domain"] == domain)][:1000]
    frames.append(train)

trainset = pd.concat(frames, sort=False)

bodies = list(trainset["Body"])
domains = list(trainset["Domain"])

In [12]:
len(domains)

15000

In [13]:
train, test, train_labels, test_labels = train_test_split(bodies, domains, test_size=0.1, random_state=42)

In [14]:
def remove_punctuation(text):
    return re.sub(r',|\.|:|!|\?|;', '', text)

In [15]:
#POS tagger trained on Danish news and media corpus
POS_tagger_DK = spacy.load("da_core_news_md")

### NGram features - word, characters, POS-tags

In [24]:
#Trains a TF-IDF vectorizer of word n-grams
def word_ngram_vectorizer(train_corpus, n): 
    
    vectorizer = TfidfVectorizer(max_features=3000, analyzer="word", ngram_range=(n,n))
    print("training vectorizer...",rand_emot())
    
    X = vectorizer.fit_transform(train_corpus)
    print("vectorizer fit!", rand_emot())
    
    ngrams = vectorizer.get_feature_names()
      
    return X, vectorizer

#Trains a TF-IDF vectorizer of character n-grams
def char_ngram_vectorizer(train_corpus, n): 
    
    vectorizer = TfidfVectorizer(max_features=3000, analyzer="char", ngram_range=(n,n))
    print("training vectorizer...",rand_emot())
    
    X = vectorizer.fit_transform(train_corpus)
    print("vectorizer fit!", rand_emot())
    
    ngrams = vectorizer.get_feature_names()
    
    return X, vectorizer

#Trains a TF-IDF vectorizer of POS n-grams. A POS corpus is generated in the function using a tagger for Danish
def POS_ngram_vectorizer(train_corpus, n): 
    
    #Create POS corpus
    POS_corpus = []

    for doc in train_corpus:
        tagged_doc = POS_tagger_DK(doc) #tag each document in corpus with POS tags using spacy
        POS_list = []

        for token in tagged_doc:
            POS_list.append(token.pos_)

        #concatenate as POS tags for the document
        POS_text = " ".join(POS_list)
        POS_corpus.append(POS_text)

    
    vectorizer = TfidfVectorizer(max_features=3000, analyzer="word", ngram_range=(n,n))
    print("training vectorizer...",rand_emot())
    
    X = vectorizer.fit_transform(POS_corpus)
    print("vectorizer fit!", rand_emot())
    
    ngrams = vectorizer.get_feature_names()
    
    # Returns 
    return X, vectorizer

# Gets weights for terms based on trained vectorizer
# Works for both word and character ngrams and skipgrams
def get_tfidf_ngrams(vectorizer, test_corpus):
    '''Returns the TF-IDF weighted ngram frequencies of test documents'''
    #Multiple texts required
    return vectorizer.transform(test_corpus)

# Function generates POS test corpus first and then gets weights for terms based on trained vectorizer. 
def get_tfidf_POS_ngrams(vectorizer, test_corpus):
    '''Returns the TF-IDF weighted ngram frequencies of test documents'''
    #Create POS corpus
    POS_corpus = []

    for doc in test_corpus:
        tagged_doc = POS_tagger_DK(doc) #tag each document in corpus with POS tags using spacy
        POS_list = []

        for token in tagged_doc:
            POS_list.append(token.pos_)

        #concatenate POS tags as one string, i.e. the documented represented as the POS tags
        POS_text = " ".join(POS_list)
        POS_corpus.append(POS_text)
    
    #Multiple texts required
    return vectorizer.transform(POS_corpus)

def get_tfidf(vectorizer, test_corpus):
    '''Returns the TF-IDF weighted word frequencies of test documents'''
    #Multiple texts required
    return vectorizer.transform(test_corpus).toarray()
    


In [9]:
X_unigrams_train,unigram_vectorizer = word_ngram_vectorizer(train,1)
X_char_unigrams_train,char_unigram_vectorizer = char_ngram_vectorizer(train,1)
X_POS_unigram_train, POS_unigram_vectorizer = POS_ngram_vectorizer(train,1)

X_bigrams_train,bigram_vectorizer = word_ngram_vectorizer(train,2)
X_char_bigrams_train,char_bigram_vectorizer = char_ngram_vectorizer(train,2)
X_POS_bigram_train, POS_bigram_vectorizer = POS_ngram_vectorizer(train,2)

X_trigrams_train,trigram_vectorizer = word_ngram_vectorizer(train,3)
X_char_trigrams_train,char_trigram_vectorizer = char_ngram_vectorizer(train,3)
X_POS_trigram_train, POS_trigram_vectorizer = POS_ngram_vectorizer(train,3)

training vectorizer... :P
vectorizer fit! ¯\_(ツ)_/¯
training vectorizer... ( ≖.≖)
vectorizer fit! ( ◡́.◡̀)
training vectorizer... (T_T)
vectorizer fit! (ㆆ_ㆆ)
training vectorizer... ʕ•́ᴥ•̀ʔっ
vectorizer fit! :-)
training vectorizer... :P
vectorizer fit! ^_^
training vectorizer... ( ≖.≖)
vectorizer fit! ʕ•́ᴥ•̀ʔっ
training vectorizer... ( ͡❛ ͜ʖ ͡❛)
vectorizer fit! x)
training vectorizer... :-)
vectorizer fit! OwO
training vectorizer... OwO
vectorizer fit! (^◡^ )


In [43]:
X_unigrams_test = get_tfidf_ngrams(unigram_vectorizer, test).toarray()
X_char_unigrams_test = get_tfidf_ngrams(char_unigram_vectorizer, test).toarray()
X_POS_unigram_test = get_tfidf_POS_ngrams(POS_unigram_vectorizer, test).toarray()

X_bigrams_test = get_tfidf_ngrams(bigram_vectorizer, test).toarray()
X_char_bigrams_test = get_tfidf_ngrams(char_bigram_vectorizer, test).toarray()
X_POS_bigram_test = get_tfidf_POS_ngrams(POS_bigram_vectorizer, test).toarray()

X_trigrams_test = get_tfidf_ngrams(trigram_vectorizer, test).toarray()
X_char_trigrams_test = get_tfidf_ngrams(char_trigram_vectorizer, test).toarray()
X_POS_trigram_test = get_tfidf_POS_ngrams(POS_trigram_vectorizer, test).toarray()

In [44]:
from scipy.sparse import hstack

In [152]:
X_train = hstack((X_unigrams_train,X_bigrams_train))
X_train = hstack((X_train,X_trigrams_train))
X_train = hstack((X_train,X_char_unigrams_train))
X_train = hstack((X_train,X_char_bigrams_train))
X_train = hstack((X_train,X_char_trigrams_train))
X_train = hstack((X_train,X_POS_unigram_train))
X_train = hstack((X_train,X_POS_bigram_train))
X_train = hstack((X_train,X_POS_trigram_train))



In [153]:
X_test = np.hstack((X_unigrams_test,X_bigrams_test))
X_test = np.hstack((X_test,X_trigrams_test))
X_test = np.hstack((X_test,X_char_unigrams_test))
X_test = np.hstack((X_test,X_char_bigrams_test))
X_test = np.hstack((X_test,X_char_trigrams_test))
X_test = np.hstack((X_test,X_POS_unigram_test))
X_test = np.hstack((X_test,X_POS_bigram_test))
X_test = np.hstack((X_test,X_POS_trigram_test))


In [134]:
X_train.shape, X_test.shape

((225, 15991), (25, 15991))

### Function words

In [2]:
with open('function_words/funktionsord.txt', "r", encoding="utf-8") as fw:
    func_words = fw.read().split("\n")

In [16]:
for i, word in enumerate(func_words):
    word = remove_punctuation(word)
    func_words[i] = word
    if " " in word:
        func_words.remove(word)   

In [31]:
def function_words(trainset, testset, function_words): 
    
    fw_corpus = []
    
    for document in tqdm(trainset): 
        fw_document = ""
        document = remove_punctuation(document)
        words = document.split() #split text into words
        
        for word in words: 
            if word in function_words:
                fw_document = fw_document + word + " "
        
        fw_corpus.append(fw_document)
        
    
    vectorizer = TfidfVectorizer(max_features=3000, analyzer="word")
    print("training vectorizer...",rand_emot())
    
    X = vectorizer.fit_transform(fw_corpus)
    print("vectorizer fit!", rand_emot())
    
    fw_vectorized = get_tfidf(vectorizer, testset)
       
    return X, fw_vectorized, fw_corpus

In [135]:
X_fw_train, X_fw_test, fw_corpus = function_words(train, test, func_words)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=225.0), HTML(value='')))


training vectorizer... :-)
vectorizer fit! x)


In [171]:
X_train = np.hstack((X_train.todense(), X_fw_train.todense()))
X_test = np.hstack((X_test, X_fw_test))

In [172]:
print(X_train.shape), print(X_test.shape)

(225, 16129)
(25, 16129)


(None, None)

In [142]:
import scipy.sparse
from scipy.sparse import hstack

### Skipgrams

In [20]:
# from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk import skipgrams

In [178]:
# stop_words = set(stopwords.words("danish"))

In [179]:
# corpus_without_stopwords =[]

# for doc in tqdm(sub_bodies): 
#     temp_doc = []
#     for word in doc.lower().split(): 
#         if word not in stop_words: 
#             temp_doc.append(word)
#     corpus_without_stopwords.append(" ".join(temp_doc))

In [180]:
# new_corpus =[]

# for doc in corpus_without_stopwords: 
#     split_doc = doc.split(" ")
#     new_corpus.append(split_doc)
# corpus = new_corpus

In [181]:
# w2v_model = Word2Vec(corpus, sg=1)

In [182]:
# w2v_model.most_similar("mand", topn=5)

In [21]:
def skipgram_vectorizer(train_corpus, n, k): #k = step (skip) size
    
    skipgram_corpus = []
    
    for doc in tqdm(train_corpus): 
        l = list(skipgrams(doc.split(), n, k))
        new_doc = ' '.join([' '.join(x) for x in l]) #concatenate skipgrams as new representation of doc
        skipgram_corpus.append(new_doc)
    
    vectorizer = TfidfVectorizer(max_features=3000, analyzer="word", ngram_range=(n,n))
    print("training vectorizer...",rand_emot())
    
    X = vectorizer.fit_transform(skipgram_corpus)
    print("vectorizer fit!", rand_emot())
    
    ngrams = vectorizer.get_feature_names()
      
    return X, vectorizer, ngrams

In [22]:
X_skip_bigrams, skip_vectorizer, skipgram_names = skipgram_vectorizer(train, 2, 2)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13500.0), HTML(value='')))


training vectorizer... ╯°□°）╯︵ ┻━┻
vectorizer fit! (o_o)


In [25]:
skip_bigrams_vectorized = get_tfidf_ngrams(skip_vectorizer, test) # returns X_test for skipgrams

In [26]:
X_train = X_skip_bigrams
X_test = skip_bigrams_vectorized

# Classification

In [27]:
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import f1_score

### K-Nearest Neighbours

In [28]:
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
KNN_classifier.fit(X_train, train_labels)
KNN_prediction = KNN_classifier.predict(X_test)
KNN_score = KNN_classifier.score(X_test, test_labels)
KNN_f1 = f1_score(test_labels, KNN_prediction, average="weighted")

### Dummy Classifier

In [29]:
dummy_classifier = DummyClassifier(strategy="prior")
dummy_classifier.fit(X_train, train_labels)
dummy_prediction = dummy_classifier.predict(X_test)
dummy_score = dummy_classifier.score(X_test, test_labels)
dummy_f1 = f1_score(test_labels, dummy_prediction, average="weighted")

### Random Forest Classifier

In [30]:
RandomForest_classifier = RandomForestClassifier(random_state=42)
RandomForest_classifier.fit(X_train, train_labels)
RandomForest_prediction = RandomForest_classifier.predict(X_test)
RandomForest_score = RandomForest_classifier.score(X_test, test_labels)
RandomForest_f1 = f1_score(test_labels, RandomForest_prediction, average="weighted")
RandomForest_classifier.feature_importances_

array([1.72854410e-04, 1.53155723e-04, 6.49239403e-05, ...,
       2.45455418e-05, 9.62685393e-04, 1.21588330e-04])

### Naive Bayes Classifier

In [33]:
NaiveBayes_classifier = GaussianNB()
NaiveBayes_classifier.fit(X_train.toarray(), train_labels)
NaiveBayes_prediction = NaiveBayes_classifier.predict(X_test.toarray())
NaiveBayes_score = NaiveBayes_classifier.score(X_test.toarray(), test_labels)
NaiveBayes_f1 = f1_score(test_labels, NaiveBayes_prediction, average="weighted")

### Support Vector Machine 

In [None]:
SVM_classifier = svm.SVC()
SVM_classifier.fit(X_train, train_labels)
SVM_prediction = SVM_classifier.predict(X_test)
SVM_score = SVM_classifier.score(X_test, test_labels)
SVM_f1 = f1_score(test_labels, SVM_prediction, average="weighted")

# Evaluation

In [None]:
print("Evaluation of Newspaper attribution\n")
print("Accuracy scores:\n KNN: {}\n Dummy: {}\n Random Forest: {}\n Naive Bayes: {}\n SVM: {} \n\n".format(KNN_score, dummy_score, RandomForest_score, NaiveBayes_score, SVM_score))
print("F1 scores:\n KNN: {}\n Dummy: {}\n Random Forest: {}\n Naive Bayes: {}\n SVM: {} \n\n".format(KNN_f1, dummy_f1, RandomForest_f1, NaiveBayes_f1, SVM_f1))