# Here we go again...

In [None]:
from simpletransformers.language_representation import RepresentationModel
from simpletransformers.config.model_args import ModelArgs
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import random
import os
import re
import warnings
warnings.filterwarnings('ignore')

## Read data

Call `read_data` with the path to the directory that contains the desired dataset file(s).

In [None]:
def read_data(path): #input the path to the directory with data
    """Read every file directly inside ``path`` as JSON and combine them.

    Parameters
    ----------
    path : str
        Directory holding the data files. Only files at the top level of the
        directory are read; sub-directories are not searched.

    Returns
    -------
    data : pandas.DataFrame
        All per-file dataframes concatenated (the original indices are kept).
    frames : list of pandas.DataFrame
        The individual dataframes, one per file, in sorted file-name order.
    """
    frames = []

    _, _, files = next(os.walk(path)) #names of all files directly inside path

    #os.walk gives no ordering guarantee; sort so the row order (and therefore
    #every seeded split made from it) is the same on every machine
    for file in tqdm(sorted(files)): #for every file in directory
        #assumes the files are UTF-8 encoded JSON; being explicit avoids
        #platform-dependent default encodings mangling non-ASCII characters
        with open(os.path.join(path, file), encoding="utf-8") as f: #read each file
            dataframe = pd.read_json(f) #convert file to dataframe

        frames.append(dataframe) #append each dataframe to list
    data = pd.concat(frames, sort=False) #make it one big dataframe

    return data, frames

In [None]:
# Load the author subset: one combined dataframe plus the list of per-file dataframes
author_subset, author_subset_df_list = read_data("final_subsets/final_author_subset")

In [None]:
# Load the domain subset: one combined dataframe plus the list of per-file dataframes
domain_subset, domain_subset_df_list = read_data("final_subsets/final_domain_subset")

## Encoding entire dataset

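When running the function that encodes the dataset, pass the article bodies as a list of strings together with the matching list of target labels (authors or domains). Both can be taken from the combined dataframe - the 1st object that's returned from the ```read_data``` function.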

In [None]:
def remove_symbols(text):
    """Return ``text`` with every run of non-word characters replaced by one space.

    Word characters are those matched by the regex class ``\\w``; anything else
    (punctuation, whitespace, symbols) is collapsed into a single space.
    """
    # raw string: '\W' in a normal string literal is an invalid escape sequence
    return re.sub(r'\W+', ' ', text)

In [None]:
def encode_dataset(bodies, target, file_name="train", subset="authors", training_epochs=1): #bodies = bodies of dataset, target = target values, i.e. domains or authors
    """Encode lowercased texts with the Danish BERT model and save the result as .npy files.

    Parameters
    ----------
    bodies : iterable of str
        Texts to encode. Each one is lowercased first; no other cleaning is done.
    target : sequence
        Labels belonging to ``bodies``; handed to ``np.save`` unchanged.
    file_name : str
        Prefix of the written files, e.g. "train" or "test".
    subset : str
        "authors" or "domains" selects the subset-specific output directory.
        Any other value only produces the generic files in ``auto_encodings/``.
    training_epochs : int
        Forwarded to ``ModelArgs(num_train_epochs=...)``.

    Returns
    -------
    None
        The encodings and targets are written to disk, not returned.
    """
    subset_dirs = {
        "authors": "auto_encodings/author_encodings",
        "domains": "auto_encodings/domain_encodings",
    }
    subset_dir = subset_dirs.get(subset)

    #create the output directories before the slow encoding step, so a missing
    #directory cannot make np.save fail after all the work has been done
    os.makedirs("auto_encodings", exist_ok=True)
    if subset_dir is not None:
        os.makedirs(subset_dir, exist_ok=True)

    model_args = ModelArgs(encoding="utf-8", manual_seed=42, num_train_epochs=training_epochs)

    print("Initializing Representation Model")
    model = RepresentationModel(
                model_type='bert',
                model_name='Maltehb/danish-bert-botxo',
                args=model_args,
                use_cuda=False)

    #lowercase the bodies (punctuation is NOT removed here)
    lower_bodies = [text.lower() for text in bodies]

    #encode lowered bodies
    print(f"Encoding {file_name}set for {subset} subset...")
    #presumably yields one mean-pooled vector per body - confirm against the simpletransformers docs
    word_vectors = model.encode_sentences(lower_bodies, combine_strategy='mean')

    #subset-specific copies: these are the <file_name>_X.npy / <file_name>_y.npy files
    if subset_dir is not None:
        np.save(f"{subset_dir}/{file_name}_X", word_vectors)
        np.save(f"{subset_dir}/{file_name}_y", target)

    #generic copies, named after file_name and subset
    np.save(f"auto_encodings/{file_name}_autoencodings_{subset}", word_vectors)

    np.save(f"auto_encodings/{file_name}_autoencodings_{subset}_target", target)

    #escaped backslash: same printed text, without the invalid '\o' escape sequence
    print("Data saved o/\\o")

    return None

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Texts and labels for the author task
author_bodies = list(author_subset.Body)
authors = list(author_subset.Byline)

# Texts and labels for the domain task: read from domain_subset rather than
# repeating the author data
# NOTE(review): Byline is kept as the label column because the original cell used
# it; it presumably holds author names - confirm which column of domain_subset
# holds the domain label
domain_bodies = list(domain_subset.Body)
domains = list(domain_subset.Byline)

# First 500 author articles, as a small sample
bodies_author_sub = author_bodies[:500]
authors_sub = authors[:500]

In [None]:
# 80/20 split, seeded and stratified on the labels, for the author task
train_X_authors, test_X_authors, train_y_authors, test_y_authors = train_test_split(
    author_bodies,
    authors,
    test_size=0.2,
    random_state=42,
    stratify=authors,
)
# the same split settings for the domain task
train_X_domains, test_X_domains, train_y_domains, test_y_domains = train_test_split(
    domain_bodies,
    domains,
    test_size=0.2,
    random_state=42,
    stratify=domains,
)

In [None]:
# Show the sizes of the author train and test splits
len(train_X_authors), len(test_X_authors)

In [None]:
%%time
# Encode and save the author training split
encode_dataset(train_X_authors, train_y_authors, file_name="train", subset="authors")

In [None]:
%%time
# Encode and save the author test split
encode_dataset(test_X_authors, test_y_authors, file_name="test", subset="authors")

In [None]:
%%time
# Encode and save the domain training split
encode_dataset(train_X_domains, train_y_domains, file_name="train", subset="domains")

In [None]:
%%time
# Encode and save the domain TEST split (the "test" files must not contain the training data)
encode_dataset(test_X_domains, test_y_domains, file_name="test", subset="domains")

### Loading encodings from saved files

In [None]:
def load_autoencodings(path, train_or_test="train"):
    """Load the saved encodings and targets of one split from ``path``.

    Parameters
    ----------
    path : str
        Directory containing ``<train_or_test>_X.npy`` and ``<train_or_test>_y.npy``.
    train_or_test : str
        File-name prefix of the split to load, "train" by default.

    Returns
    -------
    encodings : numpy.ndarray
        Contents of ``<train_or_test>_X.npy``.
    target : numpy.ndarray
        Contents of ``<train_or_test>_y.npy``.

    Raises
    ------
    FileNotFoundError
        If either of the two files is missing from ``path``.
    """
    # both file names are fully determined by the prefix, so load them directly
    # instead of scanning the directory; a missing file then fails with a clear
    # FileNotFoundError rather than an unbound local variable
    encodings = np.load(os.path.join(path, f"{train_or_test}_X.npy"))
    target = np.load(os.path.join(path, f"{train_or_test}_y.npy"))

    return encodings, target

In [None]:
# Load the saved author training encodings and labels
author_X_train, author_y_train = load_autoencodings("auto_encodings/author_encodings", train_or_test="train")

In [None]:
# Load the saved author test encodings and labels
author_X_test, author_y_test = load_autoencodings("auto_encodings/author_encodings", train_or_test="test")

In [None]:
# Load the saved domain training encodings and labels
domain_X_train, domain_y_train = load_autoencodings("auto_encodings/domain_encodings", train_or_test="train")

In [None]:
# Load the saved domain test encodings and labels
domain_X_test, domain_y_test = load_autoencodings("auto_encodings/domain_encodings", train_or_test="test")

## Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Label list: seven classes (0-6) with 100 consecutive entries each
y = [label for label in range(7) for _ in range(100)]
# y = [0]*1000 + [1]*1000 + [2]*1000 + [3]*1000 + [4]*1000 + [5]*1000 +[6]*1000

# train_X, test_X, train_y, test_y = train_test_split(encodings, domains)
# test_y

In [None]:
# Fit a random forest on the training encodings; seeded with the same value as the
# rest of the notebook so the score is reproducible between runs
# NOTE(review): train_encodings / train_target are not assigned in the cells shown
# above - confirm which loaded split (author or domain) they should refer to
rfc = RandomForestClassifier(random_state=42).fit(train_encodings, train_target)

In [None]:
# Score the fitted random forest on the held-out encodings
rfc.score(test_encodings, test_target)

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import *

# Baseline for comparison: a DummyClassifier with its default settings,
# fitted and scored on the same data as the random forest
dum = DummyClassifier().fit(train_encodings, train_target)
dum.score(test_encodings, test_target)

In [None]:
import classifier_unit_test
import matplotlib.pyplot as plt

In [None]:
# Run the project's classifier test helper and keep its result, which is indexed
# below with the keys "tpr", "fpr", "dum_tpr" and "dum_fpr"
# NOTE(review): train_X / test_X / train_y / test_y are only assigned in a later
# cell of this notebook - confirm the intended execution order
tpr_fpr = classifier_unit_test.test_classifier(rfc, train_X, test_X, train_y, test_y, give_roc=True)

In [None]:
# Unpack the four rate series returned by the classifier test
true_pos_rate, false_pos_rate, dum_tpr, dum_fpr = (
    tpr_fpr[key] for key in ("tpr", "fpr", "dum_tpr", "dum_fpr")
)

In [None]:
# ROC plot: false positive rate on the x-axis, true positive rate on the y-axis
plt.plot(false_pos_rate, true_pos_rate, label="Classifier")
# the baseline curve must use the same (fpr, tpr) argument order as the curve above
# NOTE(review): the "dum_" values are presumably the dummy baseline's rates - confirm
plt.plot(dum_fpr, dum_tpr, label="Dummy baseline")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend()
plt.show()

# Test the classifier on the codified vectors

In [None]:
# Load the precomputed "codified" array from three directories up and show its shape
# NOTE(review): presumably one vector per header, since it is indexed by header position below - confirm
codified = np.load('../../../codified.npy')
codified.shape

In [None]:
# Clean the headers: strip surrounding whitespace, drop newlines, carriage returns
# and tabs, and discard anything shorter than 10 characters
texts = list(data.Header.astype(str))
headers = []
for text in tqdm(texts):
    text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
    if len(text) < 10:
        continue
    headers.append(text)

# Collect the headers in a list instead of growing one big string and splitting it
# again; this also avoids ending up with [''] when no header survives the filter.
# final_text is kept as the newline-joined form in case later cells use it.
final_text = '\n'.join(headers)

In [None]:
# Positions (in headers) of the headers that also occur in politiken / information
politiken2_ind = []
information2_ind = []
# kept for compatibility with the original cell; nothing is appended to these
politiken2 = []
information2 = []

# wrap the list (not the enumerate object) so tqdm knows the total
for i, header in enumerate(tqdm(headers)):
    if header in politiken:
        politiken2_ind.append(i)
    elif header in information:
        information2_ind.append(i)

    # stop once both sources have at least 1000 hits; the check has to look at the
    # index lists, since those are the only lists being filled
    if len(politiken2_ind) >= 1000 and len(information2_ind) >= 1000:
        break

In [None]:
# Balance the two classes on the size of the smaller one instead of a hard-coded
# count, so the label list always matches the number of stacked vectors
n_per_class = min(len(politiken2_ind), len(information2_ind))

politiken2_vecs = np.array([codified[i] for i in politiken2_ind[:n_per_class]])
information2_vecs = np.array([codified[i] for i in information2_ind[:n_per_class]])
codified_vecs = np.vstack((politiken2_vecs, information2_vecs))

# label 0 = politiken rows, label 1 = information rows (same order as the vstack)
y = [0]*n_per_class + [1]*n_per_class
# seeded like the other splits in this notebook so the result is reproducible
train_X, test_X, train_y, test_y = train_test_split(codified_vecs, y, random_state=42)

In [None]:
# random.shuffle(y)
# train_X, test_X, train_y, test_y = train_test_split(codified_vecs, y)

In [None]:
import classifier_unit_test

In [None]:
# Fresh, unfitted random forest; seeded like the rest of the notebook for reproducible results
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Run the project's classifier test helper on the codified train/test split
classifier_unit_test.test_classifier(rfc, train_X, test_X, train_y, test_y)

In [None]:
# Display whatever the name auc is bound to
# NOTE(review): no cell shown here assigns auc; it presumably resolves to the
# function pulled in by "from sklearn.metrics import *" - confirm what was meant to be shown
auc

# Generation test

In [None]:
#Encode a fixed sample of articles (rows 9000-9099) together with their headers
article_slice = slice(9000, 9100)
bodies = list(data["Body"])[article_slice]
headers = list(data["Header"])[article_slice]

#Danish BERT representation model, running without CUDA
model = RepresentationModel(model_type='bert', model_name='Maltehb/danish-bert-botxo', use_cuda=False)

#mean-combined encoding of each body
vectors = model.encode_sentences(bodies, combine_strategy='mean')
vectors.shape

In [None]:
# Pair every body vector with its header, then hold out the last 10 pairs for evaluation
pairs = [[vector, header] for vector, header in zip(vectors, headers)]
train_data = pairs[:-10]
eval_data = pairs[-10:]
train_df = pd.DataFrame(train_data, columns=["input", "target"])
eval_df = pd.DataFrame(eval_data, columns=["input", "target"])
train_df.head()

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load the pretrained Marian tokenizer and model "Helsinki-NLP/opus-mt-en-de"
# NOTE(review): this rebinds the name model, which an earlier cell in this section
# set to the RepresentationModel - confirm that overwriting it is intended
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
# keep a handle on the model's input embedding layer
embeddings = model.get_input_embeddings()