# Deep learning for AA - Convolutional Neural Network

## installing libraries

In [None]:
#!pip install pandas numpy matplotlib tqdm keras 

In [None]:
# general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random

# Seed Python's and NumPy's global random generators with the same value so
# that anything drawing from them is repeatable from a fresh kernel.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

### Imports

In [None]:
# Keras / TensorFlow building blocks used by the CNN text classifier
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Conv1D, GRU, Dropout, Dense,Convolution1D

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, BatchNormalization

from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.optimizers import Adagrad

In [None]:
import sklearn
from sklearn import metrics

In [None]:
# Print the version of each core library so the environment of this run is
# recorded next to its results.
library_modules = [
    ('pandas', pd),
    ('numpy', np),
    ('keras', keras),
    ('sklearn', sklearn),
]
for lib_name, module in library_modules:
    print(f'{lib_name}  version {module.__version__}')

pandas  version 1.1.4
numpy  version 1.18.5
keras  version 2.4.3
sklearn  version 0.22.2.post1


## Reading data from google drive

In [None]:
from google.colab import drive

# Mount Google Drive at a fixed location; the paths used below start from it.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Root folder under the mounted drive; the input data and every output file
# written by this notebook are addressed relative to it.
baseDir = '/content/gdrive/My Drive/Colab Notebooks/AA'

In [None]:
# Load the whole corpus from one records-oriented JSON archive.
# NOTE(review): the file name ends in ".zip" but it is read with
# compression='gzip' — confirm which format the archive really uses.
datasets = pd.read_json(
    baseDir + '/data/AllDS.json.zip',
    orient='records',
    compression='gzip',
)

# Add a running row number (the saved predictions are sorted by it) and keep
# only the columns used downstream, in a fixed order.
column_order = ['row_index', 'dataset', 'problem', 'language', 'set', 'filename', 'text', 'label']
datasets = datasets.assign(row_index=np.arange(len(datasets)))[column_order]


*   **row_index**: auxiliary field used to merge the results of the different models.
*   **dataset**: group of documents belonging to the same scenario (social, literature, lyrics, etc.).
*   **problem**: a specific test case, such as 20 authors with short texts or 5 authors with long texts.
*   **set**: `known` is the development set; `unknown` is the validation set. Note that this is not the train-test split, which should be done only within the development set.
*   **filename**: the original filename in the corpus.
*   **label**: the target (the author of the document).


In [None]:
# Preview the first rows to check the column layout.
datasets.head(n=5)

Unnamed: 0,row_index,dataset,problem,language,set,filename,text,label
0,0,pan18_train,problem00001,en,known,known00001.txt,"graceful ones.\n\n""One more,"" Marvelous said, ...",candidate00001
1,1,pan18_train,problem00001,en,known,known00002.txt,"before. If he can, he’ll remember a classmate ...",candidate00001
2,2,pan18_train,problem00001,en,known,known00003.txt,she thought - he was in Team Baron only becaus...,candidate00001
3,3,pan18_train,problem00001,en,known,known00007.txt,"As far as she remembers, she's always hated pr...",candidate00001
4,4,pan18_train,problem00001,en,known,known00006.txt,"“Wait for me, please!”\n\nShe glanced towards ...",candidate00001


In [None]:
# Number of distinct problems in each dataset, shown as a single wide row.
datasets.groupby('dataset')['problem'].nunique().to_frame().T

dataset,lyrics,pan18_eval,pan18_train,socialaa
problem,10,20,10,32


In [None]:
def filter_dataset(df, dataset, problem):
    """Split one (dataset, problem) slice of ``df`` into development and validation arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``dataset``, ``problem``, ``set``,
        ``text``, ``label`` and ``row_index``.
    dataset : str
        Value of the ``dataset`` column to keep.
    problem : str
        Value of the ``problem`` column to keep.

    Returns
    -------
    tuple
        ``(X_train, y_train, index_train, X_test, y_test, index_test)`` where
        the ``*_train`` arrays come from the rows with ``set == 'known'`` and
        the ``*_test`` arrays from the rows with ``set == 'unknown'``.
        ``X_*`` hold the texts, ``y_*`` the labels and ``index_*`` the
        ``row_index`` values.
    """
    # Filter the frame that was passed in rather than a module-level global,
    # and use boolean masks so that names containing quotes cannot break a
    # query string.
    subset = df[(df['dataset'] == dataset) & (df['problem'] == problem)]

    def _arrays_for(set_name):
        # Texts, labels and row ids of one partition of the slice.
        part = subset[subset['set'] == set_name]
        return np.array(list(part['text'])), part['label'].values, part['row_index'].values

    X_train, y_train, index_train = _arrays_for('known')

    # The "unknown" rows are the validation set and must not be used while
    # training.
    X_test, y_test, index_test = _arrays_for('unknown')

    return X_train, y_train, index_train, X_test, y_test, index_test


In [None]:
def build_model(num_tokens, embedding_dim, num_classes=None):
    """Build the (uncompiled) 1-D convolutional text classifier.

    Layers, in order: a trainable embedding, a ``Conv1D`` with 200 filters of
    width 10 (sigmoid activation), global average pooling, 25% dropout and a
    softmax output layer.

    Parameters
    ----------
    num_tokens : int
        Input dimension of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    num_classes : int, optional
        Number of units of the softmax output layer. When omitted, the value
        is derived from the module-level ``y_train`` exactly as before, so
        existing two-argument calls keep working.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    from tensorflow.keras import layers

    if num_classes is None:
        # Backward-compatible fallback: relies on a global ``y_train`` having
        # been set by the caller. Prefer passing ``num_classes`` explicitly.
        num_classes = len(np.unique(y_train))

    model = Sequential()
    model.add(Embedding(num_tokens, embedding_dim, trainable=True))
    model.add(layers.Conv1D(200, 10, activation="sigmoid", kernel_initializer='he_uniform'))
    model.add(layers.GlobalAvgPool1D())
    model.add(Dropout(0.25))
    model.add(Dense(num_classes, activation='softmax', kernel_initializer='he_uniform'))
    return model

In [None]:
def fit_dl_model(X_train,
                 X_test,
                 y_train_index,
                 y_test_index,
                 vocab_size = 10000, max_length = 1000, embedding_dim = 150):
    """Train the CNN on the development texts and score both sets.

    Parameters
    ----------
    X_train, X_test : sequence of str
        Raw development and validation texts.
    y_train_index : sequence of int
        Integer class id of every development text.
    y_test_index : sequence of int
        Integer class id of every validation text. Accepted for interface
        compatibility only; it is deliberately not used, so the validation
        set cannot influence training.
    vocab_size : int
        Maximum vocabulary size of the text vectorizer.
    max_length : int
        Length every token sequence is padded or truncated to.
    embedding_dim : int
        Embedding size passed to ``build_model``.

    Returns
    -------
    tuple
        ``(proba_train, proba_test)``: the model's predicted class
        probabilities for ``X_train`` and ``X_test``, in the original row
        order of the inputs.
    """
    # Fit the vocabulary on the development texts only.
    vectorizer = TextVectorization(max_tokens=vocab_size, output_sequence_length=max_length)
    text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
    vectorizer.adapt(text_ds)

    num_tokens = len(vectorizer.get_vocabulary()) + 2

    # Turn every raw string into a fixed-length sequence of token ids.
    x_train = vectorizer(np.array([[s] for s in X_train])).numpy()
    x_val = vectorizer(np.array([[s] for s in X_test])).numpy()

    y_train = np.array(y_train_index)

    # Fix the seeds so that weight initialisation and training are repeatable.
    tf.random.set_seed(2)
    np.random.seed(2)

    model = build_model(num_tokens, embedding_dim)

    my_callbacks = [
        tf.keras.callbacks.EarlyStopping(patience=50),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=3, min_lr=0.001),
    ]

    optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["acc"])

    # Keras takes `validation_split` from the END of the arrays, before any
    # shuffling. The corpus rows appear to be ordered by author, so without
    # this permutation the last authors would be held out entirely and never
    # seen during training. A private generator is used so the global NumPy
    # stream seeded above is left untouched.
    shuffle_order = np.random.RandomState(2).permutation(len(x_train))

    model.fit(x_train[shuffle_order], y_train[shuffle_order],
              batch_size=int(len(x_train)/2+1), epochs=300,
              callbacks=my_callbacks,
              validation_split=0.2,
              verbose = 0,
    )

    # Predictions are made on the unshuffled arrays so that they line up with
    # the caller's row indices.
    return model.predict(x_train), model.predict(x_val)


In [None]:
predictions = []
classification_reports = []


def appendPrediction(predictions, classes_, index, pred, proba):
    """Append one record per document to ``predictions``.

    Each record holds the current ``dataset`` and ``problem`` (read from the
    enclosing loop variables at call time), the model name, the document's
    row index, the predicted label and one probability entry per class.
    """
    for row_id, label, probabilities in zip(index, pred, proba):
        identity = {
            'dataset': dataset,
            'problem': problem,
            'model': 'CNN',
            'row_index': row_id,
            'pred': label,
        }
        per_class = dict(zip(classes_, probabilities))
        predictions.append(dict(**identity, **per_class))


# Train and evaluate one model per (dataset, problem) pair.
for dataset in datasets['dataset'].unique():
    problems_of_dataset = datasets.query(f"dataset == '{dataset}'")['problem'].unique()
    for problem in problems_of_dataset:
        print(dataset, problem)

        X_train, y_train, index_train, X_test, y_test, index_test = filter_dataset(datasets, dataset, problem)

        # Map every label to its position in the sorted array of classes
        # seen in the development set.
        class_names = np.unique(y_train)
        class_index = {name: position for position, name in enumerate(class_names)}
        y_train_index = [class_index[name] for name in y_train]
        y_test_index = [class_index[name] for name in y_test]

        pred_proba_train, pred_proba_test = fit_dl_model(X_train, X_test, y_train_index, y_test_index)

        # The predicted label is the class with the highest probability.
        pred_train = list(class_names[pred_proba_train.argmax(axis=1)])
        pred_test = list(class_names[pred_proba_test.argmax(axis=1)])

        # Keep the predictions for later analysis.
        appendPrediction(predictions, class_names, index_train, pred_train, pred_proba_train)
        appendPrediction(predictions, class_names, index_test, pred_test, pred_proba_test)

        class_report = metrics.classification_report(y_test, pred_test, output_dict=True)

        report_header = {
            'dataset': dataset,
            'problem': problem,
            'model': 'CNN',
            'classification_report': class_report,
        }
        classification_reports.append(dict(**report_header, **class_report['macro avg']))

pan18_train problem00001


  _warn_prf(average, modifier, msg_start, len(result))


pan18_train problem00002
pan18_train problem00003
pan18_train problem00004
pan18_train problem00005
pan18_train problem00006
pan18_train problem00007
pan18_train problem00008
pan18_train problem00009
pan18_train problem00010
pan18_eval problem00001
pan18_eval problem00002
pan18_eval problem00003
pan18_eval problem00004
pan18_eval problem00005
pan18_eval problem00006
pan18_eval problem00007
pan18_eval problem00008
pan18_eval problem00009
pan18_eval problem00010
pan18_eval problem00011
pan18_eval problem00012
pan18_eval problem00013
pan18_eval problem00014
pan18_eval problem00015
pan18_eval problem00016
pan18_eval problem00017
pan18_eval problem00018
pan18_eval problem00019
pan18_eval problem00020
lyrics problem00001
lyrics problem00002
lyrics problem00003
lyrics problem00004
lyrics problem00005
lyrics problem00006
lyrics problem00007
lyrics problem00008
lyrics problem00009
lyrics problem00010
socialaa problem00001
socialaa problem00002
socialaa problem00003
socialaa problem00004
sociala

In [None]:
# Write every collected prediction, ordered by row index and rounded to five
# decimals, to a zipped CSV file.
predictions_path = baseDir + '/DL/output_dl/cnn_predictions.csv.zip'
predictions_sorted = pd.DataFrame(predictions).sort_values('row_index').round(5)
predictions_sorted.to_csv(predictions_path, index=False, compression='zip', encoding='utf-8')

### predictions 

* The predictions file contains one row per dataset | problem | instance for each model (only `CNN` in this notebook), with one probability column per candidate.
* `model` is the name of the model pipeline.
* `pred` is the label with the highest probability.

In [None]:
# Show the collected predictions, rounded and ordered by row index.
(
    pd.DataFrame(predictions)
    .round(5)
    .sort_values('row_index')
)

Unnamed: 0,dataset,problem,model,row_index,pred,candidate00001,candidate00002,candidate00003,candidate00004,candidate00005,candidate00006,candidate00007,candidate00008,candidate00009,candidate00010,candidate00011,candidate00012,candidate00013,candidate00014,candidate00015,candidate00016,candidate00017,candidate00018,candidate00019,candidate00020,candidate00021,candidate00022,candidate00023,candidate00024,candidate00025,candidate00026,candidate00027,candidate00028,candidate00029,candidate00030,candidate00031,candidate00032,candidate00033,candidate00034,candidate00035,candidate00036,candidate00037,candidate00038,candidate00039,candidate00040,candidate00041,candidate00042,candidate00043,candidate00044,candidate00045,candidate00046,candidate00047,candidate00048,candidate00049,candidate00050
0,pan18_train,problem00001,CNN,0,candidate00001,0.96244,0.00164,0.00049,0.00585,0.00530,0.00125,0.00112,0.00152,0.00113,0.00237,0.00041,0.00626,0.00078,0.00083,0.00013,0.00844,0.00001,0.00001,0.00000,0.00002,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,pan18_train,problem00001,CNN,1,candidate00001,0.97175,0.00121,0.00034,0.00384,0.00302,0.00093,0.00049,0.00076,0.00202,0.00098,0.00064,0.00308,0.00089,0.00059,0.00008,0.00936,0.00000,0.00001,0.00000,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,pan18_train,problem00001,CNN,2,candidate00001,0.96557,0.00084,0.00052,0.00606,0.00430,0.00119,0.00079,0.00176,0.00101,0.00153,0.00026,0.00570,0.00081,0.00064,0.00006,0.00893,0.00000,0.00001,0.00000,0.00002,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,pan18_train,problem00001,CNN,3,candidate00001,0.92891,0.00659,0.00068,0.00926,0.00651,0.00291,0.00181,0.00114,0.00352,0.00701,0.00233,0.00515,0.00184,0.00271,0.00090,0.01862,0.00002,0.00003,0.00001,0.00005,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,pan18_train,problem00001,CNN,4,candidate00001,0.94418,0.00694,0.00064,0.00651,0.00542,0.00328,0.00213,0.00118,0.00177,0.00547,0.00135,0.00646,0.00097,0.00180,0.00087,0.01093,0.00001,0.00003,0.00001,0.00005,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30961,socialaa,problem00032,CNN,30961,candidate00022,0.02370,0.02568,0.02394,0.02659,0.02554,0.02463,0.02615,0.02627,0.02444,0.02391,0.02744,0.02716,0.02674,0.02628,0.02194,0.02600,0.02494,0.02563,0.02344,0.02451,0.02406,0.02820,0.02553,0.02279,0.02465,0.02637,0.02403,0.02518,0.02548,0.02504,0.02464,0.02328,0.02564,0.02399,0.02464,0.02169,0.02582,0.02424,0.02428,0.02293,0.00038,0.00016,0.00015,0.00025,0.00010,0.00026,0.00025,0.00052,0.00021,0.00033
30962,socialaa,problem00032,CNN,30962,candidate00030,0.02726,0.02700,0.02051,0.02175,0.02276,0.02678,0.02356,0.02418,0.02538,0.02131,0.02369,0.02350,0.02782,0.02485,0.01684,0.02374,0.02613,0.02299,0.02719,0.02757,0.02723,0.02365,0.02410,0.02653,0.02177,0.02542,0.02779,0.02604,0.02642,0.02822,0.02567,0.02426,0.02720,0.02710,0.02596,0.02427,0.02004,0.02721,0.02697,0.02630,0.00043,0.00020,0.00019,0.00031,0.00012,0.00030,0.00029,0.00056,0.00025,0.00038
30963,socialaa,problem00032,CNN,30963,candidate00022,0.02444,0.02508,0.02447,0.02571,0.02565,0.02448,0.02643,0.02589,0.02441,0.02381,0.02641,0.02644,0.02577,0.02597,0.02219,0.02591,0.02470,0.02550,0.02406,0.02474,0.02370,0.02772,0.02586,0.02311,0.02461,0.02529,0.02476,0.02486,0.02501,0.02508,0.02485,0.02380,0.02529,0.02495,0.02436,0.02336,0.02497,0.02562,0.02441,0.02361,0.00039,0.00017,0.00016,0.00026,0.00011,0.00027,0.00026,0.00054,0.00021,0.00035
30964,socialaa,problem00032,CNN,30964,candidate00013,0.02718,0.02791,0.01989,0.02267,0.02236,0.02699,0.02308,0.02449,0.02484,0.02066,0.02463,0.02366,0.02914,0.02519,0.01669,0.02374,0.02712,0.02290,0.02662,0.02757,0.02788,0.02512,0.02345,0.02636,0.02125,0.02645,0.02713,0.02593,0.02722,0.02851,0.02495,0.02374,0.02787,0.02600,0.02646,0.02222,0.02099,0.02574,0.02678,0.02556,0.00043,0.00020,0.00019,0.00031,0.00012,0.00031,0.00029,0.00055,0.00025,0.00038


### Classification Reports

In [None]:
# Write the per-problem classification reports to a zipped JSON file.
reports_path = baseDir + '/DL/output_dl/cn_classification_reports.json.zip'
reports_frame = pd.DataFrame(classification_reports)
reports_frame.to_json(reports_path, orient='records', compression='zip')

In [None]:
import re
def statistics(x):
    """Summarise one group of corpus rows.

    ``x`` is a frame with the columns ``set``, ``label``, ``filename`` and
    ``text``. The returned Series contains:

    * ``ndocs``    - mean number of distinct filenames per label among the
      ``known`` rows, truncated to an integer;
    * ``nauthors`` - number of distinct labels in ``x``;
    * ``nchar``    - mean text length (in characters) of the ``unknown`` rows,
      rounded down to a multiple of 10;
    * ``nword``    - mean word count of the ``unknown`` rows, rounded up to a
      multiple of 5.
    """
    known_docs = x[x['set'] == 'known']
    unknown_texts = x.loc[x['set'] == 'unknown', 'text']

    def count_words(text):
        # A text without a single word still counts as one word.
        return max(len(re.findall(r'\b\w+\b', text)), 1)

    docs_per_author = known_docs.groupby('label')['filename'].nunique()
    mean_chars = int(unknown_texts.map(len).mean())
    mean_words = unknown_texts.map(count_words).mean()

    return pd.Series({
        'ndocs': int(docs_per_author.mean()),
        'nauthors': len(pd.unique(x['label'])),
        'nchar': mean_chars - mean_chars % 10,
        'nword': 5 * int(np.ceil(mean_words / 5)),
    })

# One row of summary statistics per (dataset, problem, language) group.
group_keys = ['dataset', 'problem', 'language']
metadata = datasets.groupby(group_keys).apply(statistics).reset_index()

In [None]:
# F1 score of the CNN (the `f1-score` field stored with each report) for
# every (dataset, problem) pair.
temp = (
    pd.DataFrame(classification_reports)
    .pivot_table(
        index=['dataset', 'problem'],
        columns='model',
        values='f1-score')[['CNN']]
)

# The same scores joined with the per-problem statistics in `metadata`.
temp2 = temp.reset_index().merge(metadata)

with open(baseDir+'/DL/output_dl/cn_report.txt','w') as f:
    # First table: one row per problem.
    f.write(temp.round(2).to_latex())

    # Then one table per breakdown column: mean score by dataset and column.
    for breakdown in ['language', 'ndocs', 'nauthors', 'nchar', 'nword']:
        f.write("\n\n")
        # Select the score column BEFORE averaging: taking the mean of the
        # whole frame also touches the text columns, which newer pandas
        # versions reject instead of silently dropping.
        summary = temp2.groupby(['dataset', breakdown])[['CNN']].mean().reset_index()
        f.write(summary.round(2).to_latex(index=False))


In [None]:
# Show the score table, rounded to four decimals, with up to 100 rows visible.
rounded_scores = temp.round(4)
with pd.option_context("display.max_rows", 100):
    display(rounded_scores)

Unnamed: 0_level_0,model,CNN
dataset,problem,Unnamed: 2_level_1
lyrics,problem00001,0.524
lyrics,problem00002,0.2347
lyrics,problem00003,0.1641
lyrics,problem00004,0.0923
lyrics,problem00005,0.0711
lyrics,problem00006,0.3042
lyrics,problem00007,0.2092
lyrics,problem00008,0.1788
lyrics,problem00009,0.1738
lyrics,problem00010,0.1225
