# NLP 

In [2]:
import os
import numpy as np
import pandas as pd
from termcolor import colored
import seaborn as sns
import matplotlib.pyplot as plt
import re

Este notebook contiene de nuevo el pipeline de preprocesado con el fin de extraer los valores para el dato title.

In [None]:
corpus_df = pd.read_excel('./projects.xlsx')
corpus_df.head()

In [None]:
corpus_df_labels = pd.read_excel('./SciVocCodes.xlsx')
corpus_df_labels.head()

Unnamed: 0,full_path,title,code,full_code
0,/agricultural sciences/agricultural biotechnol...,livestock cloning,,/27/79/483/
1,/agricultural sciences/agricultural biotechnol...,plant cloning,1275.0,/27/79/483/1275
2,/agricultural sciences/agricultural biotechnol...,agricultural genetics,483.0,/27/79/483
3,/agricultural sciences/agricultural biotechnol...,biomass,481.0,/27/79/481
4,/agricultural sciences/agricultural biotechnol...,marker assisted selection,487.0,/27/79/487


In [124]:
corpus_df_labels['first_code'] = corpus_df_labels.apply(lambda row: re.findall(r'\d+', str(row.full_code))[0], axis = 1)
corpus_df_labels.head()

Unnamed: 0,full_path,title,code,full_code,first_code
0,/agricultural sciences/agricultural biotechnol...,livestock cloning,,/27/79/483/,27
1,/agricultural sciences/agricultural biotechnol...,plant cloning,1275.0,/27/79/483/1275,27
2,/agricultural sciences/agricultural biotechnol...,agricultural genetics,483.0,/27/79/483,27
3,/agricultural sciences/agricultural biotechnol...,biomass,481.0,/27/79/481,27
4,/agricultural sciences/agricultural biotechnol...,marker assisted selection,487.0,/27/79/487,27


Drop NA

In [125]:
corpus_df = corpus_df.dropna(subset=['euroSciVocCode'])
corpus_df = corpus_df.reset_index()

Add columns

In [126]:
def main_theme(row):
  results = re.findall(r'\d+', row.euroSciVocCode)
  return int(corpus_df_labels.loc[corpus_df_labels['code'] == int(results[0])].first_code.values[0])

In [127]:
def rest_of_themes(row):
  results = re.findall(r'\d+', row.euroSciVocCode)
  x = []
  for i in range(1,len(results)):
    new_theme = int(corpus_df_labels.loc[corpus_df_labels['code'] == int(results[i])].first_code.values[0])
    if new_theme != row.main_theme and new_theme not in x:
      x.append(new_theme)
  return x

In [128]:
corpus_df['main_theme'] = corpus_df.apply(main_theme, axis=1)
corpus_df['rest_themes'] = corpus_df.apply(rest_of_themes, axis=1)

# Pipeline de preprocesado

Necessary imports

In [519]:
import nltk
packages = ['punkt', 'stopwords', 'omw-1.4', 'wordnet']
for package in packages:
    try:
        nltk.data.find('tokenizers/' + package)
    except LookupError:
        nltk.download(package)
try:
  import lxml
except ModuleNotFoundError:
  %pip install lxml
try:
  import contractions
except ModuleNotFoundError:
  %pip install contractions
  import contractions
from bs4 import BeautifulSoup
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wnl = WordNetLemmatizer()
stopwords_en = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/pita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/pita/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [520]:
# To wrap long text lines

from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
  
get_ipython().events.register('pre_run_cell', set_css)

Cleaning functions

In [521]:
def wrangle_text(text):

    '''
    Eliminates contractions and html markers.
    '''

    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()
    text = re.sub(r'https://\S+|www\.\S+', '', text)
    text = contractions.fix(text)

    return text


def prepare_text(text):

    '''
    Preprocessing pipeline.
    '''

    text = wrangle_text(text)
    tokens = wordpunct_tokenize(text)
    tokens_filtered = [word.lower() for word in tokens if word.isalnum()]
    lemmatized_text = [wnl.lemmatize(word) for word in tokens_filtered]
    clean = [word for word in lemmatized_text if word not in stopwords_en]
    
    return clean

In [572]:
corpus_df['title&summary'] = corpus_df['title'] + '. ' + corpus_df['summary']
corpus_df['clean_summary'] = corpus_df['summary'].apply(prepare_text)
corpus_df['clean_title'] = corpus_df['title'].apply(prepare_text)
corpus_df['clean_title&summary'] = corpus_df['title&summary'].apply(prepare_text)

N-grams

In [None]:
! pip install gensim==4.2.0


Collecting gensim==4.2.0
  Downloading gensim-4.2.0-cp38-cp38-macosx_10_9_x86_64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 3.2 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.0 MB/s eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-6.3.0


In [566]:
from gensim.models.phrases import Phrases
def ngrams(corpus, phrase_model): 
    corpus = [el for el in phrase_model[corpus]]
    return corpus

In [576]:
phrase_model = Phrases(corpus_df['clean_title&summary'], min_count=2)
corpus_df['clean_summary'] = ngrams(corpus_df.clean_summary, phrase_model)
corpus_df['clean_title'] =  ngrams(corpus_df.clean_title,phrase_model)
corpus_df['clean_title&summary'] =  ngrams(corpus_df['clean_title&summary'],phrase_model)

In [None]:
corpus_df.to_csv('./cleaned_data.csv',index=False)

# Representación vectorial Word Embeddings 

## FastText Pretrained Full FB model

In [3]:
corpus_df = pd.read_csv('./cleaned_data.csv')
corpus_df.head(2)

Unnamed: 0,projectID,acronym,title,summary,startDate,endDate,totalCost,ecMaxContribution,masterCall,subCall,...,coordinatorCountry,euroSciVocCode,publicationID,patentID,main_theme,rest_themes,clean_summary,clean_title,title&summary,clean_title&summary
0,115843,EbolaMoDRAD,Ebola Virus: Modern Approaches for developing ...,The current Ebola Virus Disease (EVD) outbreak...,2015-02-01,2018-01-31,4300935.0,4300935.0,H2020-JTI-IMI2-2014-02-single-stage,H2020-JTI-IMI2-2014-02-single-stage,...,SE,"[155, 56306972, 325, 137, 1609]","['115843_202840_PUBLI', '115843_202838_PUBLI',...",,21,[23],"['current', 'ebola_virus', 'disease_evd', 'out...","['ebola_virus', 'modern', 'approach', 'develop...",Ebola Virus: Modern Approaches for developing ...,"['ebola_virus', 'modern', 'approach', 'develop..."
1,115910,PERISCOPE,PERtussIS COrrelates of Protection Europe - So...,Pertussis vaccines have been very successful i...,2016-03-01,2022-08-31,29926687.0,21000000.0,H2020-JTI-IMI2-2015-03-two-stage,H2020-JTI-IMI2-2015-03-two-stage,...,NL,"[137, 1439, 44109686, 48479582]","['115910_1008742_PUBLI', '115910_629396_PUBLI'...",,21,[],"['pertussis_vaccine', 'successful', 'reducing'...","['pertussis', 'correlate_protection', 'europe'...",PERtussIS COrrelates of Protection Europe - So...,"['pertussis', 'correlate_protection', 'europe'..."


In [None]:
#! pip install fasttext
#fasttext.util.download_model('en', if_exists='ignore')

In [8]:
import fasttext
import fasttext.util
import ast

In [588]:
ft = fasttext.load_model('./data.nosync/cc.en.300.bin')



In [602]:
def convert_to_embedding(data):
  final_data = []
  for document in data: 
    embedding = []
    count = 0
    if (type(document) != list): document = ast.literal_eval(document)
    for word in document:
      word_embedding = ft.get_word_vector(word)
      if len(embedding)==0: 
        embedding = word_embedding 
      else: embedding += word_embedding
      count += 1
    embedding /= count
    final_data.append(embedding.tolist())
  return pd.Series(final_data)

In [47]:
corpus_df['pretrained_fasttext'] = convert_to_embedding(corpus_df['clean_summary'])
corpus_df['pretrained_fasttext_title'] = convert_to_embedding(corpus_df['clean_title'])
corpus_df['pretrained_fasttext_title&summary'] = convert_to_embedding(corpus_df['clean_title&summary'])

In [607]:
corpus_df.to_csv('./cleaned_data_fasttext_full.csv',index=False)

## Fastext Pretrained CompressedFastText

Esta forma de implementación quedó descartada al ser una version de fasttext muy ligera y dejar fuera representaciones de palabras muy importantes como 5g, 4g o Wifi. 

In [None]:
! pip install gensim==4.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.2.0
  Using cached gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
try:
  import compress_fasttext
except ModuleNotFoundError:
  ! pip install compress-fasttext
  import compress_fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting compress-fasttext
  Downloading compress-fasttext-0.1.3.tar.gz (14 kB)
Building wheels for collected packages: compress-fasttext
  Building wheel for compress-fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for compress-fasttext: filename=compress_fasttext-0.1.3-py3-none-any.whl size=14601 sha256=7c8a3b669000f1e464cea3cd5842510d7eacca870dc031829cac6a72a649b339
  Stored in directory: /root/.cache/pip/wheels/c7/63/9f/39db0410175167cee5eeae4fde2405d957cd05c1d8811a51cf
Successfully built compress-fasttext
Installing collected packages: compress-fasttext
Successfully installed compress-fasttext-0.1.3


In [None]:
#with open("cleaned_summary.txt", 'w', encoding='utf-8') as fout:
#  for el in corpus_df['clean_summary'].values.tolist():
#    fout.write(' '.join(el) + '\n')

In [None]:
# fastTextPre = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
#     'https://github.com/avidale/compress-fasttext/releases/download/v0.0.4/cc.en.300.compressed.bin'
# )

## Fastext Custom Cleaned Dataset

In [608]:
# with open("cleaned_title&summary.txt", 'w', encoding='utf-8') as fout:
#   for el in corpus_df['clean_title&summary'].values.tolist():
#     fout.write(' '.join(el) + '\n')

# Esta sentencia es necesaria una vez para entrenar fast text

In [84]:
from gensim.models import FastText
import copy

class IterableCorpus_fromfile:
    def __init__(self, filename):
        self.__filename = filename
    def __iter__(self):
        for line in open(self.__filename):
            yield line.lower().split()

model_fasttext = FastText(sentences=IterableCorpus_fromfile("cleaned_summary.txt"), vector_size=300, min_count = 5, window=5, workers=4, seed=42, sg=1)

In [85]:
model_fasttext.save("model_fastText_cleansummary.model")
word_vectors = model_fasttext.wv
word_vectors.save("model_fastText_cleansummary.wordvectors")

In [155]:
from gensim.models import KeyedVectors
fastText_wv = KeyedVectors.load("model_fastText_cleansummary.wordvectors", mmap='r')

In [44]:
def convert_to_embedding_ff(data):
  final_data = []
  for document in data: 
    embedding = []
    count = 0
    if (type(document) != list): document = ast.literal_eval(document)
    for word in document:
      word_embedding = fastText_wv.get_vector(word).copy()
      word_embedding.setflags(write=True)
      if len(embedding)==0: 
        embedding = word_embedding 
      else: embedding += word_embedding
      count += 1
    embedding /= count
    final_data.append(embedding.tolist())
    del embedding
  return pd.Series(final_data)

In [177]:
corpus_df['custom_trained_fasttext'] = convert_to_embedding_ff(corpus_df['clean_summary'])

In [610]:
model_fasttext_ts = FastText(sentences=IterableCorpus_fromfile("cleaned_title&summary.txt"), vector_size=300, min_count = 5, window=5, workers=4, seed=42, sg=1)

In [611]:
model_fasttext.save("model_fastText_cleantitle&summary.model")
word_vectors = model_fasttext.wv
word_vectors.save("model_fastText_cleantitle&summary.wordvectors")

In [42]:
from gensim.models import KeyedVectors
fastText_wv = KeyedVectors.load("model_fastText_cleantitle&summary.wordvectors", mmap='r')

In [45]:
corpus_df['custom_trained_fasttext_title'] = convert_to_embedding_ff(corpus_df['clean_title'])
corpus_df['custom_trained_fasttext_title&summary'] = convert_to_embedding_ff(corpus_df['clean_title&summary'])

In [47]:
corpus_df.to_csv('./cleaned_data_fasttext_all.csv',index=False)

# MLP Single Label

In [5]:
corpus_df = pd.read_csv('./cleaned_data_fasttext_all.csv')


In [9]:
corpus_df['pretrained_fasttext'] = corpus_df['pretrained_fasttext'].apply(ast.literal_eval)
corpus_df['custom_trained_fasttext'] = corpus_df['custom_trained_fasttext'].apply(ast.literal_eval)
corpus_df['pretrained_fasttext_title'] = corpus_df['pretrained_fasttext_title'].apply(ast.literal_eval)
corpus_df['pretrained_fasttext_title&summary'] = corpus_df['pretrained_fasttext_title&summary'].apply(ast.literal_eval)
corpus_df['custom_trained_fasttext_title'] = corpus_df['custom_trained_fasttext_title'].apply(ast.literal_eval)
corpus_df['custom_trained_fasttext_title&summary'] = corpus_df['custom_trained_fasttext_title&summary'].apply(ast.literal_eval)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim 
torch.set_printoptions(precision=10)
import ast
import time
from tabulate import tabulate


In [11]:
def themenumber_to_vector(number):
    tensor = torch.zeros(6)
    if number == 21: tensor[0] = 1
    elif number == 23: tensor[1] = 1
    elif number == 27: tensor[2] = 1
    elif number == 25: tensor[3] = 1
    elif number == 29: tensor[4] = 1
    elif number == 31: tensor[5] = 1
    return tensor

def position_to_number(position):
  if position == 0: return 21
  elif position == 1: return 23
  elif position == 2: return 27
  elif position == 3: return 25
  elif position == 4: return 29 
  elif position == 5: return 31


In [12]:
class MLP_main(nn.Module):    

  def __init__(self,dimx, hidden_dim, nlabels, dropout_prob):
      super().__init__()

      self.hidden_dim = hidden_dim
      self.fc_layers = nn.ModuleList()
      self.fc_layers.append(nn.Linear(dimx,hidden_dim[0]))
      for index, value in enumerate(hidden_dim[1:]):
        self.fc_layers.append(nn.Linear(hidden_dim[index], hidden_dim[index + 1]))
      self.fc_layers.append(nn.Linear(hidden_dim[-1],nlabels))
      self._parameters
      self.relu = nn.ReLU()
      self.logsoftmax = nn.Softmax(dim = 0)
      
      self.dropout = nn.Dropout(p = dropout_prob)

  def forward(self,x):
        for layer in self.fc_layers[:-1]:
          x = layer(x)
          x = self.relu(x)
          x = self.dropout(x)
        x = self.fc_layers[-1](x)
        x = self.logsoftmax(x)
        return x

In [13]:
class EarlyStopping:

    def __init__(self, patience, verbose, path ,delta=0.001 ):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):

        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose: print('Validation has reduced from {} to {}. Saving model ...'.format(self.val_loss_min,val_loss))
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [14]:
class MLP(MLP_main):
  
  def __init__(self,dimx, hidden_dim, nlabels,epochs,dropout_prob, name, lr = 0.001):
    super().__init__(dimx, hidden_dim, nlabels, dropout_prob)

    self.epochs = epochs
    self.lr = lr
    self.optim = optim.Adam(self.parameters(), self.lr)
    self.criterion = nn.CrossEntropyLoss() 
    self.loss_during_training = []
    self.valid_loss_during_training = []
    self.name = name + str(self.hidden_dim) + '_' + str(round(time.time()))
    self.path = ''
    self.last_epoch = -1

  def trainloop(self, train_vectors, train_labels, valid_vectors, valid_labels, patience = 20):

    self.path = './best_checkpoint_' + self.name + '.pt'
    early_stopping = EarlyStopping(patience = patience, verbose = True ,path = self.path)
    print('Training: {}'.format(self.name))
    
    for e in range(0,int(self.epochs)):
      self.last_epoch = e + 1
      running_loss = 0.

      self.train()

      classes_predicted = []

      for summary, label in zip(train_vectors, train_labels):

        self.optim.zero_grad()
       
        output = self.forward(summary)
        class_predited = position_to_number(np.argmax(output.detach().numpy()))
        if class_predited not in classes_predicted: classes_predicted.append(class_predited)
        # print('Vector predicted{}, realclass {}, real vector{}'.format(output,label,themenumber_to_vector(label)))
        loss = self.criterion(output,themenumber_to_vector(label))

        loss.backward()

        self.optim.step()

        running_loss+=loss.item()

      self.loss_during_training.append(running_loss/len(train_vectors))

      self.compute_validation_loss(valid_vectors, valid_labels)

      if(True):

          print("Training loss after {}  epochs: {}, classes {}".format(e,self.loss_during_training[-1], classes_predicted))
          print("Validation loss after {} epochs: {}".format(e,self.valid_loss_during_training[-1]))

      early_stopping(self.valid_loss_during_training[-1],self)
      if early_stopping.early_stop:
        print("Early stopping")
        print("Loading best epoch")
        self.load_state_dict(torch.load(self.path))
        break
              
  def compute_validation_loss(self,valid_vectors, valid_labels):
      total_length = 0
      self.eval()
      with torch.no_grad():

        validation_loss = 0.

        for sumary, label in zip(valid_vectors, valid_labels):
        
          output = self.forward(sumary)

          loss = self.criterion(output,themenumber_to_vector(label))
          
          validation_loss += loss.item()

        self.valid_loss_during_training.append(validation_loss/len(valid_vectors))
  

  def evaluate(self,test_vectors, test_labels, test_rest_labels):
        # print('Evaluating: {} '.format(self.path))
        accuracy = 0
        real_accuracy = 0
        self.eval()
        with torch.no_grad():
          classes_predicted = []
          for sumary, label, rest_labels in zip(test_vectors,test_labels, test_rest_labels):
            logprobs = self.forward(sumary)
            # print('probs {}'.format(logprobs))
            class_predicted = position_to_number(np.argmax(logprobs))
            # print('class predicted {}, real_class {}, rest labels {}'.format(class_predicted,label, rest_labels))
            if class_predicted not in classes_predicted: classes_predicted.append(class_predicted)
            if label == class_predicted: 
              accuracy += 1
              real_accuracy += 1
            elif class_predicted in ast.literal_eval(rest_labels): 
              real_accuracy +=1
          return accuracy/len(test_vectors), real_accuracy/len(test_vectors), classes_predicted

In [15]:
data = corpus_df.copy()

In [51]:
data.groupby('main_theme').count().sort_values(['projectID'], ascending=False)['projectID']

main_theme
23    12523
25     6624
29     5962
21     5143
31     1154
27      646
Name: projectID, dtype: int64

In [52]:
data.groupby('rest_themes').count().sort_values(['projectID'], ascending=False).head(10)['projectID']

rest_themes
[]          15050
[23]         4980
[25]         3138
[21]         1810
[29]         1620
[31]          724
[23, 25]      550
[27]          520
[25, 23]      444
[23, 29]      290
Name: projectID, dtype: int64

In [16]:
train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.7*len(data)), int(.85*len(data))])
train.reset_index(drop=True,inplace=True)
validate.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

In [54]:
train.groupby('main_theme').count().sort_values(['projectID'], ascending=False).head(10)['projectID']

main_theme
23    8779
25    4619
29    4134
21    3623
31     813
27     468
Name: projectID, dtype: int64

In [17]:
c23 = train[train['main_theme'] == 23]
c25 = train[train['main_theme'] == 25]
c29 = train[train['main_theme'] == 29]
c21 = train[train['main_theme'] == 21]
c31 = train[train['main_theme'] == 31]
c27 = train[train['main_theme'] == 27]

maxvalue = len(c27)
uc23 = c23.sample(maxvalue)
uc25 = c25.sample(maxvalue)
uc29 = c29.sample(maxvalue)
uc21 = c21.sample(maxvalue)
uc31 = c31.sample(maxvalue)

minvalue = int(len(c23) * 0.8)
oc23 = c23.sample(minvalue)
oc25 = c25.sample(minvalue, replace=True)
oc29 = c29.sample(minvalue, replace=True)
oc21 = c21.sample(minvalue, replace=True)
oc31 = c31.sample(minvalue, replace=True)
oc27 = c27.sample(minvalue, replace=True)

undersampled_train = pd.concat([uc23,uc25,uc29,uc21,uc31,c27])
undersampled_train.reset_index(drop = True,inplace = True)
oversampled_train = pd.concat([oc23,oc25,oc29,oc21,oc31,oc27])


undersampled_train = undersampled_train.sample(frac = 1)
undersampled_train.reset_index(inplace=True,drop=True)
oversampled_train = oversampled_train.sample(frac = 1)
oversampled_train.reset_index(inplace=True,drop=True)
train.reset_index(inplace=True,drop=True)

In [56]:
undersampled_train.groupby('main_theme').count().sort_values(['projectID'], ascending=False)['projectID']

main_theme
21    468
23    468
25    468
27    468
29    468
31    468
Name: projectID, dtype: int64

In [57]:
oversampled_train.groupby('main_theme').count().sort_values(['projectID'], ascending=False)['projectID']

main_theme
21    7023
23    7023
25    7023
27    7023
29    7023
31    7023
Name: projectID, dtype: int64

### Training Just Summary

In [28]:
train_fb = torch.tensor(train.pretrained_fasttext)
train_custom = torch.tensor(train.custom_trained_fasttext)
train_labels = torch.tensor(train.main_theme)

train_under_fb = torch.tensor(undersampled_train.pretrained_fasttext)
train_under_custom = torch.tensor(undersampled_train.custom_trained_fasttext)
train_under_labels = torch.tensor(undersampled_train.main_theme)

train_over_fb = torch.tensor(oversampled_train.pretrained_fasttext)
train_over_custom = torch.tensor(oversampled_train.custom_trained_fasttext)
train_over_labels = torch.tensor(oversampled_train.main_theme)

validate_fb = torch.tensor(validate.pretrained_fasttext)
validate_custom = torch.tensor(validate.custom_trained_fasttext)
validate_labels = torch.tensor(validate.main_theme)

test_fb = torch.tensor(test.pretrained_fasttext)
test_custom = torch.tensor(test.custom_trained_fasttext)
test_labels = torch.tensor(test.main_theme)
test_rest_labels = test.rest_themes

ValueError: too many dimensions 'str'

In [487]:
pretrained_fasttext_MLP_u = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_under_')
pretrained_fasttext_MLP_o = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_over_')
pretrained_fasttext_MLP = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_')
custom_fasttext_MLP_u = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_under_')
custom_fasttext_MLP_o = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_over_' )
custom_fasttext_MLP = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_' )

In [None]:
pretrained_fasttext_MLP_u.trainloop(train_under_fb,train_under_labels,validate_fb,validate_labels)
pretrained_fasttext_MLP_o.trainloop(train_over_fb,train_over_labels,validate_fb,validate_labels)
pretrained_fasttext_MLP.trainloop(train_fb,train_labels,validate_fb,validate_labels)
custom_fasttext_MLP_u.trainloop(train_under_custom, train_under_labels, validate_custom, validate_labels)
custom_fasttext_MLP_o.trainloop(train_over_custom, train_over_labels, validate_custom, validate_labels)
custom_fasttext_MLP.trainloop(train_custom, train_labels, validate_custom, validate_labels)

In [504]:
acc_pre_u, real_acc_pre_u, pred_pre_u = pretrained_fasttext_MLP_u.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre_o, real_acc_pre_o, pred_pre_o = pretrained_fasttext_MLP_o.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre, real_acc_pre, pred_pre = pretrained_fasttext_MLP.evaluate(test_fb,test_labels,test_rest_labels)
acc_cust_u, real_acc_cust_u, pred_cust_u = custom_fasttext_MLP_u.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust_o, real_acc_cust_o, pred_cust_o = custom_fasttext_MLP_o.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust, real_acc_cust, pred_cust = custom_fasttext_MLP.evaluate(test_custom,test_labels, test_rest_labels)

In [508]:
table = [
    ['Model Name', 'Accuracy','Real Accuracy','Predited Classes','Last Epoch','Best Epoch Train','Best Epoch Validate'],
    [pretrained_fasttext_MLP_u.name,acc_pre_u,real_acc_pre_u,pred_pre_u,pretrained_fasttext_MLP_u.last_epoch,min(pretrained_fasttext_MLP_u.loss_during_training),min(pretrained_fasttext_MLP_u.valid_loss_during_training)],
    [pretrained_fasttext_MLP_o.name,acc_pre_o,real_acc_pre_o,pred_pre_o,pretrained_fasttext_MLP_o.last_epoch,min(pretrained_fasttext_MLP_o.loss_during_training),min(pretrained_fasttext_MLP_o.valid_loss_during_training)],
    [pretrained_fasttext_MLP.name,acc_pre,real_acc_pre,pred_pre,pretrained_fasttext_MLP.last_epoch,min(pretrained_fasttext_MLP.loss_during_training),min(pretrained_fasttext_MLP.valid_loss_during_training)],
    [custom_fasttext_MLP_u.name,acc_cust_u,real_acc_cust_u,pred_cust_u,custom_fasttext_MLP_u.last_epoch,min(custom_fasttext_MLP_u.loss_during_training),min(custom_fasttext_MLP_u.valid_loss_during_training)],
    [custom_fasttext_MLP_o.name,acc_cust_o,real_acc_cust_o,pred_cust_o,custom_fasttext_MLP_o.last_epoch,min(custom_fasttext_MLP_o.loss_during_training),min(custom_fasttext_MLP_o.valid_loss_during_training)],
    [custom_fasttext_MLP.name,acc_cust,real_acc_cust,pred_cust,custom_fasttext_MLP.last_epoch,min(custom_fasttext_MLP.loss_during_training),min(custom_fasttext_MLP.valid_loss_during_training)]
]

In [509]:
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒════════════════════════════════════════════╤════════════╤═════════════════╤══════════════════════════╤══════════════╤════════════════════╤═══════════════════════╕
│ Model Name                                 │   Accuracy │   Real Accuracy │ Predited Classes         │   Last Epoch │   Best Epoch Train │   Best Epoch Validate │
╞════════════════════════════════════════════╪════════════╪═════════════════╪══════════════════════════╪══════════════╪════════════════════╪═══════════════════════╡
│ pretrained_fast_under_[100, 50]_1672930517 │   0.576123 │        0.809692 │ [21, 23, 25, 29, 31, 27] │           39 │            1.31173 │               1.4513  │
├────────────────────────────────────────────┼────────────┼─────────────────┼──────────────────────────┼──────────────┼────────────────────┼───────────────────────┤
│ pretrained_fast_over_[100, 50]_1672930517  │   0.603993 │        0.832363 │ [21, 23, 29, 25, 31, 27] │           66 │            1.31854 │               1.42864 │
├─────────

In [518]:
text_summary_results = pd.DataFrame(data = table[1:], columns = table[0])
text_summary_results.to_csv('best_results_text_summary.csv',index=False)

### Training Title And Summary

#### Toguether (one vector)

In [642]:
train_fb = torch.tensor(train['pretrained_fasttext_title&summary'])
train_custom = torch.tensor(train['custom_trained_fasttext_title&summary'])
train_labels = torch.tensor(train.main_theme)

train_under_fb = torch.tensor(undersampled_train['pretrained_fasttext_title&summary'])
train_under_custom = torch.tensor(undersampled_train['custom_trained_fasttext_title&summary'])
train_under_labels = torch.tensor(undersampled_train.main_theme)

train_over_fb = torch.tensor(oversampled_train['pretrained_fasttext_title&summary'])
train_over_custom = torch.tensor(oversampled_train['custom_trained_fasttext_title&summary'])
train_over_labels = torch.tensor(oversampled_train.main_theme)

validate_fb = torch.tensor(validate['pretrained_fasttext_title&summary'])
validate_custom = torch.tensor(validate['custom_trained_fasttext_title&summary'])
validate_labels = torch.tensor(validate.main_theme)

test_fb = torch.tensor(test['pretrained_fasttext_title&summary'])
test_custom = torch.tensor(test['custom_trained_fasttext_title&summary'])
test_labels = torch.tensor(test.main_theme)
test_rest_labels = test.rest_themes

In [646]:
pretrained_fasttext_MLP_u = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_tsone_under_')
pretrained_fasttext_MLP_o = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_tsone_over_')
# pretrained_fasttext_MLP = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_tsone_fast_')
custom_fasttext_MLP_u = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tsone_under_')
custom_fasttext_MLP_o = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tsone_over_' )
custom_fasttext_MLP = MLP(dimx = 300, hidden_dim = [100,50], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tsone_' )

In [647]:
pretrained_fasttext_MLP_u.trainloop(train_under_fb,train_under_labels,validate_fb,validate_labels)
pretrained_fasttext_MLP_o.trainloop(train_over_fb,train_over_labels,validate_fb,validate_labels)
# pretrained_fasttext_MLP.trainloop(train_fb,train_labels,validate_fb,validate_labels)
custom_fasttext_MLP_u.trainloop(train_under_custom, train_under_labels, validate_custom, validate_labels)
custom_fasttext_MLP_o.trainloop(train_over_custom, train_over_labels, validate_custom, validate_labels)
custom_fasttext_MLP.trainloop(train_custom, train_labels, validate_custom, validate_labels)

Training: pretrained_fast_tsone_under_[100, 50]_1673102749
Training loss after 0  epochs: 1.675601868165864, classes [23, 27, 31, 25, 21, 29]
Validation loss after 0 epochs: 1.7140661981111756
Validation has reduced from inf to 1.7140661981111756. Saving model ...
Training loss after 1  epochs: 1.5453554285727336, classes [27, 31, 29, 21, 25, 23]
Validation loss after 1 epochs: 1.626965512972505
Validation has reduced from 1.7140661981111756 to 1.626965512972505. Saving model ...
Training loss after 2  epochs: 1.5038283423141197, classes [27, 31, 29, 25, 21, 23]
Validation loss after 2 epochs: 1.59631246008512
Validation has reduced from 1.626965512972505 to 1.59631246008512. Saving model ...
Training loss after 3  epochs: 1.4781497128074326, classes [27, 31, 29, 25, 23, 21]
Validation loss after 3 epochs: 1.5405810702709906
Validation has reduced from 1.59631246008512 to 1.5405810702709906. Saving model ...
Training loss after 4  epochs: 1.4607779249081925, classes [27, 31, 25, 29, 23

In [648]:
acc_pre_u, real_acc_pre_u, pred_pre_u = pretrained_fasttext_MLP_u.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre_o, real_acc_pre_o, pred_pre_o = pretrained_fasttext_MLP_o.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre, real_acc_pre, pred_pre = pretrained_fasttext_MLP.evaluate(test_fb,test_labels,test_rest_labels)
acc_cust_u, real_acc_cust_u, pred_cust_u = custom_fasttext_MLP_u.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust_o, real_acc_cust_o, pred_cust_o = custom_fasttext_MLP_o.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust, real_acc_cust, pred_cust = custom_fasttext_MLP.evaluate(test_custom,test_labels, test_rest_labels)

In [651]:
table = [
    ['Model Name', 'Accuracy','Real Accuracy','Predited Classes','Last Epoch','Best Epoch Train','Best Epoch Validate'],
    [pretrained_fasttext_MLP_u.name,acc_pre_u,real_acc_pre_u,pred_pre_u,pretrained_fasttext_MLP_u.last_epoch,min(pretrained_fasttext_MLP_u.loss_during_training),min(pretrained_fasttext_MLP_u.valid_loss_during_training)],
    [pretrained_fasttext_MLP_o.name,acc_pre_o,real_acc_pre_o,pred_pre_o,pretrained_fasttext_MLP_o.last_epoch,min(pretrained_fasttext_MLP_o.loss_during_training),min(pretrained_fasttext_MLP_o.valid_loss_during_training)],
    [pretrained_fasttext_MLP.name,acc_pre,real_acc_pre,pred_pre,pretrained_fasttext_MLP.last_epoch,min(pretrained_fasttext_MLP.loss_during_training),min(pretrained_fasttext_MLP.valid_loss_during_training)],
    [custom_fasttext_MLP_u.name,acc_cust_u,real_acc_cust_u,pred_cust_u,custom_fasttext_MLP_u.last_epoch,min(custom_fasttext_MLP_u.loss_during_training),min(custom_fasttext_MLP_u.valid_loss_during_training)],
    [custom_fasttext_MLP_o.name,acc_cust_o,real_acc_cust_o,pred_cust_o,custom_fasttext_MLP_o.last_epoch,min(custom_fasttext_MLP_o.loss_during_training),min(custom_fasttext_MLP_o.valid_loss_during_training)],
    [custom_fasttext_MLP.name,acc_cust,real_acc_cust,pred_cust,custom_fasttext_MLP.last_epoch,min(custom_fasttext_MLP.loss_during_training),min(custom_fasttext_MLP.valid_loss_during_training)]
]

In [652]:
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒══════════════════════════════════════════════════╤════════════╤═════════════════╤══════════════════════════╤══════════════╤════════════════════╤═══════════════════════╕
│ Model Name                                       │   Accuracy │   Real Accuracy │ Predited Classes         │   Last Epoch │   Best Epoch Train │   Best Epoch Validate │
╞══════════════════════════════════════════════════╪════════════╪═════════════════╪══════════════════════════╪══════════════╪════════════════════╪═══════════════════════╡
│ pretrained_fast_tsone_under_[100, 50]_1673102749 │   0.564892 │        0.799293 │ [21, 25, 29, 23, 31, 27] │           42 │            1.31812 │               1.47053 │
├──────────────────────────────────────────────────┼────────────┼─────────────────┼──────────────────────────┼──────────────┼────────────────────┼───────────────────────┤
│ pretrained_fast_tsone_over_[100, 50]_1673102749  │   0.589434 │        0.81926  │ [21, 29, 25, 23, 31, 27] │           32 │            1.34004 

In [653]:
text_summary_results = pd.DataFrame(data = table[1:], columns = table[0])
text_summary_results.to_csv('best_results_text_title&sumary_one_vector.csv',index=False)

#### Separetly (two vectors)

In [18]:
train_fb = torch.cat((torch.tensor(train['pretrained_fasttext']),torch.tensor(train['pretrained_fasttext_title'])),1)
train_custom = torch.cat((torch.tensor(train['custom_trained_fasttext']),torch.tensor(train['custom_trained_fasttext_title'])),1)
train_labels = torch.tensor(train.main_theme)

train_under_fb = torch.cat((torch.tensor(undersampled_train['pretrained_fasttext']),torch.tensor(undersampled_train['pretrained_fasttext_title'])),1)
train_under_custom = torch.cat((torch.tensor(undersampled_train['custom_trained_fasttext']),torch.tensor(undersampled_train['custom_trained_fasttext_title'])),1)
train_under_labels = torch.tensor(undersampled_train.main_theme)

train_over_fb = torch.cat((torch.tensor(oversampled_train['pretrained_fasttext']),torch.tensor(oversampled_train['pretrained_fasttext_title'])),1)
train_over_custom = torch.cat((torch.tensor(oversampled_train['custom_trained_fasttext']),torch.tensor(oversampled_train['custom_trained_fasttext_title'])),1)
train_over_labels = torch.tensor(oversampled_train.main_theme)

validate_fb = torch.cat((torch.tensor(validate['pretrained_fasttext']),torch.tensor(validate['pretrained_fasttext_title'])),1)
validate_custom = torch.cat((torch.tensor(validate['custom_trained_fasttext']),torch.tensor(validate['custom_trained_fasttext_title'])),1)
validate_labels = torch.tensor(validate.main_theme)

test_fb = torch.cat((torch.tensor(test['pretrained_fasttext']),torch.tensor(test['pretrained_fasttext_title'])),1)
test_custom = torch.cat((torch.tensor(test['custom_trained_fasttext']),torch.tensor(test['custom_trained_fasttext_title'])),1)
test_labels = torch.tensor(test.main_theme)
test_rest_labels = test.rest_themes

In [21]:
pretrained_fasttext_MLP_u = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_tstwo_under_')
pretrained_fasttext_MLP_o = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_fast_tstwo_over_')
pretrained_fasttext_MLP = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'pretrained_tstwo_fast_')
custom_fasttext_MLP_u = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tstwo_under_')
custom_fasttext_MLP_o = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tstwo_over_' )
custom_fasttext_MLP = MLP(dimx = 600, hidden_dim = [200,100], nlabels = 6, dropout_prob = 0.2, epochs = 100, name = 'custom_fast_tstwo_' )

In [None]:
pretrained_fasttext_MLP_u.trainloop(train_under_fb,train_under_labels,validate_fb,validate_labels)
pretrained_fasttext_MLP_o.trainloop(train_over_fb,train_over_labels,validate_fb,validate_labels)
pretrained_fasttext_MLP.trainloop(train_fb,train_labels,validate_fb,validate_labels)
custom_fasttext_MLP_u.trainloop(train_under_custom, train_under_labels, validate_custom, validate_labels)
custom_fasttext_MLP_o.trainloop(train_over_custom, train_over_labels, validate_custom, validate_labels)
custom_fasttext_MLP.trainloop(train_custom, train_labels, validate_custom, validate_labels)

In [23]:
acc_pre_u, real_acc_pre_u, pred_pre_u = pretrained_fasttext_MLP_u.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre_o, real_acc_pre_o, pred_pre_o = pretrained_fasttext_MLP_o.evaluate(test_fb,test_labels,test_rest_labels)
acc_pre, real_acc_pre, pred_pre = pretrained_fasttext_MLP.evaluate(test_fb,test_labels,test_rest_labels)
acc_cust_u, real_acc_cust_u, pred_cust_u = custom_fasttext_MLP_u.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust_o, real_acc_cust_o, pred_cust_o = custom_fasttext_MLP_o.evaluate(test_custom,test_labels, test_rest_labels)
acc_cust, real_acc_cust, pred_cust = custom_fasttext_MLP.evaluate(test_custom,test_labels, test_rest_labels)

In [None]:
table = [
    ['Model Name', 'Accuracy','Real Accuracy','Predited Classes','Last Epoch','Best Epoch Train','Best Epoch Validate'],
    [pretrained_fasttext_MLP_u.name,acc_pre_u,real_acc_pre_u,pred_pre_u,pretrained_fasttext_MLP_u.last_epoch,min(pretrained_fasttext_MLP_u.loss_during_training),min(pretrained_fasttext_MLP_u.valid_loss_during_training)],
    [pretrained_fasttext_MLP_o.name,acc_pre_o,real_acc_pre_o,pred_pre_o,pretrained_fasttext_MLP_o.last_epoch,min(pretrained_fasttext_MLP_o.loss_during_training),min(pretrained_fasttext_MLP_o.valid_loss_during_training)],
    [pretrained_fasttext_MLP.name,acc_pre,real_acc_pre,pred_pre,pretrained_fasttext_MLP.last_epoch,min(pretrained_fasttext_MLP.loss_during_training),min(pretrained_fasttext_MLP.valid_loss_during_training)],
    [custom_fasttext_MLP_u.name,acc_cust_u,real_acc_cust_u,pred_cust_u,custom_fasttext_MLP_u.last_epoch,min(custom_fasttext_MLP_u.loss_during_training),min(custom_fasttext_MLP_u.valid_loss_during_training)],
    [custom_fasttext_MLP_o.name,acc_cust_o,real_acc_cust_o,pred_cust_o,custom_fasttext_MLP_o.last_epoch,min(custom_fasttext_MLP_o.loss_during_training),min(custom_fasttext_MLP_o.valid_loss_during_training)],
    [custom_fasttext_MLP.name,acc_cust,real_acc_cust,pred_cust,custom_fasttext_MLP.last_epoch,min(custom_fasttext_MLP.loss_during_training),min(custom_fasttext_MLP.valid_loss_during_training)]
]

In [29]:
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒═══════════════════════════════════════════════════╤════════════╤═════════════════╤══════════════════════════╤══════════════╤════════════════════╤═══════════════════════╕
│ Model Name                                        │   Accuracy │   Real Accuracy │ Predited Classes         │   Last Epoch │   Best Epoch Train │   Best Epoch Validate │
╞═══════════════════════════════════════════════════╪════════════╪═════════════════╪══════════════════════════╪══════════════╪════════════════════╪═══════════════════════╡
│ pretrained_fast_tstwo_under_[200, 100]_1673137505 │   0.547213 │        0.77683  │ [21, 25, 29, 31, 23, 27] │           46 │            1.40456 │               1.45324 │
├───────────────────────────────────────────────────┼────────────┼─────────────────┼──────────────────────────┼──────────────┼────────────────────┼───────────────────────┤
│ pretrained_fast_tstwo_over_[200, 100]_1673137505  │   0.595466 │        0.824459 │ [21, 29, 25, 23, 31, 27] │           43 │            1.

In [30]:
text_summary_results = pd.DataFrame(data = table[1:], columns = table[0])
text_summary_results.to_csv('best_results_text_title&sumary_two_vector.csv',index=False)