In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import sys

import pickle

from transformers import AutoConfig, AutoTokenizer, TFAutoModel, DataCollatorWithPadding

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam

import metrics

from simpletransformers.language_representation import RepresentationModel

from TweetDataReport import datasplit, print_tweet_report, check_relevance_balance

import time
import itertools
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

from IPython.display import clear_output

import os
import re

from hyperopt import hp, fmin, tpe , pyll

In [2]:
print("Python Version:" + sys.version)
print("TensorFlow Version:", tf.__version__)
if tf.test.gpu_device_name():
    print("GPU is available")
else:
    print("GPU is NOT available")
print("CUDA Version:", tf.test.is_built_with_cuda())

Python Version:3.9.18 (main, Sep 11 2023, 14:09:26) [MSC v.1916 64 bit (AMD64)]
TensorFlow Version: 2.10.0
GPU is available
CUDA Version: True


In [125]:
with open('data/italian/00-dirty_dataset/feature_extractions/bert/dbmdz_bert-base-italian-cased.pkl','rb') as file:
    WF = pickle.load(file)
data = WF.copy()
data

Unnamed: 0,reps,relevance
0,"[[-0.16349472, -0.022513028, 0.1601058, 0.3005...",1
1,"[[-0.1870734, -0.1289106, 0.36564282, 0.020390...",0
2,"[[-0.058511183, -0.09192906, -0.24975105, 0.45...",1
3,"[[-0.07063484, -0.08345323, 0.18612616, 0.2530...",1
4,"[[-0.09547403, 0.21283852, 0.012412298, 0.3508...",1
...,...,...
1887,"[[-0.030228468, -0.14970928, 0.20723383, 0.424...",0
1888,"[[-0.115419574, 0.18743767, 0.23865303, 0.2736...",1
1889,"[[-0.042768005, 0.074689046, 0.05405128, 0.196...",1
1890,"[[-0.09403053, -0.04718489, 0.18331964, 0.4239...",0


In [219]:
class Relovir_Error(Exception):
    """
    This class checks if the given relovir number is acceptable based on the ratio of the dataset you gave.
    The dataset of course must be a pandas dataframe with two columns.The second column must be the 0
    (irrelevant) or 1(relevant) values of the corresponding tweets/representations. 
    
    """
    def __init__(self,dataset,relovir):
        self.relovir = relovir
        dataset = dataset.copy()
        checker = check_relevance_balance(dataset).copy()
        self.irr = checker.loc[checker['relevance']==0,'count'].reset_index(drop=True)[0]
        self.rel = checker.loc[checker['relevance']==1,'count'].reset_index(drop=True)[0]
        self.ratio = self.rel/self.irr
        if self.relovir>self.ratio or self.relovir>1:
            raise self
    def __str__(self):
        if self.relovir>self.ratio:
            return f"The relovir ratio of your dataset is {self.ratio} but you gave me {self.relovir}. The relovir ratio cannot be larger than 1."

In [222]:
def datasplit_new(df,testsize,relovir=None):
    """
    Firstly we split the dataset into train and test parts.
    
    Then we create the training dataset by picking up the irrelevant tweets from the training 
    split part with only the number of relevant tweet
    
    The relovir variable represents the relative ratio of irrelevant(we usually have more irrelevant so) 
    over relevant number of training examples in the set.
    
    Returns as a (examples,768) np array the representations and the y as (examples,) shaped np array.
    
    Future: maybe it would be better to split the dataset by relevance and then pick up the "correct" 
    relovir ratio for the test dataset from the ratio of the total dataset. 
    Now we include some randomness which is not particularly wanted due to the fact that after the split
    the relovir ratios of the training and test parts won't match exactly. 
    I will have to make 20 30 iterations per model to make sure we get the average.
    In the other case we were going to be satisfied with 5. 
    """
    df = df.copy() # make a copied instance of the dataset
    
    X_train, X_test, y_train, y_test = train_test_split(df['reps'], df['relevance'], test_size = testsize)

    if relovir is not None:
        
        Relovir_Error(df,relovir) # check if you can accept the relovir variable instance maybe the ratio is not that big
        
        # now we reconstruct the training dataset in order to use the relovir 
        training_set = pd.DataFrame()
        training_set['reps'] = X_train
        training_set['relevance'] = y_train
        training_set.reset_index(drop = True)
        # we split the training data in irrelevant and relevant cases
        # we make sure the zeros and the ones correctly correspond to irr and rel respectively
        grouping = training_set.groupby('relevance')
        group_dict = {}
        for name, group in grouping:
            group_dict[str(name)] = group
        # we find the absolute numbers 
        irr = group_dict['0']
        rel = group_dict['1']
        #print(len(irr)+len(rel))
        #pickup all the irrelevants and the correct random part of the relevants
        dfirr = irr.sample(frac = 1).reset_index(drop = True)
        #print(len(dfirr))
        dfrel = rel.sample(n = int(len(irr)*relovir)).reset_index(drop = True)
        #print(len(dfrel))
        #print(len(dfirr)+len(dfrel)
        training_set = None
        training_set = pd.concat([dfirr, dfrel]).sample(frac = 1).reset_index(drop = True)
    
    training_set_X = np.vstack(training_set['reps'])
    test_set_X = np.vstack(X_test)
    y_train = training_set['relevance'].to_frame().reset_index(drop = True)
    y_test = y_test.to_frame().reset_index(drop = True)
    return training_set_X, test_set_X, y_train.to_numpy().reshape(-1), y_test.to_numpy().reshape(-1)
    
xtr , xts , ytr , yts = datasplit_new(data, testsize = 0.3, relovir = 0.2)


(1056,)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [191]:
print_tweet_report(data)

AttributeError: 'tuple' object has no attribute 'columns'

In [177]:
check_relevance_balance(data_new)

Unnamed: 0,relevance,count,balance
0,0,860,64.95%
1,1,464,35.05%


In [74]:
data_new = datasplit_new(data, testsize = 0.3, relovir = 0.4)
# data_new

{'rel0':                                                    reps  relevance
878   [[ 7.82199129e-02 -9.90853831e-02  1.13456212e...          0
693   [[-6.93078190e-02 -1.53684229e-01  1.67621210e...          0
758   [[-2.75426716e-01  9.96491760e-02 -8.91101658e...          0
108   [[-6.13126159e-01 -7.80202821e-02  1.87519327e...          0
1426  [[ 4.42697145e-02 -1.32218733e-01  3.34164947e...          0
...                                                 ...        ...
1191  [[-3.35526586e-01  2.43928675e-02  3.65641177e...          0
1677  [[ 4.48293053e-02  3.26756150e-01  2.34281510e...          0
419   [[-2.38247976e-01 -1.18343003e-01 -1.08924881e...          0
689   [[-1.11226447e-01 -1.33113116e-01  1.69762716e...          0
1526  [[-3.57638150e-02  6.03203811e-02  3.91110390e...          0

[882 rows x 2 columns], 'rel1':                                                    reps  relevance
793   [[ 1.18133217e-01  1.06173556e-03  2.20925882e...          1
436   [[ 2.75166184e

In [73]:
print_tweet_report(data_new)

AttributeError: 'NoneType' object has no attribute 'columns'

In [20]:
check_relevance_balance(data)

Unnamed: 0,relevance,count,balance
0,0,1258,66.49%
1,1,634,33.51%
