In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import sys

import pickle

from transformers import AutoConfig, AutoTokenizer, TFAutoModel, DataCollatorWithPadding

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam

import metrics

from simpletransformers.language_representation import RepresentationModel

from TweetDataReport import datasplit, print_tweet_report, check_relevance_balance

import time
import itertools
import warnings
# warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

from IPython.display import clear_output

import os
import re

from hyperopt import hp, fmin, tpe , pyll

In [4]:
print("Python Version:" + sys.version)
print("TensorFlow Version:", tf.__version__)
if tf.test.gpu_device_name():
    print("GPU is available")
else:
    print("GPU is NOT available")
print("CUDA Version:", tf.test.is_built_with_cuda())

Python Version:3.9.18 (main, Sep 11 2023, 14:09:26) [MSC v.1916 64 bit (AMD64)]
TensorFlow Version: 2.10.0
GPU is available
CUDA Version: True


In [5]:
with open('data/italian/00-dirty_dataset/feature_extractions/bert/dbmdz_bert-base-italian-cased.pkl','rb') as file:
    WF = pickle.load(file)

In [4]:
check_relevance_balance(WF)

Unnamed: 0,relevance,count,balance
0,0,1258,66.49%
1,1,634,33.51%


In [5]:
print_tweet_report(WF)

Unnamed: 0,column_names,data_types,shape_len,unique_values
0,reps,<class 'numpy.ndarray'>,"(1, 768)",1892
1,relevance,<class 'numpy.int64'>,(),2


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
        
def datasplit_new(df,testsize,relovir):
    """
    Firstly we split the dataset into train and test parts.
    
    Then we create the training dataset by picking up the irrelevant tweets from the training 
    split part with only the number of relevant tweet
    
    The relovir variable represents the relative ratio of irrelevant(we usually have more irrelevant so) 
    over relevant number of training examples in the set.
    
    Returns as a (examples,768) np array the representations and the y as (examples,) shaped np array.
    
    Future: we need to be able to reduce zero and one examples accordinglydepending of which 
    
    """
    # make a copied instance of the dataset
    df = df.copy()
    # we split the dataset into train and test subsets
    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df['reps'], df['relevance'], test_size = testsize)
    # reconstruct training set
    if relovir>0:
        training_set = pd.DataFrame()
        training_set['reps'] = df_X_train.copy()
        training_set['relevance'] = df_y_train.copy()
        training_set.reset_index(drop = True)
        # we split the training dataset by relervance into two DataFrames irr and rel
        grouping = training_set.groupby('relevance')
        group_dict = {}
        for name, group in grouping:
            group_dict[str(name)] = group
        # we find the absolute numbers 
        df_training_irr = group_dict['0'].reset_index(drop = True)
        df_training_rel = group_dict['1'].reset_index(drop = True)
        # based on the relovir parameter pick up the set with the appropriate ratio, here an explanation for the inner if is needed:
        Nrel = len(df_training_rel)
        print(Nrel)
        Nirr = len(df_training_irr)
        print(Nirr)
        N = Nrel + Nirr
        if relovir<=1:
            if relovir<Nrel/Nirr:
                relevant_part = df_training_rel.sample(n = int(Nirr*relovir))
                df_training = pd.concat([df_training_irr, relevant_part]).sample(frac=1).reset_index(drop = True)
            else:
                df_training = pd.concat([df_training_irr, df_training_rel]).sample(frac=1).reset_index(drop = True)
        else:
            if relovir<Nrel/Nirr:
                irrelevant_part = df_training_irr.sample(n = int((1/relovir)*Nrel))
                df_training = pd.concat([df_training_rel, irrelevant_part]).sample(frac=1).reset_index(drop = True)
            else:
                df_training = pd.concat([df_training_irr, df_training_rel]).sample(frac=1).reset_index(drop = True)
        df_X_train = df_training['reps'].copy()
        df_y_train = df_training['relevance'].copy()
    else:
        print("relovir can't be negative or zero")
    df_X_train.apply(lambda x: x.reshape(768,))
    df_X_test = df_X_test.copy().apply(lambda x: x.reshape(768,))
    training_set_X = np.vstack(df_X_train)
    test_set_X = np.vstack(df_X_test)
    return training_set_X, test_set_X, df_y_train, df_y_test

training_set_X, test_set_X, training_set_y,  test_set_y = datasplit_new(WF,0.3,relovir=0.1)
some = pd.DataFrame()
some['relevance']=training_set_y
check_relevance_balance(some)

440
884


Unnamed: 0,relevance,count,balance
0,0,884,90.95%
1,1,88,9.05%


In [6]:
print(f"Το training set X: {type(training_set_X)} με {training_set_X.shape}")
print(f"To test set X: {type(test_set_X)} με {test_set_X.shape}")
print(f"To training set y: {type(training_set_y)} με {len(training_set_y)}")
print(f"To test set y: {type(test_set_y )} με {len(test_set_y)}")

model = Sequential()
model.add(Input(shape = (768,)))
model.add(Dense(768,activation='relu'))
model.add(Dense(384,activation='relu'))
model.add(Dense(192,activation='relu'))
model.add(Dense(1,activation='softmax'))
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['acc', metrics.precision, metrics.recall, metrics.f1])
history = model.fit(training_set_X, training_set_y,validation_data=( test_set_X,  test_set_y), batch_size = 100, epochs = 50, verbose = 2,callbacks=[],shuffle = True)
model.evaluate(test_set_X,test_set_y)

Το training set X: <class 'numpy.ndarray'> με (1044, 768)
To test set X: <class 'numpy.ndarray'> με (568, 768)
To training set y: <class 'pandas.core.series.Series'> με 1044
To test set y: <class 'pandas.core.series.Series'> με 568
Epoch 1/50
11/11 - 2s - loss: 0.5065 - acc: 0.1667 - precision: 0.1651 - recall: 1.0000 - f1: 0.2822 - val_loss: 0.6962 - val_acc: 0.3169 - val_precision: 0.3196 - val_recall: 1.0000 - val_f1: 0.4834 - 2s/epoch - 219ms/step
Epoch 2/50
11/11 - 0s - loss: 0.4015 - acc: 0.1667 - precision: 0.1651 - recall: 1.0000 - f1: 0.2817 - val_loss: 0.5921 - val_acc: 0.3169 - val_precision: 0.3196 - val_recall: 1.0000 - val_f1: 0.4834 - 95ms/epoch - 9ms/step
Epoch 3/50
11/11 - 0s - loss: 0.3503 - acc: 0.1667 - precision: 0.1721 - recall: 1.0000 - f1: 0.2921 - val_loss: 0.4972 - val_acc: 0.3169 - val_precision: 0.3196 - val_recall: 1.0000 - val_f1: 0.4834 - 103ms/epoch - 9ms/step
Epoch 4/50
11/11 - 0s - loss: 0.3061 - acc: 0.1667 - precision: 0.1640 - recall: 1.0000 - f1: 0

[0.3954237103462219,
 0.31690141558647156,
 0.31712964177131653,
 1.0,
 0.4766274690628052]

In [232]:
451/873

0.5166093928980527

In [203]:
data_new = datasplit_new(data, testsize = 0.3, relovir = 0.4)
# data_new

NameError: name 'data' is not defined

In [73]:
print_tweet_report(data_new)

AttributeError: 'NoneType' object has no attribute 'columns'

In [20]:
check_relevance_balance(data)

Unnamed: 0,relevance,count,balance
0,0,1258,66.49%
1,1,634,33.51%


In [165]:
import numpy as np
import pandas as pd

# Example Pandas Series containing NumPy arrays
my_series = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])])

# Use np.vstack() to stack the arrays vertically
stacked_array = np.vstack(my_series)

# The resulting array will have shape (1300, 768)
print(stacked_array.shape)

(2, 3)


In [166]:
my_series

0    [1, 2, 3]
1    [4, 5, 6]
dtype: object

In [167]:
stacked_array

array([[1, 2, 3],
       [4, 5, 6]])