In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Import Functions
import sys
sys.path.append('../src')
from preprocessing import DataInspection, HandleMissingValues, RemoveDuplicates, CorrectDataTypes, TranslateText, HandleOutliers, SplitDataset

# Fraction of the full training set fed through the pipeline, and the seed
# that makes that subset identical on every run (an unseeded sample() draws
# different rows each time, so downstream results could not be reproduced).
SAMPLE_FRACTION = 0.001
RANDOM_SEED = 42

# Create one instance of each preprocessing stage.
# The individual names are kept so that other cells can still reference them.
data_inspection = DataInspection()
handle_missing_values = HandleMissingValues()
remove_duplicates = RemoveDuplicates()
correct_data_types = CorrectDataTypes()
translate_text = TranslateText()
handle_outliers = HandleOutliers()
split_dataset = SplitDataset()

# Chain the stages: each stage is linked to its successor via set_next().
# Keeping the order in a single tuple makes it obvious and easy to change.
pipeline_stages = (
    data_inspection,
    handle_missing_values,
    remove_duplicates,
    correct_data_types,
    translate_text,
    handle_outliers,
    split_dataset,
)
for current_stage, following_stage in zip(pipeline_stages, pipeline_stages[1:]):
    current_stage.set_next(following_stage)

# Load the full training set, then keep a small reproducible sample of it.
train_df = pd.read_csv('../data/train_data.csv')
# NOTE: this first line reports the shape of the FULL dataset (train_df),
# even though its label reads "train_data".
print(f'train_data.shape: {train_df.shape}')
train_data = (
    train_df
    .sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    .reset_index(drop=True)  # discard the original row labels
)
print(f'train_data.shape: {train_data.shape}')

# Run the pipeline, entering at the first stage of the chain.
# NOTE(review): the recorded output reports "y_train shape: (2880, 2, 2)" while
# the labels are already a two-column indicator frame — this looks like the
# labels being one-hot encoded twice inside SplitDataset; confirm there.
processed_data = data_inspection.process(train_data)

train_data.shape: (3600000, 3)
train_data.shape: (3600, 3)

**View Data Structure**

HEAD


Unnamed: 0,labels,review_title,text
0,__label__2,Childress doesn't disappoint...,"Quirky, dysfunctional Southerners are among my..."
1,__label__1,fails on every level,A previous customer described Jackie under My ...
2,__label__1,Smells like Glue!,I loved the concept of the pillow and how it m...
3,__label__1,"Came stained and torn, also not very comfortable","When we received the product, the fabric of th..."
4,__label__1,incorrect charge,the movie was good but I should not have been ...



SAMPLE


Unnamed: 0,labels,review_title,text
3441,__label__1,Rusty,We got these for our rental condo. When we vis...
2041,__label__2,our good luck,I have both read & taught Michael K. and consi...
2951,__label__2,Ashes of Britannia,"Boadicea is a Druid-Queen. General Suetonius, ..."
3028,__label__2,Must read!,"If you've read, ""Generation to Generation"" by ..."
941,__label__2,Awesome book,I think this book is better than the Harry pot...



TAIL


Unnamed: 0,labels,review_title,text
3595,__label__1,Not very helpful,If you are looking for a book to help with foo...
3596,__label__2,"Read it in an hour, think about it for a lifet...","I read ""Old Man and the Sea"" nearly three year..."
3597,__label__1,What is gun control?,"Do you truly wish felons, terrorists, mentally..."
3598,__label__1,Too soft,"The fabric is very comfortable, but entirely t..."
3599,__label__2,A Most Delightful Read,I was given this book by a friend two years be...



**Info Summary**

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   labels        3600 non-null   object
 1   review_title  3589 non-null   object
 2   text          3600 non-null   object
dtypes: object(3)
memory usage: 2.1 MB

**Summary Statistics**


Unnamed: 0,labels,review_title,text
count,3600,3589,3600
unique,2,3459,3600
top,__label__2,Disappointed,"Quirky, dysfunctional Southerners are among my..."
freq,1800,10,1



**Handle Missing Values**

review_title    11
labels           0
text             0
dtype: int64

Actual Missing Values

labels          0
review_title    0
text            0
dtype: int64

**Remove Duplicates**

There are no duplicated values.
ACTUAL SHAPE: (3600, 3)

**Translate Text**


Translating Text: 100%|██████████| 3600/3600 [00:12<00:00, 285.19it/s]



**Handle Outliers**

Number of detected outliers: 0
SHAPE BEFORE: (3600, 4)
ACTUAL SHAPE: (3600, 4)



2024-07-08 18:01:51.994588: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-08 18:01:52.006150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 18:01:52.024540: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 18:01:52.024577: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 18:01:52.035731: I tensorflow/core/platform/cpu_feature_gua


LABELS:
   __label__1  __label__2
0           0           1
1           1           0
2           1           0
3           1           0
4           1           0

X_train shape: (2880, 50)
y_train shape: (2880, 2, 2)
