Inspiration: https://www.tensorflow.org/tutorials/text/classify_text_with_bert

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [3]:
import os
import shutil
import gzip
import re

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa
from official.nlp import optimization  # to create AdamW optmizer

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import StratifiedShuffleSplit
from keras.callbacks import ModelCheckpoint

tf.get_logger().setLevel('ERROR')

# Data importation

In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

def clean_description(content):
    length = len(content)

    if length == 0:
      return np.nan
    elif length == 1:
      return content[0]
    else:
      return "".join(content)
      
def parse_external_data(path, int_class):
    ext_data = pd.read_json(path, lines=True)
    df = ext_data[['title', 'description']]
    df['class'] = int_class
    df['processed_description'] = df.apply(lambda x: clean_description(x['description']), axis=1)
    df.drop(columns=['description'], inplace=True, axis=1)
    df.rename(mapper={"processed_description": "description"}, inplace=True, axis=1)
    return df


os.chdir('/content/drive/MyDrive/Capgemini/Hackathon/notebooks')
Xy_train = pd.read_csv(os.path.join('..', 'data', 'original', 'train.csv'), index_col=['id'])
X_test = pd.read_csv(os.path.join('..', 'data', 'original', 'test.csv'), index_col=['id'])
Xy_train.dropna(subset=['title'], inplace=True)

ext_data_games = parse_external_data(os.path.join('..', 'data', 'external', 'videogames.json'), 0)
ext_data_electronics = parse_external_data(os.path.join('..', 'data', 'external', 'reduced_electronics.json'), 3)
ext_data_pets = parse_external_data(os.path.join('..', 'data', 'external', 'reduced_pets.json'), 1)
ext_data_sports = parse_external_data(os.path.join('..', 'data', 'external', 'reduced_sports.json'), 2)
data = pd.concat([Xy_train, ext_data_games, ext_data_electronics, ext_data_pets, ext_data_sports], ignore_index=True)
data.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421561 entries, 0 to 421560
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        421561 non-null  object
 1   description  382620 non-null  object
 2   class        421561 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [5]:
data.groupby(by='class').count()

Unnamed: 0_level_0,title,description
class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,86376,76026
1,111362,104215
2,112285,100121
3,111538,102258


In [6]:
data.head()

Unnamed: 0,title,description,class
0,Samsung 32-inch Curved LED Monitor (Ultra- Sli...,1800R curved monitor with 3000:1 contrast rati...,3
1,HotHands Body & Hand Super Warmers - Long Last...,"Bring the Heat!,HotHands Body & Hand Super War...",2
2,"WePet Cat Litter Mat, Kitty Litter Trapping Ma...",,1
3,All-new Kindle Paperwhite Water-Safe Fabric Co...,,3
4,Street Fighter 30th Anniversary Collection - P...,Celebrate the 30th Anniversary of the iconic S...,0


In [7]:
data.isna().sum()

title              0
description    38941
class              0
dtype: int64

## Preprocessing Data

In [8]:
def clean_str(content):
    length = len(content)

    if length <= 1:
      return np.nan
    else:
      return content

def filter_str(df):
    mask_title = df['title'].str.contains(r'^.*[a-zA-Z].*$', na=True)
    cleaned_title = df[mask_title]
    mask_description = cleaned_title['description'].str.contains(r'^.*[a-zA-Z].*$', na=True)
    cleaned_both = cleaned_title[mask_description]

    BAD_CHARS = ['<', '>', ']', '[', '{', '}', '|', '\\']
    pat = '|'.join(['({})'.format(re.escape(c)) for c in BAD_CHARS])
    cleaned = cleaned_both[~cleaned_both['description'].str.contains(pat)]
    return cleaned

def preprocess_data(df):
  df.dropna(subset=['description', 'title'], inplace=True)
  df['processed_title'] = df.apply(lambda x: clean_str(x['title']), axis=1)
  df.drop(columns=['title'], inplace=True, axis=1)
  df.rename(mapper={'processed_title': 'title'}, inplace=True, axis=1)
  cleaned_df = df.dropna(subset=['description', 'title'], inplace=False)

  df['processed_description'] = df.apply(lambda x: clean_str(x['description']), axis=1)
  df.drop(columns=['description'], inplace=True, axis=1)
  df.rename(mapper={'processed_description': 'description'}, inplace=True, axis=1)
  df.dropna(subset=['description', 'title'], inplace=True)

  cleaned_df = filter_str(df)
  return cleaned_df

cleaned_df = preprocess_data(data)
cleaned_df.isna().sum()

  return func(self, *args, **kwargs)


class          0
title          0
description    0
dtype: int64

In [9]:
cleaned_df.groupby(by='class').count()

Unnamed: 0_level_0,title,description
class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45852,45852
1,93050,93050
2,85355,85355
3,75073,75073


In [10]:
cleaned_df.head()

Unnamed: 0,class,title,description
0,3,Samsung 32-inch Curved LED Monitor (Ultra- Sli...,1800R curved monitor with 3000:1 contrast rati...
1,2,HotHands Body & Hand Super Warmers - Long Last...,"Bring the Heat!,HotHands Body & Hand Super War..."
4,0,Street Fighter 30th Anniversary Collection - P...,Celebrate the 30th Anniversary of the iconic S...
5,1,Pioneer Pet Replacement Pump and Transformer,Replacement pump for Pioneer Pet Fountains. Fo...
7,1,ASPCA ACC Cat House & Scratcher w/ Bonus Catni...,Give kitty a relaxing place to play and hide w...


In [11]:
cleaned_df['len_description'] = cleaned_df['description'].apply(lambda x: len(x.split()))
cleaned_df['len_title'] = cleaned_df['title'].apply(lambda x: len(x.split()))
cleaned_df.describe()

Unnamed: 0,class,len_description,len_title
count,299330.0,299330.0,299330.0
mean,1.633578,65.96273,8.616975
std,1.019868,66.913893,4.544983
min,0.0,1.0,1.0
25%,1.0,21.0,6.0
50%,2.0,46.0,8.0
75%,3.0,86.0,10.0
max,3.0,2165.0,119.0


# Bert imports

In [12]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
bert_model_name = 'bert_multi_cased_L-12_H-768_A-12'
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base': 'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small': 'https://tfhub.dev/google/electra_small/2',
    'electra_base': 'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed': 'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books': 'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base': 'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base': 'https://tfhub.dev/tensorflow/albert_en_preprocess/2',
    'electra_small': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


# Modeling functions

In [13]:
def load_data(Xy_train, X_test, column):

    Xy_train_column = Xy_train.dropna(subset=[column], inplace=False)
    X_test_column = X_test.dropna(subset=[column], inplace=False)
    X_test_column = X_test_column[column]
    X_train_column = Xy_train_column[column]
    y_train_column = Xy_train_column['class']
    y_train_column_cat = tf.keras.utils.to_categorical(y_train_column)

    return X_train_column, y_train_column_cat, X_test_column

In [14]:
def train_model(X_train, y_train_cat, tfhub_handle_encoder, tfhub_handle_preprocess, model_name, seq_length=172, epochs=15):

    def build_classifier_model(seq_length):
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
        encoder_inputs = preprocessing_layer(text_input)
        encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
        outputs = encoder(encoder_inputs)
        net = outputs['pooled_output']
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(4, activation=tf.keras.activations.softmax, name='classifier')(net)
        return tf.keras.Model(text_input, net)

    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
    bert_model = hub.KerasLayer(tfhub_handle_encoder)

    classifier_model = build_classifier_model(seq_length)
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    metrics = tfa.metrics.F1Score(num_classes=4)

    steps_per_epoch = len(X_train)
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1 * num_train_steps)

    init_lr = 3e-5
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            optimizer_type='adamw')
    
    classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)
    
    checkpoint = ModelCheckpoint(model_name + ".hdf5", monitor='val_f1_score', verbose=1,
                                 save_best_only=True, mode='auto', period=1)

    print(f'Training model with {tfhub_handle_encoder}')
    history = classifier_model.fit(X_train, y_train_cat, 32, epochs=epochs, validation_split=0.20)

    return classifier_model, optimizer

In [15]:
def predict_and_save(classifier_model, X_test, filename_submission, filename_unpack, filename_model, labels):

    y_pred_cat = classifier_model.predict(
        X_test, batch_size=None, verbose=1, steps=None, callbacks=None, max_queue_size=10,
        workers=1, use_multiprocessing=False)
    
    y_pred = np.argmax(y_pred_cat, axis=1)

    y_pred_submission = pd.DataFrame(y_pred, index=X_test.index.values, columns=['class'])
    y_pred_submission.index.rename('id', inplace=True)
    y_pred_submission.to_csv(os.path.join('..', 'data', filename_submission))

    y_pred_cat_df = pd.DataFrame(y_pred_cat, index=X_test.index.values, columns=labels)
    y_pred_cat_df.index.rename('id', inplace=True)
    y_pred_cat_df.to_csv(os.path.join('..', 'data', filename_unpack))

    classifier_model.save(os.path.join('..', 'saved_models', 'voting_classifier', filename_model))

# Execution

## Subset

In [17]:
cleaned_df.to_csv(os.path.join('..', 'data', 'all', '400k.csv'), sep='#')

In [24]:
two_or_more = cleaned_df[cleaned_df['len_title'] >= 2]
two_or_more.drop_duplicates(subset=['title'], inplace=True)
subset = two_or_more.groupby(by=['class']).sample(n=10000)
subset.shape
subset.head(100)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,class,title,description,len_description,len_title
50734,0,MightySkins Protective Skin Decal Wrap Cover f...,Each Xbox 360 Skin kit is printed with super-h...,93,19
87123,0,Extra Long 10 Feet Gold plated USB Play and ch...,Misplaced your USB cable or just want an extra...,30,24
51830,0,Playstation 2 Online Start-up Disc 4.0,WORKS GREAT,2,6
50261,0,Ultimate Pinball Extreme,Includes 20 Fantastic Full-Tilt Tables!Product...,209,3
40723,0,Diner Dash - Sony PSP,"In Diner Dash, you play the restaurant entrepr...",38,5
...,...,...,...,...,...
56680,0,Sonic Generations - Xbox 360,Set across three defining eras from 20 years o...,52,5
922,0,Thrustmaster T-Flight HOTAS One Ace Combat 7 E...,Official flight stick for Ace combat open skie...,11,10
77627,0,Skull Stripes Purple - Holiday Bundle Decal St...,WraptoSkinzTM skins are superb photo quality d...,77,21
50766,0,Halo Special Edition Xbox Game System [Xbox],"Includes 1 controller, AV cable, power cord.",7,7


In [25]:
subset_random = subset.sample(frac=1, random_state=42)

In [27]:
subset_random.head()

Unnamed: 0,class,title,description,len_description,len_title
175870,3,Gefen USB Hub,4 Port USB 2.0 Hub EXT-USB-144 Why not buy thi...,16,3
235917,1,Pet 'n Shape 3 Pack Chik 'n Skewers (12 oz),"Chik n Skewers are tasty, long lasting and als...",79,10
379489,2,2 Piece 4&#39; Slat 1-13/16&quot; Grey S.S. C...,This durable fiberglass bow lasts longer than ...,25,8
49065,0,Wii Riiser Aerobic Step for the Wii Fit Balanc...,The Riiser is a stand that you put under your ...,55,10
328066,2,TheraBand Professional Non-Latex Resistance Ba...,TheraBand Professional Non-Latex Resistance Ba...,73,28


## Title

In [28]:
# X_train_title, y_train_title_cat, X_test_title = load_data(subset_random, X_test, 'title')
# X_train_title.shape 

In [29]:
# classifier_model_title, optimizer_title = train_model(X_train_title, y_train_title_cat, tfhub_handle_encoder, tfhub_handle_preprocess, 'title_big_data_checkpoint', epochs=5)

In [30]:
# predict_and_save(classifier_model_title, X_test_title, 'title_big_data.csv', 'title_softmax_big_data.csv', 'title_big_data', labels=['title_0', 'title_1', 'title_2', 'title_3'])

## Description

In [31]:
X_train_desc, y_train_desc_cat, X_test_desc = load_data(subset_random, X_test, 'description')
X_train_desc.head()

175870    4 Port USB 2.0 Hub EXT-USB-144 Why not buy thi...
235917    Chik n Skewers are tasty, long lasting and als...
379489    This durable fiberglass bow lasts longer than ...
49065     The Riiser is a stand that you put under your ...
328066    TheraBand Professional Non-Latex Resistance Ba...
Name: description, dtype: object

In [32]:
classifier_model_desc, optimizer_description = train_model(X_train_desc, y_train_desc_cat, tfhub_handle_encoder, tfhub_handle_preprocess, epochs=5, model_name='description_big_data_checkpoint')

Training model with https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
predict_and_save(classifier_model_desc, X_test_desc, 'desc_big_data.csv', 'desc_softmax_big_data.csv', 'description_big_data', labels=['desc_0', 'desc_1', 'desc_2', 'desc_3'])





## Concat

In [None]:
cleaned_df['concatenation'] = cleaned_df['title'] + cleaned_df['description']
X_test['concatenation'] = X_test['title'] + X_test['description']

X_train_concat, y_train_concat_cat, X_test_concat = load_data(cleaned_df, X_test, 'concatenation')
classifier_model_concat, optimizer_concat = train_model(X_train_concat, y_train_concat_cat, tfhub_handle_encoder, tfhub_handle_preprocess, epochs=10, model_name='checkpoint_big_data_concat')
predict_and_save(classifier_model_concat, X_test_concat, 'concat_big_data.csv', 'concat_softmax_big_data.csv', 'concat_big_data', labels=['concat_0', 'concat_1', 'concat_2', 'concat_3'])

## VotingClassifier

In [None]:
# from sklearn.ensemble import VotingClassifier

# print(f"Loading Title Model")
# # title = tf.keras.models.load_model(os.path.join('..', 'saved_models', 'voting_classifier', 'title_big_bert'), 
#                                   #  custom_objects={'AdamWeightDecay': optimizer_title})
# print(f"Loading Description Model")
# description = tf.keras.models.load_model(os.path.join('..', 'saved_models', 'voting_classifier', 'description_big_bert'), 
#                                          custom_objects={'AdamWeightDecay': optimizer_description})
# print(f"Loading Concat Model")
# concat = tf.keras.models.load_model(os.path.join('..', 'saved_models', 'voting_classifier', 'concat_big_bert'), 
#                                     custom_objects={'AdamWeightDecay': optimizer_concat})

In [None]:
# def get_voting_classifier_df(models, X_test, names):

#     y_pred_cat = []
#     for idx, model in enumerate(models):
#         y_pred_cat = model.predict(X_test, batch_size=None, verbose=1, steps=None, 
#                                    callbacks=None, max_queue_size=10,
#                                    workers=1, use_multiprocessing=False)
#         labels = [names[idx] + '_' + str(i) for i in range(len(y_pred[0]))]
#         print(labels)
#         y_pred_cat_df = pd.DataFrame(y_pred_cat, index=X_test.index.values, columns=labels)
#         y_pred_cat_df.index.rename('id', inplace=True)
#         y_pred_cat.append(y_pred_cat_df)
#     return pd.concat(y_pred_cat, axis=1)

# def train_voting_classifier(X_train, y_train, model_name, epochs):

#     def build_voting_classifier_model():
#         inputs = tf.keras.Input(shape=(12,))
#         x = tf.keras.layers.Dense(8, activation=tf.nn.relu)(inputs)
#         outputs = tf.keras.layers.Dense(4, activation=tf.nn.softmax)(x)
#         model = tf.keras.Model(inputs=inputs, outputs=outputs)
#         return model
    
#     voting_classifier_model = build_voting_classifier_model()
#     loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
#     metrics = tfa.metrics.F1Score(num_classes=4)
  
#     voting_classifier_model.compile(optimizer=optimizer='adam',
#                          loss=loss,
#                          metrics=metrics)
    
#     checkpoint = ModelCheckpoint(model_name + ".hdf5", monitor='val_f1_score', verbose=1,
#                                  save_best_only=True, mode='auto', period=1)

#     history = voting_classifier_model.fit(X_train, y_train_cat, 32, epochs=epochs, validation_split=0.20)

#     return voting_classifier_model


# voting_df = get_voting_classifier_df([title, description, concat], X_train, ['title', 'description', 'concat'])
# voting_df.head()

## 128next tokens

In [None]:
# Xy_train['description'].fillna(Xy_train['title'], inplace=True)
# Xy_train['len_description'] = Xy_train['description'].apply(lambda x: len(x.split(' ')))

# X_test['description'].fillna(X_test['title'], inplace=True)
# X_test['len_description'] = X_test['description'].apply(lambda x: len(x.split(' ')))

In [None]:
# def fill_next(description, len):

#     if len <= 128:
#         return description
#     elif len > 128 and len <= 256:
#         desc_list = description.split(' ')
#         # print(desc_list)
#         desc_list = desc_list[-128:]
#         # print(desc_list)
#         return ' '.join(desc_list)
#     else:
#         desc_list = description.split(' ')[128:256]
#         return ' '.join(desc_list)

# Xy_train['next'] = Xy_train.apply(lambda x: fill_next(x['description'], x['len_description']), axis=1)
# X_test['next'] = X_test.apply(lambda x: fill_next(x['description'], x['len_description']), axis=1)

In [None]:
# X_train_next, y_train_next_cat, X_test_next = load_data(Xy_train, X_test, 'next')
# classifier_model_next = train_model(X_train_next, y_train_next_cat, tfhub_handle_encoder, tfhub_handle_preprocess, epochs=5)
# predict_and_save(classifier_model_next, X_test_next, 'next_submission6.csv', 'next_softmax6.csv', 'next_big_bert1', labels=['next_0', 'next_1', 'next_2', 'next_3'])