In [None]:
!pip install tensorflow_text

In [None]:
!pip install tensorflow_addons

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

In [11]:
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # shuffling since train-test-val does not
df.head()

Unnamed: 0,title,files,description,category,fileAmount,more100Files,fileSize
0,Fire Emblem - Engage OST (2023).zip,Fire Emblem - Engage OST (2023).zip,\n Credit to NintendoMelody...,Audio,1,0,19.713708
1,[JAM_CLUB]_Mob_Psycho_100_3rd_Season_11_[1080p...,[JAM_CLUB]_Mob_Psycho_100_3rd_Season_11_[1080p...,\n #### No description.\n ...,Anime,1,0,20.670658
2,[FS Pavilion][斗破苍穹 第5季] Battle Through The Hea...,[FS Pavilion][斗破苍穹 第5季][Battle Through The Hea...,\n **English subs by Fallin...,Anime,1,0,20.000671
3,[Naruto-Kun.Hu] Kage no Jitsuryokusha ni Narit...,[Naruto-Kun.Hu] Kage no Jitsuryokusha ni Narit...,\n #### No description.\n ...,Anime,1,0,19.316126
4,Yakuza Fiancé - Raise wa Tanin ga Ii v01-02 (2...,Yakuza Fiancé - Raise wa Tanin ga Ii v01 (2022...,\n **Yakuza Fiancé - Raise ...,Literature,2,0,20.529137


In [12]:
df = df[(df.category != 'Pictures')] # small categories
df = df[(df.category != 'Software')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69983 entries, 0 to 70293
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         69983 non-null  object 
 1   files         69983 non-null  object 
 2   description   69983 non-null  object 
 3   category      69983 non-null  object 
 4   fileAmount    69983 non-null  int64  
 5   more100Files  69983 non-null  int64  
 6   fileSize      69983 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.3+ MB


In [13]:
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])

In [14]:
print(len(train), len(val), len(test), len(df))

55986 6998 6999 69983


In [15]:
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(df['category']), y=df['category']))

In [16]:
weights = {}
for index, w in enumerate(class_weights):
  weights[index] = w
weights 

{0: 0.3540503075926825,
 1: 5.833861287095699,
 2: 1.5651950259438183,
 3: 2.7379890453834115}

In [17]:
np.unique(df['category'])

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [18]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  df = dataframe.copy()
  labels = df.pop('category')
  df = {key: value[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [19]:
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)

  df = {key: value[:,tf.newaxis] for key, value in df.items()}


In [20]:
train['category'].value_counts()

Anime          39588
Literature      8879
Live Action     5162
Audio           2357
Name: category, dtype: int64

In [21]:
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys = tf.constant(catVals),
        values = list(range(len(catVals)))
    ),
    default_value = -1,
    name = "target_encoding"
)

In [22]:
catVals

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [23]:
@tf.function
def target(x):
  return table.lookup(x)

In [24]:
def fetch(features, labels):
  return features, tf.one_hot(target(labels), len(catVals))

In [25]:
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)

In [26]:
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)



In [27]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [28]:
def get_normalization_layer(name, dataset):
  normalizer = tf.keras.layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

In [29]:
all_inputs = []
encoded_features = []

In [30]:
for header in ['fileSize']: # fileAmount
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_data_f)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# for header in ['more100Files']:
#   num_cat_col = tf.keras.Input(shape=(1,), name=header)
#   categ_layer = get_category_encoding_layer(header, train_data_f, "int64")
#   encoded_categ_col = categ_layer(num_cat_col)
#   all_inputs.append(num_cat_col)
#  encoded_features.append(encoded_categ_col)

In [31]:
for header in ['title', 'description', 'files']:
  text_col = tf.keras.Input(shape=(), name=header, dtype='string')
  encoded_text_col = hub_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [35]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
x = tf.keras.layers.Reshape(target_shape=(1537, 1))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16))(x)
for units in [128, 64]:
  x = tf.keras.layers.Dense(units, activation='relu')(x)
  # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)

In [36]:
model = tf.keras.Model(all_inputs, output)

In [37]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.CategoricalCrossentropy(), # deleted from_logits=True
              metrics=["categorical_accuracy",
                        tf.keras.metrics.Precision(),
                        tf.keras.metrics.Recall(),
                        tfa.metrics.F1Score(num_classes=len(catVals),
                                            average='macro',
                                            threshold=0.5)])

In [38]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 fileSize (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 title (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 description (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 files (InputLayer)             [(None,)]            0           []                               
                                                                                              

In [39]:
history = model.fit(train_data_f, epochs=5, validation_data=validation_data_f) # removed class weights

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
model.evaluate(test_data_f)



[0.05197759345173836,
 0.9859979748725891,
 0.9868345856666565,
 0.985283613204956,
 0.9677801728248596]

In [41]:
model.evaluate(validation_data_f)



[0.047415804117918015,
 0.9874249696731567,
 0.9882655739784241,
 0.9868533611297607,
 0.9706678986549377]

In [43]:
model.save("trainedModel_LSTM")



In [44]:
# test_x = test_data_f.unbatch().map(lambda x, y: x)
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = test_data_f.unbatch().map(lambda x, y: y)

In [45]:
test_predicted = model.predict(test_x)

  inputs = self._flatten_to_reference_inputs(inputs)




In [46]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [47]:
test_y = np.array([x for x in test_y])

In [48]:
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))

              precision    recall  f1-score   support

           0       0.72      0.73      0.73      4850
           1       0.12      0.12      0.12       329
           2       0.26      0.26      0.26      1199
           3       0.16      0.15      0.15       621

    accuracy                           0.57      6999
   macro avg       0.32      0.31      0.32      6999
weighted avg       0.57      0.57      0.57      6999



In [49]:
print(tf.math.confusion_matrix(test_y.argmax(1), test_predicted.argmax(1)))

tf.Tensor(
[[3535  208  746  361]
 [ 206   40   53   30]
 [ 736   49  310  104]
 [ 402   34   92   93]], shape=(4, 4), dtype=int32)
