In [None]:
!pip install tensorflow_text

In [None]:
!pip install tensorflow_addons

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

In [None]:
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # shuffling since train-test-val does not
df.head()

Unnamed: 0,title,files,description,category,fileAmount,more100Files,fileSize
0,[ADF] Love Live! Nijigasaki Gakuen School Idol...,[ADF] Love Live! Nijigasaki Gakuen School Idol...,"\n Nijigaku T2 terminó, per...",Anime,1,0,21.05678
1,1 comment,Kodomo no Omocha - E051 - English Dub Commenta...,\n Episodes 33-51 of the du...,Anime,20,0,22.423656
2,浅草鬼嫁日記 あやかし夫婦は君の名前をまだ知らない。 第01-02巻 [Asakusa On...,DLRAW.NET-Asakusa Oniyome Nikki Ayakashi v01-0...,\n More info; https://dlraw...,Literature,1,0,19.296666
3,[Tsundere-Raws] Akuyaku Reijou nano de Last Bo...,[Tsundere-Raws] Akuyaku Reijou nano de Last Bo...,\n ## **Akuyaku Reijou nano...,Anime,1,0,21.05678
4,[Lilith-Raws] 給不滅的你 / Fumetsu no Anata e S02 -...,[Lilith-Raws] Fumetsu no Anata e S02 - 04 [Bah...,\n Telegram Update Notifica...,Anime,1,0,20.632356


In [None]:
# df = df[(df.category != 'Pictures')] # small categories
# df = df[(df.category != 'Software')]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70294 entries, 0 to 70293
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         70294 non-null  object 
 1   files         70294 non-null  object 
 2   description   70294 non-null  object 
 3   category      70294 non-null  object 
 4   fileAmount    70294 non-null  int64  
 5   more100Files  70294 non-null  int64  
 6   fileSize      70294 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 3.8+ MB


In [None]:
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])

In [None]:
print(len(train), len(val), len(test), len(df))

56235 7029 7030 70294


In [None]:
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(df['category']), y=df['category']))

In [None]:
weights = {}
for index, w in enumerate(class_weights):
  weights[index] = w
weights 

{0: 0.23708245642436998,
 1: 3.9065243970212293,
 2: 1.0481004353790182,
 3: 1.833437663015128,
 4: 109.49221183800623,
 5: 57.4297385620915}

In [None]:
np.unique(df['category'])

array(['Anime', 'Audio', 'Literature', 'Live Action', 'Pictures',
       'Software'], dtype=object)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  df = dataframe.copy()
  labels = df.pop('category')
  df = {key: value[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [None]:
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)

  df = {key: value[:,tf.newaxis] for key, value in df.items()}


In [None]:
train['category'].value_counts()

Anime          39505
Literature      9029
Live Action     5051
Audio           2394
Software         170
Pictures          86
Name: category, dtype: int64

In [None]:
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys = tf.constant(catVals),
        values = list(range(len(catVals)))
    ),
    default_value = -1,
    name = "target_encoding"
)

In [None]:
catVals

array(['Anime', 'Audio', 'Literature', 'Live Action', 'Pictures',
       'Software'], dtype=object)

In [None]:
@tf.function
def target(x):
  return table.lookup(x)

In [None]:
def fetch(features, labels):
  return features, tf.one_hot(target(labels), len(catVals))

In [None]:
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)

In [None]:
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)



In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [None]:
def get_normalization_layer(name, dataset):
  normalizer = tf.keras.layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

In [None]:
all_inputs = []
encoded_features = []

In [None]:
for header in ['fileSize']: # fileAmount
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_data_f)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# for header in ['more100Files']:
#   num_cat_col = tf.keras.Input(shape=(1,), name=header)
#   categ_layer = get_category_encoding_layer(header, train_data_f, "int64")
#   encoded_categ_col = categ_layer(num_cat_col)
#   all_inputs.append(num_cat_col)
#  encoded_features.append(encoded_categ_col)

In [None]:
for header in ['title', 'description', 'files']:
  text_col = tf.keras.Input(shape=(), name=header, dtype='string')
  encoded_text_col = hub_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [None]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
for i in range(3):
  x = tf.keras.layers.Dense(16, activation='relu')(x)
  # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)

In [None]:
model = tf.keras.Model(all_inputs, output)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.CategoricalCrossentropy(), # deleted from_logits=True
              metrics=["categorical_accuracy",
                        tf.keras.metrics.Precision(),
                        tf.keras.metrics.Recall(),
                        tfa.metrics.F1Score(num_classes=len(catVals),
                                            average='macro',
                                            threshold=0.5)])

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 fileSize (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 title (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 description (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 files (InputLayer)             [(None,)]            0           []                               
                                                                                              

In [29]:
history = model.fit(train_data_f, epochs=5, validation_data=validation_data_f) # removed class weights

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
model.evaluate(test_data_f)



[0.04509061574935913,
 0.9903271794319153,
 0.9914371371269226,
 0.9881934523582458,
 0.7895412445068359]

In [31]:
model.evaluate(validation_data_f)



[0.04278787598013878,
 0.9908948540687561,
 0.9917320013046265,
 0.98975670337677,
 0.8411054611206055]

In [38]:
model.save("trainedModel_final_allmaincats")



In [32]:
# test_x = test_data_f.unbatch().map(lambda x, y: x)
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = test_data_f.unbatch().map(lambda x, y: y)

In [33]:
test_predicted = model.predict(test_x)



In [34]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [35]:
test_y = np.array([x for x in test_y])

In [36]:
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))

              precision    recall  f1-score   support

           0       0.73      0.74      0.73      4923
           1       0.13      0.13      0.13       303
           2       0.23      0.23      0.23      1090
           3       0.19      0.19      0.19       681
           4       0.08      0.08      0.08        13
           5       0.15      0.10      0.12        20

    accuracy                           0.57      7030
   macro avg       0.25      0.24      0.25      7030
weighted avg       0.57      0.57      0.57      7030



In [37]:
print(tf.math.confusion_matrix(test_y.argmax(1), test_predicted.argmax(1)))

tf.Tensor(
[[3628  179  687  417    5    7]
 [ 193   38   49   23    0    0]
 [ 703   52  247   81    4    3]
 [ 412   30  110  126    2    1]
 [  10    1    1    0    1    0]
 [  13    1    1    3    0    2]], shape=(6, 6), dtype=int32)
