In [None]:
!pip install tensorflow_text

In [None]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_addons
Successfully installed tensorflow_addons-0.19.0


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
import tensorflow_datasets as tfds

In [None]:
df = pd.read_csv("procData_nosub.csv")
# Shuffle the rows once up front; the fixed seed makes the row order
# reproducible on a fresh kernel (Restart & Run All).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Drop the two small categories in a single pass; the index is not reset.
df = df[~df.category.isin(['Pictures', 'Software'])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69983 entries, 0 to 70293
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         69983 non-null  object 
 1   files         69983 non-null  object 
 2   description   69983 non-null  object 
 3   category      69983 non-null  object 
 4   fileAmount    69983 non-null  int64  
 5   more100Files  69983 non-null  int64  
 6   fileSize      69983 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.3+ MB


In [None]:
# 80/10/10 train/validation/test split over a seeded shuffle of the rows.
# Plain positional slicing is used instead of np.split, which relies on a
# DataFrame code path that recent pandas versions deprecate.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(0.8 * len(shuffled_df))
val_end = int(0.9 * len(shuffled_df))
train = shuffled_df.iloc[:train_end]
val = shuffled_df.iloc[train_end:val_end]
test = shuffled_df.iloc[val_end:]

In [None]:
# Row counts of the three splits followed by the full frame.
print(*(len(part) for part in (train, val, test, df)))

55986 6998 6999 69983


In [None]:
from sklearn.utils import class_weight

# 'balanced' weights, one per unique category, computed over the whole frame
# (not only the training split). np.unique returns the categories sorted, so
# the weights follow that order.
class_weights = list(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(df['category']),
        y=df['category'],
    )
)

In [None]:
# Map class position (0, 1, 2, ...) to its weight.
weights = dict(enumerate(class_weights))
weights

{0: 0.3540503075926825,
 1: 5.833861287095699,
 2: 1.5651950259438183,
 3: 2.7379890453834115}

In [None]:
# Sorted unique category names remaining after the filtering above.
np.unique(df['category'])

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  """Convert a DataFrame into a batched tf.data.Dataset of (features, label).

  Args:
    dataframe: frame containing a 'category' column (used as the label) plus
      the feature columns. It is copied, so the caller's frame is not modified.
    shuffle: if True, shuffle the examples with a buffer covering every row.
    batch_size: number of examples per batch.

  Returns:
    A dataset yielding (dict of feature name -> batch, batch of labels).
    Every feature column is given a trailing axis of length 1.
  """
  frame = dataframe.copy()
  labels = frame.pop('category')
  # Go through NumPy before adding the axis: multi-dimensional indexing
  # directly on a pandas Series is deprecated and rejected by newer pandas.
  features = {name: column.to_numpy()[:, tf.newaxis] for name, column in frame.items()}
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # Size the buffer by the number of rows. Taking len() of the feature
    # dict would give the number of columns and leave the data almost unshuffled.
    ds = ds.shuffle(buffer_size=len(frame))
  ds = ds.batch(batch_size)
  # Let tf.data choose the prefetch depth instead of buffering batch_size batches.
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [None]:
train_data = df_to_dataset(train)
# Evaluation splits are not shuffled: a shuffled dataset is re-ordered on every
# pass, so features and labels read in separate passes would not line up.
test_data = df_to_dataset(test, shuffle=False)
validation_data = df_to_dataset(val, shuffle=False)

  df = {key: value[:,tf.newaxis] for key, value in df.items()}


In [None]:
# Class distribution inside the training split.
train['category'].value_counts()

Anime          39517
Literature      8971
Live Action     5103
Audio           2395
Name: category, dtype: int64

In [None]:
# Sorted unique category names; a name's position is its integer class id.
catVals = np.unique(df['category'])
# Static lookup from category name to class id; unknown names map to -1.
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(catVals),
        values=[position for position, _ in enumerate(catVals)],
    ),
    default_value=-1,
    name="target_encoding",
)

In [None]:
# Show the category names in class-id order.
catVals

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [None]:
@tf.function
def target(x):
  """Look up the integer class id for each category name in `x` via `table`."""
  class_ids = table.lookup(x)
  return class_ids

In [None]:
def fetch(features, labels):
  """Pass features through unchanged and one-hot encode the category labels.

  The one-hot depth is the number of entries in `catVals`.
  """
  class_ids = target(labels)
  one_hot_labels = tf.one_hot(class_ids, len(catVals))
  return features, one_hot_labels

In [None]:
# Apply the label encoding to each split.
train_data_f, test_data_f, validation_data_f = (
    split.map(fetch) for split in (train_data, test_data, validation_data)
)

In [None]:
# TF-Hub handle of the text encoder used for every text column.
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# String-input Keras layer wrapping the hub module; trainable=True requests
# that its weights be updated together with the rest of the model.
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    output_shape=512,
    dtype=tf.string,
    trainable=True,
)



In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that category-encodes the feature `name`.

  Args:
    name: key of the feature inside the dataset's feature dict.
    dataset: dataset of (features, label) pairs used to learn the vocabulary.
    dtype: 'string' selects a StringLookup; anything else an IntegerLookup.
    max_tokens: optional vocabulary size limit forwarded to the lookup layer.

  Returns:
    A function mapping a feature tensor to its CategoryEncoding output.
  """
  lookup_cls = (
      tf.keras.layers.StringLookup if dtype == 'string'
      else tf.keras.layers.IntegerLookup
  )
  index = lookup_cls(max_tokens=max_tokens)

  # Learn the vocabulary from this feature only.
  index.adapt(dataset.map(lambda x, y: x[name]))
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    return encoder(index(feature))

  return encode

In [None]:
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted to the feature `name` of `dataset`.

  `dataset` yields (features, label) pairs; only features[name] is used.
  """
  normalizer = tf.keras.layers.Normalization(axis=None)
  normalizer.adapt(dataset.map(lambda features, _label: features[name]))
  return normalizer

In [None]:
# Keras Input tensors and their encoded counterparts, filled by the cells below.
all_inputs, encoded_features = [], []

In [None]:
# Numeric columns: one scalar input each, normalised with statistics adapted
# on the training dataset.
for header in ['fileSize']: # fileAmount
  numeric_input = tf.keras.Input(shape=(1,), name=header)
  all_inputs.append(numeric_input)
  encoded_features.append(get_normalization_layer(header, train_data_f)(numeric_input))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# Disabled: categorical encoding of 'more100Files' via get_category_encoding_layer.
# for header in ['more100Files']:
#   num_cat_col = tf.keras.Input(shape=(1,), name=header)
#   categ_layer = get_category_encoding_layer(header, train_data_f, "int64")
#   encoded_categ_col = categ_layer(num_cat_col)
#   all_inputs.append(num_cat_col)
#   encoded_features.append(encoded_categ_col)

In [None]:
# Text columns: one string input each, all embedded by the same hub_layer instance.
for header in ['title', 'description', 'files']:
  text_input = tf.keras.Input(shape=(), name=header, dtype='string')
  all_inputs.append(text_input)
  encoded_features.append(hub_layer(text_input))

In [None]:
# Concatenate every encoded feature, then stack three 16-unit ReLU layers and
# a softmax output with one unit per category.
all_features = tf.keras.layers.Concatenate()(encoded_features)
x = all_features
for _ in range(3):
  hidden = tf.keras.layers.Dense(16, activation='relu')
  x = hidden(x)
  # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)

In [None]:
# Functional model from all feature inputs to the softmax output.
model = tf.keras.Model(inputs=all_inputs, outputs=output)

In [None]:
# The output layer already applies softmax, so the loss works on probabilities
# (from_logits is left at its default).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(), # deleted from_logits=True
    metrics=[
        "categorical_accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(
            num_classes=len(catVals),
            average='macro',
            threshold=0.5,
        ),
    ],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 fileSize (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 title (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 description (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 files (InputLayer)             [(None,)]            0           []                               
                                                                                              

In [None]:
# Train for five epochs, validating after each one; class weights are not applied.
history = model.fit(
    train_data_f,
    validation_data=validation_data_f,
    epochs=5,
) # removed class weights

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Test-set results, in compile order: loss, categorical accuracy, precision,
# recall, macro F1.
model.evaluate(test_data_f)



[0.04479888454079628,
 0.9907129406929016,
 0.9907129406929016,
 0.9907129406929016,
 0.9827419519424438]

In [None]:
# Validation-set results, in compile order: loss, categorical accuracy,
# precision, recall, macro F1.
model.evaluate(validation_data_f)



[0.03281523659825325,
 0.9922835230827332,
 0.9922835230827332,
 0.9922835230827332,
 0.9858689308166504]

In [None]:
# Persist the trained model under the relative path "trainedModel_final".
model.save("trainedModel_final")



In [None]:
# Build one unshuffled evaluation view of the test split and derive both the
# features and the labels from it. A shuffled dataset is re-ordered on every
# pass, so reading features and labels in separate passes would pair each
# prediction with the label of a different row.
test_eval_f = df_to_dataset(test, shuffle=False).map(fetch)
# Features stay batched, which is the layout the model is trained and evaluated on.
test_x = test_eval_f.map(lambda x, y: x)
# Labels are unbatched so they can be stacked into a single (rows, classes) array.
test_y = test_eval_f.unbatch().map(lambda x, y: y)

In [None]:
# Class probabilities for every test example (one row per example).
test_predicted = model.predict(test_x)



In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Pull every one-hot label out of the dataset into a single NumPy array.
test_y = np.array(list(test_y))

In [None]:
# Per-class precision/recall/F1; argmax turns one-hot labels and predicted
# probabilities into class ids.
print(
    classification_report(
        y_true=test_y.argmax(axis=1),
        y_pred=test_predicted.argmax(axis=1),
    )
)

              precision    recall  f1-score   support

           0       0.74      0.75      0.74      4992
           1       0.10      0.10      0.10       305
           2       0.21      0.21      0.21      1080
           3       0.18      0.17      0.18       622

    accuracy                           0.58      6999
   macro avg       0.31      0.31      0.31      6999
weighted avg       0.58      0.58      0.58      6999



In [None]:
# Rows are true class ids, columns are predicted class ids.
print(
    tf.math.confusion_matrix(
        labels=test_y.argmax(axis=1),
        predictions=test_predicted.argmax(axis=1),
    )
)

tf.Tensor(
[[3730  194  708  360]
 [ 208   31   45   21]
 [ 725   45  226   84]
 [ 388   30   99  105]], shape=(4, 4), dtype=int32)
