In [None]:
!pip install tensorflow_text

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [2]:
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # shuffling since train-test-val does not
df.head()

Unnamed: 0,title,files,description,category,fileAmount,more100Files,fileSize
0,Serial Justice {2021} {Digital} {YameteOnii-sama},Serial Justice 000 (2021) (Digital) (YameteOni...,\n #### **Serial Justice** ...,Literature,36,0,20.083931
1,クローズアップ現代＋▽緊迫ルポ・アフガニスタン国外脱出▽人道危機・飢餓が迫る今,クローズアップ現代＋▽緊迫ルポ・アフガニスタン国外脱出▽人道危機・飢餓が迫る今.mp4,\n クローズアップ現代＋▽緊迫ルポ・アフガニスタン国...,Live Action,1,0,19.363794
2,小さな旅「木を伐（き）る民～奈良県\u3000吉野～」,小さな旅「木を伐（き）る民～奈良県\u3000吉野～」.mp4,\n 小さな旅「木を伐（き）る民～奈良県\u3000吉...,Live Action,1,0,19.667175
3,[Erai-raws] Digimon Ghost Game - 12 [1080p][Mu...,[Erai-raws] Digimon Ghost Game - 12 [1080p][Mu...,\n #### **To know which sub...,Anime,1,0,21.05678
4,僕とロボコ 第01-11巻 [Boku to Roboko vol 01-11],DLRAW.NET-Boku to Roboko vol 01-11.rar,\n More info; https://dlraw...,Literature,1,0,21.130888


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70294 entries, 0 to 70293
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         70294 non-null  object 
 1   files         70294 non-null  object 
 2   description   70294 non-null  object 
 3   category      70294 non-null  object 
 4   fileAmount    70294 non-null  int64  
 5   more100Files  70294 non-null  int64  
 6   fileSize      70294 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 3.8+ MB


In [4]:
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])

In [5]:
print(len(train), len(val), len(test), len(df))

56235 7029 7030 70294


In [6]:
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(df['category']), y=df['category']))

In [7]:
weights = {}
for index, w in enumerate(class_weights):
  weights[index] = w
weights 

{0: 0.23708245642436998,
 1: 3.9065243970212293,
 2: 1.0481004353790182,
 3: 1.833437663015128,
 4: 109.49221183800623,
 5: 57.4297385620915}

In [8]:
np.unique(df['category'])

array(['Anime', 'Audio', 'Literature', 'Live Action', 'Pictures',
       'Software'], dtype=object)

In [9]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  df = dataframe.copy()
  labels = df.pop('category')
  df = {key: value[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [10]:
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)

  df = {key: value[:,tf.newaxis] for key, value in df.items()}


In [11]:
train['category'].value_counts()

Anime          39562
Literature      8949
Live Action     5111
Audio           2376
Software         153
Pictures          84
Name: category, dtype: int64

In [12]:
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys = tf.constant(catVals),
        values = list(range(len(catVals)))
    ),
    default_value = -1,
    name = "target_encoding"
)

In [13]:
catVals

array(['Anime', 'Audio', 'Literature', 'Live Action', 'Pictures',
       'Software'], dtype=object)

In [14]:
@tf.function
def target(x):
  return table.lookup(x)

In [15]:
def fetch(features, labels):
  return features, tf.one_hot(target(labels), len(catVals))

In [16]:
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)

In [18]:
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)



In [19]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [20]:
def get_normalization_layer(name, dataset):
  normalizer = tf.keras.layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

In [21]:
all_inputs = []
encoded_features = []

In [22]:
# Numerical features.
for header in ['fileSize']: # fileAmount
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_data_f)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [23]:
# for header in ['more100Files']:
#   num_cat_col = tf.keras.Input(shape=(1,), name=header)
#   categ_layer = get_category_encoding_layer(header, train_data_f, "int64")
#   encoded_categ_col = categ_layer(num_cat_col)
#   all_inputs.append(num_cat_col)
#  encoded_features.append(encoded_categ_col)

In [24]:
for header in ['title', 'description', 'files']:
  text_col = tf.keras.Input(shape=(), name=header, dtype='string')
  encoded_text_col = hub_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [25]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
for i in range(3):
  x = tf.keras.layers.Dense(16, activation='relu')(x)
  # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)

In [26]:
model = tf.keras.Model(all_inputs, output)

In [27]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [28]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 fileSize (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 title (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 description (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 files (InputLayer)             [(None,)]            0           []                               
                                                                                              

In [29]:
history = model.fit(train_data_f, epochs=10, validation_data=validation_data_f, class_weight=weights)

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)
  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
model.evaluate(test_data_f)



[0.055727794766426086, 0.9870554804801941]

In [32]:
model.evaluate(validation_data_f)



[0.05321845784783363, 0.9840660095214844]

In [37]:
model.save("trainedModel_balanced_nosub_nomax100_nofileamount")



In [33]:
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = np.concatenate([y for x, y in test_data_f], axis=0)
test_predicted = model.predict(test_x)

  inputs = self._flatten_to_reference_inputs(inputs)




In [34]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [35]:
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))

              precision    recall  f1-score   support

           0       0.73      0.72      0.73      4926
           1       0.11      0.11      0.11       315
           2       0.23      0.23      0.23      1097
           3       0.18      0.19      0.18       648
           4       0.15      0.19      0.17        16
           5       0.07      0.07      0.07        28

    accuracy                           0.57      7030
   macro avg       0.24      0.25      0.25      7030
weighted avg       0.57      0.57      0.57      7030



In [36]:
print(tf.math.confusion_matrix(np.array(test_y).argmax(1), test_predicted.argmax(1)))

tf.Tensor(
[[3561  199  693  442   14   17]
 [ 201   35   50   25    1    3]
 [ 687   55  253   97    1    4]
 [ 410   23   89  121    1    4]
 [  10    0    1    2    3    0]
 [  17    1    6    2    0    2]], shape=(6, 6), dtype=int32)


In [38]:
!zip -r /content/trainedModel_balanced_nosub_nomax100_nofileamount.zip /content/trainedModel_balanced_nosub_nomax100_nofileamount

  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/ (stored 0%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/saved_model.pb (deflated 70%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/variables/ (stored 0%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/variables/variables.data-00000-of-00001 (deflated 40%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/variables/variables.index (deflated 80%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/keras_metadata.pb (deflated 90%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/fingerprint.pb (stored 0%)
  adding: content/trainedModel_balanced_nosub_nomax100_nofileamount/assets/ (stored 0%)


In [39]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
!mv "/content/trainedModel_balanced_nosub_nomax100_nofileamount.zip" "/content/drive/My Drive/trainedModel_balanced_nosub_nomax100_nofileamount.zip"