In [None]:
!pip install tensorflow_text

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [None]:
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # shuffling since train-test-val does not
df.head()

In [4]:
df = df[(df.category != 'Pictures')] # small categories
df = df[(df.category != 'Software')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69983 entries, 0 to 70293
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         69983 non-null  object 
 1   files         69983 non-null  object 
 2   description   69983 non-null  object 
 3   category      69983 non-null  object 
 4   fileAmount    69983 non-null  int64  
 5   more100Files  69983 non-null  int64  
 6   fileSize      69983 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.3+ MB


In [5]:
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])

In [6]:
print(len(train), len(val), len(test), len(df))

55986 6998 6999 69983


In [7]:
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(df['category']), y=df['category']))

In [8]:
weights = {}
for index, w in enumerate(class_weights):
  weights[index] = w
weights 

{0: 0.3540503075926825,
 1: 5.833861287095699,
 2: 1.5651950259438183,
 3: 2.7379890453834115}

In [9]:
np.unique(df['category'])

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
  df = dataframe.copy()
  labels = df.pop('category')
  df = {key: value[:,tf.newaxis] for key, value in df.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [11]:
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)

  df = {key: value[:,tf.newaxis] for key, value in df.items()}


In [12]:
train['category'].value_counts()

Anime          39582
Literature      8890
Live Action     5136
Audio           2378
Name: category, dtype: int64

In [13]:
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys = tf.constant(catVals),
        values = list(range(len(catVals)))
    ),
    default_value = -1,
    name = "target_encoding"
)

In [14]:
catVals

array(['Anime', 'Audio', 'Literature', 'Live Action'], dtype=object)

In [15]:
@tf.function
def target(x):
  return table.lookup(x)

In [16]:
def fetch(features, labels):
  return features, tf.one_hot(target(labels), len(catVals))

In [17]:
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)

In [18]:
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)



In [19]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  if dtype == 'string':
    index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
  else:
    index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

  feature_ds = dataset.map(lambda x, y: x[name])
  index.adapt(feature_ds)
  encoder = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  return lambda feature: encoder(index(feature))

In [20]:
def get_normalization_layer(name, dataset):
  normalizer = tf.keras.layers.Normalization(axis=None)
  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)
  return normalizer

In [21]:
all_inputs = []
encoded_features = []

In [22]:
for header in ['fileSize']: # fileAmount
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_data_f)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# for header in ['more100Files']:
#   num_cat_col = tf.keras.Input(shape=(1,), name=header)
#   categ_layer = get_category_encoding_layer(header, train_data_f, "int64")
#   encoded_categ_col = categ_layer(num_cat_col)
#   all_inputs.append(num_cat_col)
#  encoded_features.append(encoded_categ_col)

In [23]:
for header in ['title', 'description', 'files']:
  text_col = tf.keras.Input(shape=(), name=header, dtype='string')
  encoded_text_col = hub_layer(text_col)
  all_inputs.append(text_col)
  encoded_features.append(encoded_text_col)

In [24]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
for i in range(1):
  x = tf.keras.layers.Dense(16, activation='relu')(x)
  # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)

In [25]:
model = tf.keras.Model(all_inputs, output)

In [26]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.CategoricalCrossentropy(), # deleted from_logits=True
              metrics=["categorical_accuracy"])

In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 fileSize (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 title (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 description (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 files (InputLayer)             [(None,)]            0           []                               
                                                                                              

eneiniearoeshntae im hereeee


In [28]:
history = model.fit(train_data_f, epochs=5, validation_data=validation_data_f, class_weight=weights)

Epoch 1/5


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
model.evaluate(test_data_f)



[0.03183261305093765, 0.9911416172981262]

In [31]:
model.evaluate(validation_data_f)



[0.030281998217105865, 0.9919977188110352]

In [36]:
model.save("trainedModel_balanced_nosub_extra_maincat")



In [29]:
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = np.concatenate([y for x, y in test_data_f], axis=0)
test_predicted = model.predict(test_x)



In [32]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [33]:
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      4911
           1       0.10      0.10      0.10       310
           2       0.24      0.24      0.24      1166
           3       0.15      0.16      0.16       612

    accuracy                           0.56      6999
   macro avg       0.30      0.31      0.30      6999
weighted avg       0.57      0.56      0.56      6999



In [34]:
print(tf.math.confusion_matrix(test_y.argmax(1), test_predicted.argmax(1)))

tf.Tensor(
[[3536  204  736  435]
 [ 215   32   42   21]
 [ 750   54  279   83]
 [ 384   22  109   97]], shape=(4, 4), dtype=int32)


In [37]:
!zip -r /content/trainedModel_balanced_nosub_extra_maincat.zip /content/trainedModel_balanced_nosub_extra_maincat

  adding: content/trainedModel_balanced_nosub_extra_maincat/ (stored 0%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/saved_model.pb (deflated 70%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/variables/ (stored 0%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/variables/variables.data-00000-of-00001 (deflated 40%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/variables/variables.index (deflated 80%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/keras_metadata.pb (deflated 88%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/fingerprint.pb (stored 0%)
  adding: content/trainedModel_balanced_nosub_extra_maincat/assets/ (stored 0%)


In [38]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
!mv "/content/trainedModel_balanced_nosub_extra_maincat.zip" "/content/drive/My Drive/trainedModel_balanced_nosub_extra_maincat.zip"