<a href="https://colab.research.google.com/github/le-incroyable1-dev/tinker_with_Tensorflow/blob/main/stackOverflow_QuestionTagClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

In [25]:
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset = tf.keras.utils.get_file("stack_overflow_16k", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

print(dataset)

dataset_dir = os.path.join(os.path.dirname(dataset), './')

./stack_overflow_16k


In [26]:
os.listdir(dataset_dir)

['.config',
 'train',
 'stack_overflow_16k.tar.gz',
 'test',
 'README.md',
 'sample_data']

In [27]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['java', 'javascript', 'python', 'csharp']

In [28]:
csharp_dir = os.path.join(train_dir, 'csharp')

In [29]:
#checking a sample csharp entry

sample_file = os.path.join(csharp_dir, '90.txt')
with open(sample_file) as f:
  print(f.read())

"the type `t' must be convertible in order to use it as parameter `t' in the generic type or method i have these two main classes. first the fsmsystem class:..public class fsmsystem&lt;t&gt; : monobehaviour where t : fsmsystem&lt;t&gt;.{.    private t m_owner = default(t);..    protected fsmstate&lt;t&gt; currentstate;..    private dictionary&lt;int, fsmstate&lt;t&gt;&gt; m_states;..    public fsmsystem(t owner).    {.        m_owner = gameobject.findobjectoftype(typeof(t)) as t; //owner;.        m_states = new dictionary&lt;int, fsmstate&lt;t&gt;&gt;();.    }..    protected void addstate( fsmstate&lt;t&gt; state ).    {.        m_states.add( state.getstateid(), state );.    }.}...and the second class, fsmstate:..public abstract class fsmstate&lt;t&gt;.{   .    public abstract int getstateid();..    public abstract void onenter (fsmsystem&lt;t&gt; fsm, fsmstate&lt;t&gt; prevstate);.    public abstract void onupdate (fsmsystem&lt;t&gt; fsm);.    public abstract void onexit (fsmsystem&lt

In [30]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'train', 
    batch_size=batch_size, 
    validation_split=0.25, 
    subset='training', 
    seed=seed)

Found 8000 files belonging to 4 classes.
Using 6000 files for training.


In [31]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'train', 
    batch_size=batch_size, 
    validation_split=0.25, 
    subset='validation', 
    seed=seed)

Found 8000 files belonging to 4 classes.
Using 2000 files for validation.


In [21]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'test', 
    batch_size=batch_size)

Found 8000 files belonging to 4 classes.


In [None]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [36]:
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [37]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)


In [38]:
raw_train_ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [39]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [42]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [43]:
embedding_dim = 16

model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(4)])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 4)                 68        
                                                                 
Total params: 160,084
Trainable params: 160,084
Non-trainable params: 0
__________________________________________________

In [47]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

In [49]:
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.5659465789794922
Accuracy:  0.7919999957084656


In [52]:
#export the model

export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

  return dispatch_target(*args, **kwargs)


0.7919999957084656


In [55]:
#test the model on some random samples!

examples = [
  "Does print statement in java effect any variables (without using increment)?",
  "Reading data with Custom HID in C#",
  "is javascript object oriented?"
]

print(['java', 'javascript', 'python', 'csharp'])
export_model.predict(examples)

['java', 'javascript', 'python', 'csharp']


array([[0.46026048, 0.45788324, 0.5680035 , 0.5561012 ],
       [0.57538766, 0.45803604, 0.53347605, 0.4392433 ],
       [0.5604208 , 0.43794104, 0.6412549 , 0.37614554]], dtype=float32)