In [124]:
%matplotlib inline

In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import random
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TextVectorization, Input, Dense, Dropout, Embedding, GlobalAveragePooling1D

# StackOverflow Multi-class Classification

The dataset contains StackOverflow posts on 4 different programming languages.

In [126]:
# Single seed shared by every stochastic step in this notebook.
SEED = 42

# Seed Python's `random`, NumPy and TensorFlow in one call so that unseeded
# shuffles and weight initialisation are repeatable after a kernel restart.
tf.keras.utils.set_random_seed(SEED)

## Read data and create `tf.data.Dataset`s

In [127]:
# Build the dataset location from individual path segments so pathlib inserts
# the correct separator on every OS. A literal '\\' is a separator only on
# Windows; elsewhere it would become part of one long directory name.
dataset_path = Path.home() / 'Desktop' / 'datasets' / 'stackoverflow' / 'stack_overflow_16k'
train_path = dataset_path / 'train'
test_path = dataset_path / 'test'

In [128]:
# Carve the labelled training folder into an 80/20 train/validation split.
# subset='both' returns the two halves of the same seeded split in one call.
raw_train_ds, raw_val_ds = text_dataset_from_directory(
    directory=train_path,
    validation_split=0.2,
    subset='both',
    seed=SEED,
)

(raw_train_ds, raw_val_ds)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Using 1600 files for validation.


(<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)

In [129]:
# Load the held-out test folder in full (no validation split).
# The loader shuffles by default; passing the shared seed keeps the batch
# order reproducible, consistent with the train/validation loader.
raw_test_ds = text_dataset_from_directory(
    test_path,
    seed=SEED,
)
raw_test_ds

Found 8000 files belonging to 4 classes.


<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

## Explore datasets

### Readme

In [130]:
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which differs between machines.
with open(dataset_path / 'README.md', encoding='utf-8') as f:
    print(f.read())

This dataset is an extract from the public [Stack Overflow dataset](https://console.cloud.google.com/marketplace/details/stack-exchange/stack-overflow) for use as a tutorial on tensorflow.org. 

It contains the body of 16,000 posts on four languages (Java, Python, CSharp, and Javascript), which are equally divided into train and test. 

The keywords "Java", "Python", "CSharp" and "JavaScript" have been replaced in each post by the word "BLANK" in order to increase the difficulty of this dataset in classification examples.




### Sample post

Показване на случаен пост. Текстът е преди почистване. 

In [134]:
# Look at the first post of one training batch, exactly as stored on disk.
for posts_batch, labels_batch in raw_train_ds.take(1):
    post = posts_batch[0]
    # Labels are integer indices into the dataset's class_names list.
    label = raw_train_ds.class_names[labels_batch[0]]
    print(f"{label}\n{'=' * 80}\n{post.numpy()}")

python
b'"set blank to quit on exception? i\'m using blank 3..i\'ve been looking around for an answer to this, but i haven\'t found it yet. basically, i\'m running several blank scripts into a game engine, and each script has its own entry point...i\'d rather not add try: except blocks through all of my code, so i was wondering if it\'s at all possible to tell blank to quit (or perhaps assign a custom function to that ""callback"") on finding its first error, regardless of where or what it found? ..currently, the game engine will continue after finding and hitting an error, making it more difficult than necessary to diagnose issues since running into one error may make a subsequent script not work (as it relies on variables that the error-ing script set, for example). any ideas? ..i know that i could redirect the console to a file to allow for easier scrolling, but just capturing the first error and stopping the game prematurely would be really useful...okay, a couple of extra bits of 

## Model

### Vectorization

Стратегията е да подготвим dataset-овете предварително, преди да ги подадем на модела. Подготовката се състои от:
- превръщане на текстовете във вектори (=векторизация)
- настройване за бързо изпълнение

In [165]:
# Vocabulary size cap and the fixed length of every encoded post.
MAX_TOKENS = 10_000
OUTPUT_SEQUENCE_LENGTH = 250

# Layer that turns raw post text into a fixed-length sequence of token ids.
text_vectorizer = TextVectorization(
    # --- cleaning and tokenisation ---
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    encoding='utf-8',
    # --- integer encoding ---
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
)
text_vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x26a40505270>

In [166]:
# Learn the vocabulary from the training posts only.
# adapt() needs just the text, so the label of each (text, label) pair is dropped.
train_questions = raw_train_ds.map(lambda text, _label: text)
text_vectorizer.adapt(train_questions)

In [147]:
# Show the first 10 vocabulary entries.
# Note: the first two slots are the padding token ('') and the
# out-of-vocabulary token ('[UNK]'); actual words start after them.
vocabulary = text_vectorizer.get_vocabulary()
top_10 = vocabulary[:10]
top_10

['', '[UNK]', 'the', 'i', 'to', 'a', 'is', 'in', 'and', 'of']

In [159]:
# Show one raw question next to its integer-encoded form.
for question_batch in train_questions.take(1):
    single_question = question_batch[0]
    single_question_vector = text_vectorizer(single_question)
    print(single_question, single_question_vector, sep='\n\n')

tf.Tensor(b'"how to change data format in write function in blank? how to change the data format in f.write function? ..loaded_data = 349.00  or 3.00..i want to change data format in write function like %6f in print function. ..ex)  349.00 -> 349.000000 ,   3.00 -> 3.000000..f = open(""test.txt"", \'w\').f.write( str.(loaded_data).zfill(?) )  ...what is the code that performs above function?"\n', shape=(), dtype=string)

tf.Tensor(
[  24    4  175   80  290    7  174   38    7   16   24    4  175    2
   80  290    7 7865   38    1    1   45    1   46    4  175   80  290
    7  174   38   48    1    7   75   38  507    1    1 1544    1    1
    1    1   55    6    2   30   14 3374  250   38    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0  

In [167]:
# Create vectorized datasets

def vectorize_record(question, label):
    """Encode question text as token ids and pass the label through untouched.

    Intended as a `Dataset.map` function. The datasets in this notebook are
    batched, so `question` arrives as a batch of strings and `label` as the
    matching batch of class indices.

    Args:
        question: string tensor holding the post text.
        label: class-index tensor; returned unchanged.

    Returns:
        A `(vectorized_question, label)` tuple, where `vectorized_question`
        is the output of `text_vectorizer`.
    """
    # Add a trailing axis so each string is presented to the vectorizer as
    # its own one-element row.
    question_column = tf.expand_dims(question, axis=-1)
    return text_vectorizer(question_column), label
        

# Apply the same text -> token-id transformation to all three splits.
train_ds_vectorized, val_ds_vectorized, test_ds_vectorized = (
    raw_split.map(vectorize_record)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

(train_ds_vectorized, val_ds_vectorized, test_ds_vectorized)

(<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)

### Configure for performance

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorized split and prefetch upcoming batches, letting tf.data
# pick the prefetch buffer size. The names are rebound to the optimised
# pipelines.
train_ds_vectorized, val_ds_vectorized, test_ds_vectorized = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds_vectorized, val_ds_vectorized, test_ds_vectorized)
]

## Demo `Dataset`

In [41]:
# Toy dataset holding the integers 0..4, used to demonstrate Dataset iteration.
dataset = tf.data.Dataset.range(0, 5)

In [56]:
# First pass over the first two elements; the result is discarded because
# only the last expression of a cell is displayed.
list(dataset.take(2).as_numpy_iterator())
# Second, identical pass; this is the value shown as the cell output.
list(dataset.take(2).as_numpy_iterator())

[0, 1]

In [57]:
# First pass: walk over the first two elements and ignore them.
for _ in dataset.take(2):
    pass

# Second pass over the same dataset: print what it yields this time.
for element in dataset.take(2):
    print(element)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
