# Stack Overflow Multiclass Classification

In [1]:
import os
import tensorflow as tf

from pathlib import Path


# Multiclass Classification (Dataset: StackOverflow)

## Dataset Fetch

In [5]:
# Remote zip archive with the Stack Overflow dataset (the "pt-br" variant, per the file name).
dataset_url = "https://storage.googleapis.com/iobruno/datasets/stack_overflow_16k_pt-br.zip"

In [6]:
# Store downloads in a `datasets/` folder located one level above the current
# working directory; create it on first use and reuse it afterwards.
datasets_dir = Path.cwd().parent / "datasets"
datasets_dir.mkdir(parents=True, exist_ok=True)

In [10]:
# Download the archive and unpack it directly under `datasets_dir`.
# `cache_subdir=''` overrides the sub-folder Keras would otherwise append
# to `cache_dir`.
dataset = tf.keras.utils.get_file(
    origin=dataset_url,
    fname="stack_overflow_16k_pt-br.zip",
    cache_dir=datasets_dir,
    cache_subdir='',
    extract=True,
)

Downloading data from https://storage.googleapis.com/iobruno/datasets/stack_overflow_16k_pt-br.zip


In [11]:
# Root folder of the unpacked dataset (expected to hold the train/ and test/ splits).
so_dataset_dir = datasets_dir / "stack_overflow_16k"

In [12]:
# One folder per split inside the unpacked dataset.
so_train_dataset = so_dataset_dir / "train"
so_test_dataset = so_dataset_dir / "test"

In [14]:
# Inspect the entries of the training folder; os.listdir returns them in arbitrary order.
os.listdir(so_train_dataset)

['python', 'java', 'csharp', 'javascript']

## Dataset Load

In [15]:
"""
Loads the dataset in batches of 32 entries each,
meaning that each iteration over the dataset returns 32 Stack Overflow questions
"""
batch_size = 32  # number of examples yielded per dataset iteration
seed = 42  # fixed seed passed to each text_dataset_from_directory call

### Loading Training Dataset

In [32]:
# Training subset: 80% of the files under train/ (the remaining 20% is
# reserved by `validation_split`). The validation loader must use the same
# `seed` and `validation_split` so the two subsets do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    so_train_dataset,
    validation_split=0.2,
    subset="training",
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


### Analysis of the Training Dataset

In [47]:
# Class names in label order: an integer label `i` is looked up as class_names[i].
raw_train_ds.class_names

['csharp', 'java', 'javascript', 'python']

In [49]:
# Preview the first rows of a single training batch.
# Each dataset element is a (texts, labels) pair; every label is an index
# into `raw_train_ds.class_names`.
for t_corpus, t_label in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of once per printed row.
    corpora, labels = t_corpus.numpy(), t_label.numpy()
    # Never index past the end of a batch that holds fewer than 5 rows.
    for i in range(min(5, len(labels))):
        label = labels[i]
        class_name = raw_train_ds.class_names[label]
        corpus = corpora[i]
        print(f"Row: {i}")
        print(f"Class Label: {label} ({class_name})")
        # The samples are Stack Overflow questions, not reviews. The raw bytes
        # are printed as-is so escapes such as \r\n stay visible.
        print(f"Question: {corpus}")
        print()

Row: 0
Class Label: 2 (javascript)
Review: b'"Ol\xc3\xa1, gostaria de saber como pegar o caminho de uma imagem contida no value de um input do tipo  file. Estou a usar o seguinte c\xc3\xb3digo:\r\n\r\n\r\n$(document).on("change",\'#Upload\',function(){\r\n         var valor=$(this).attr(\'value\');\r\n         var ext= (valor.substring(valor.lastIndexOf("."))).toLowerCase();\r\n     if (ext==".jpg" || ext==".jpeg"){         \r\n         $("#Image").attr("src", valor);\r\n     }else{alert(\'Extensao "\'+ext+\'" nao permitida!\');}\r\n});\r\n\r\n\r\nEle retorna um fakepath, ou seja, um caminho falso. algu\xc3\xa9m poderia me ajudar?\r\n"'

Row: 1
Class Label: 0 (csharp)
Review: b'"Essa d\xc3\xbavida surgiu a partir desse post Enviar dados para uma ActionResult de um Controller diferente do atual respondido pelo Tiago S, atualmente tenho essa ActionResult que faz download de arquivo atrav\xc3\xa9s de um Post em uma View e preciso fazer atrav\xc3\xa9s do $.ajax():\r\n\r\n $(document).ready

### Loading Validation Dataset

In [51]:
# Validation subset: the 20% of train/ held out by `validation_split`.
# It uses the same `seed` and `validation_split` as the training loader so
# the two subsets are complementary.
raw_validation_ds = tf.keras.preprocessing.text_dataset_from_directory(
    so_train_dataset,
    validation_split=0.2,
    subset="validation",
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


### Loading Test Dataset

In [52]:
# Test split: every file under test/ is loaded (no validation split here).
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    so_test_dataset,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.


## Dataset Preprocessing