<a href="https://colab.research.google.com/github/lakshaygola/Stack-OverFlow-Tags/blob/main/StackOverFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stack Overflow Tag Prediction

In this notebook we build a model that predicts the tags of a question posted on the Stack Overflow website.

These tags help users classify a question and also let them search for questions by tag.



In [None]:
# Importing some necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import text
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

In [None]:
# Download the dataset CSV from its Google Cloud Storage (gs://) location with gsutil
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
/ [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [None]:
# Load the CSV (replacing its header row with our own column names) and
# shuffle the rows with a fixed seed so the later train/test split is repeatable
data = shuffle(
    pd.read_csv(
        'SO_ml_tags_avocado_188k_v2.csv',
        header=0,
        names=['tags', 'original tags', 'text'],
    ),
    random_state=20,
)
data.head()

Unnamed: 0,tags,original tags,text
70357,pandas,"python,mysql,pandas,sqlalchemy","sqlalchemy is too slow, did i do anything wron..."
152810,tensorflow,"python-2.7,tensorflow,pip,anaconda","getting ""no module named queue"" when installin..."
180803,"tensorflow,keras","python,tensorflow,keras,classification,cross-e...",why does sigmoid & crossentropy of avocado/avo...
186450,"pandas,matplotlib","python,pandas,matplotlib",plot avocado columns with secondary y -axis an...
52130,pandas,"python,pandas,parsing",“unknown string format”-error when parsing url...


In [None]:
# Drop the 'original tags' column and every row that contains a NaN value.
# dropna() returns a new DataFrame rather than modifying in place, so the result
# must be assigned back; otherwise the NaN rows silently stay in `data`.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns='original tags', errors='ignore').dropna()
data

Unnamed: 0,tags,text
70357,pandas,"sqlalchemy is too slow, did i do anything wron..."
152810,tensorflow,"getting ""no module named queue"" when installin..."
180803,"tensorflow,keras",why does sigmoid & crossentropy of avocado/avo...
186450,"pandas,matplotlib",plot avocado columns with secondary y -axis an...
52130,pandas,“unknown string format”-error when parsing url...
...,...,...
178569,"tensorflow,keras","avocado lstm, is the time_step equal to 1 like..."
31962,pandas,"new to avocado, need to create a df from 2 oth..."
23775,pandas,adding rows to a avocado dataframe from anothe...
37135,pandas,how to lag data by x specific days on a multi ...


In [None]:
# Tags are stored as comma-separated strings; split each one into a list of tag names
tag_splits = [tag.split(',') for tag in data['tags']]
# Preview only the first few entries: printing every row floods the notebook output
print(tag_splits[:10])

[['pandas'], ['tensorflow'], ['tensorflow', 'keras'], ['pandas', 'matplotlib'], ['pandas'], ['matplotlib'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow', 'keras'], ['keras'], ['tensorflow'], ['tensorflow'], ['pandas'], ['pandas'], ['pandas'], ['keras'], ['tensorflow', 'keras'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['pandas'], ['scikitlearn'], ['tensorflow', 'keras'], ['pandas'], ['matplotlib'], ['pandas'], ['keras'], ['pandas'], ['pandas'], ['pandas'], ['keras'], ['pandas'], ['pandas'], ['pandas'], ['matplotlib'], ['scikitlearn'], ['tensorflow'], ['pandas'], ['pandas'], ['scikitlearn'], ['pandas'], ['tensorflow'], ['scikitlearn'], ['scikitlearn'], ['pandas', 'scikitlearn'], ['matplotlib'], ['pandas'], ['pandas'], ['matplotlib'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['pandas'], ['ma

In [None]:
# Multi-hot encode the tag lists so they can be used as model targets:
# one column per distinct tag, 1 where the question carries that tag
tags_encoder = MultiLabelBinarizer()
tags_encoded = tags_encoder.fit_transform(tag_splits)

# Width of one encoded row == number of distinct tags
num_tags = len(tags_encoded[0])

# Show the tag count, the first encoded row, and the column order of the encoding
print(num_tags, tags_encoded[0], tags_encoder.classes_, sep='\n')

5
[0 0 1 0 0]
['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']


In [None]:
# 80/20 train/test split of the encoded tags.
# int() truncates the product so the boundary is a valid slice index.
train_size = int(0.8 * len(data))
train_tags, test_tags = tags_encoded[:train_size], tags_encoded[train_size:]

In [None]:
# Report how many rows land in each side of the split
test_size = len(data) - train_size
print('Train data size: ', train_size)
print('Test data size: ', test_size)

Train data size:  150559
Test data size:  37640


In [None]:
# Making class to perfrom preprocessing on the text data
# Creating the tokenizer then converting the text data into a matrix

class textpreprocessing(object):
    """Thin wrapper around a Keras ``Tokenizer`` that turns texts into a matrix.

    Usage is two-step: call ``tokenizer_formation`` once to fit the tokenizer,
    then call ``toknizer_matrix`` on any list of texts.

    Attributes:
        vocal_size_: value forwarded to the tokenizer as ``num_words``.
        tokenizer_: the fitted tokenizer, or ``None`` until
            ``tokenizer_formation`` has been called.
    """

    def __init__(self, vocal_size):
        """Store the vocabulary size; no tokenizer is built yet."""
        self.vocal_size_ = vocal_size
        self.tokenizer_ = None

    def tokenizer_formation(self, txt):
        """Create a tokenizer limited to ``vocal_size_`` words and fit it on ``txt``.

        Args:
            txt: the texts to fit on, passed straight to ``Tokenizer.fit_on_texts``.
        """
        tokenizer = text.Tokenizer(num_words=self.vocal_size_)
        tokenizer.fit_on_texts(txt)
        self.tokenizer_ = tokenizer

    def toknizer_matrix(self, txt_list):
        """Convert ``txt_list`` into a matrix using the fitted tokenizer.

        Args:
            txt_list: the texts to convert, passed to ``Tokenizer.texts_to_matrix``.

        Returns:
            Whatever ``texts_to_matrix`` returns for ``txt_list``.

        Raises:
            RuntimeError: if ``tokenizer_formation`` has not been called yet.
        """
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object has no attribute 'texts_to_matrix'".
        if self.tokenizer_ is None:
            raise RuntimeError(
                'Tokenizer is not fitted; call tokenizer_formation() before '
                'toknizer_matrix().'
            )
        return self.tokenizer_.texts_to_matrix(txt_list)

In [None]:
# Split the question text at the same boundary as the tags, then vectorise it
vocal_size = 400

all_txt = data['text'].values
train_txt, test_txt = all_txt[:train_size], all_txt[train_size:]

# Fit the tokenizer on the training text only, so the test text stays unseen
processor = textpreprocessing(vocal_size)
processor.tokenizer_formation(train_txt)

# Encode both splits with the tokenizer fitted above
train_body, test_body = (
    processor.toknizer_matrix(train_txt),
    processor.toknizer_matrix(test_txt),
)

In [None]:
# Inspect the first encoded training example: its length, then the vector itself
first_row = train_body[0]
print(len(first_row), first_row, sep='\n')

400
[0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [None]:
# Persist the fitted processor so the bag-of-words tokenizer
# does not have to be rebuilt later
import pickle

with open('./processor_state.pkl', 'wb') as state_file:
    pickle.dump(processor, state_file)

# Creating the model and training it on the train data



In [None]:
# Creating the model
def create_model(vocal_size, num_tags):
    """Build and compile a small feed-forward classifier.

    Args:
        vocal_size: length of each input vector (first layer's input shape).
        num_tags: number of output units, one per tag.

    Returns:
        A compiled Sequential model: Dense(50, relu) -> Dense(25, relu) ->
        Dense(num_tags, sigmoid), using binary cross-entropy loss, the Adam
        optimizer, and an accuracy metric.
    """
    network = tf.keras.models.Sequential([
        layers.Dense(50, input_shape=(vocal_size,), activation='relu'),
        layers.Dense(25, activation='relu'),
        # Sigmoid gives each tag its own independent score
        layers.Dense(num_tags, activation='sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Build the network and show its layer-by-layer summary
model = create_model(vocal_size, num_tags)
model.summary()

# Fit on the training split (20% of it held out for validation),
# then score the trained model on the test split
model.fit(
    train_body,
    train_tags,
    batch_size=128,
    epochs=3,
    validation_split=0.2,
)
model_result = model.evaluate(test_body, test_tags, batch_size=128)
print('Evaluation  Loss: {},  accuracy: {}'.format(*model_result[:2]))

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 50)                20050     
_________________________________________________________________
dense_10 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_11 (Dense)             (None, 5)                 130       
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluation  Loss: 0.10282833129167557,  accuracy: 0.8961743116378784


In [None]:
# Write the trained model to an HDF5 file
model.save(filepath='stackoverflow_model.h5')