<a href="https://colab.research.google.com/github/lakshaygola/Stack-OverFlow-Tags/blob/main/StackOverFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stack Overflow Tag Prediction

In this notebook we build a model that predicts the tags of a question posted on the Stack Overflow website.

These tags help users classify a question and also let them search for questions by tag.



In [17]:
# Importing some necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing import text
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
import os

In [2]:
# Download the dataset CSV from a Google Cloud Storage bucket into the
# current working directory (gsutil cp <gs:// source> <local destination>)
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
\ [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [3]:
# Load the CSV under our own column names (header=0 replaces the file's own
# header row), then shuffle the rows with a fixed seed so the order is repeatable
data = shuffle(
    pd.read_csv(
        'SO_ml_tags_avocado_188k_v2.csv',
        header=0,
        names=['tags', 'original tags', 'text'],
    ),
    random_state=20,
)
data.head()

Unnamed: 0,tags,original tags,text
70357,pandas,"python,mysql,pandas,sqlalchemy","sqlalchemy is too slow, did i do anything wron..."
152810,tensorflow,"python-2.7,tensorflow,pip,anaconda","getting ""no module named queue"" when installin..."
180803,"tensorflow,keras","python,tensorflow,keras,classification,cross-e...",why does sigmoid & crossentropy of avocado/avo...
186450,"pandas,matplotlib","python,pandas,matplotlib",plot avocado columns with secondary y -axis an...
52130,pandas,"python,pandas,parsing",“unknown string format”-error when parsing url...


In [4]:
# Drop the 'original tags' column and every row that contains a NaN value.
# DataFrame.dropna() returns a new frame instead of modifying in place, so the
# result has to be assigned back -- otherwise the NaN rows silently stay in
# `data` (and a NaN tag would later fail on `.split(',')`).
data = data.drop('original tags', axis=1)
data = data.dropna()
data

Unnamed: 0,tags,text
70357,pandas,"sqlalchemy is too slow, did i do anything wron..."
152810,tensorflow,"getting ""no module named queue"" when installin..."
180803,"tensorflow,keras",why does sigmoid & crossentropy of avocado/avo...
186450,"pandas,matplotlib",plot avocado columns with secondary y -axis an...
52130,pandas,“unknown string format”-error when parsing url...
...,...,...
178569,"tensorflow,keras","avocado lstm, is the time_step equal to 1 like..."
31962,pandas,"new to avocado, need to create a df from 2 oth..."
23775,pandas,adding rows to a avocado dataframe from anothe...
37135,pandas,how to lag data by x specific days on a multi ...


In [5]:
# A question can carry several comma-separated tags; turn each tag string
# into a list of individual tags
tag_splits = list(map(lambda joined: joined.split(','), data['tags']))
print(tag_splits[:15])

[['pandas'], ['tensorflow'], ['tensorflow', 'keras'], ['pandas', 'matplotlib'], ['pandas'], ['matplotlib'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['pandas']]


In [6]:
# Encode the tag lists as multi-hot rows (one column per distinct tag) so they
# can be used as model targets
tags_encoder = MultiLabelBinarizer()
tags_encoded = tags_encoder.fit_transform(tag_splits)
num_tags = len(tags_encoded[0])

# Show the number of tags, the first encoded row and the tag-to-column order
print(num_tags, tags_encoded[0], tags_encoder.classes_, sep='\n')

5
[0 0 1 0 0]
['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']


In [7]:
# 80/20 train/test split of the encoded tags; int() truncates the product so
# the boundary is a whole row index
train_size = int(len(data) * 0.8)
train_tags, test_tags = tags_encoded[:train_size], tags_encoded[train_size:]

In [8]:
# Report how many rows fall on each side of the split boundary; the test size
# is whatever remains after the first `train_size` rows
print('Train data size: ', train_size)
print('Test data size: ', len(data) - train_size)

Train data size:  150559
Test data size:  37640


In [33]:
# Text preprocessing helper: builds a Keras tokenizer from a corpus and uses
# it to convert lists of strings into a matrix

class textpreprocessing(object):
  """Holds a vocabulary size and, once fitted, a Keras ``text.Tokenizer``.

  Attributes:
    vocal_size_: value passed as ``num_words`` when the tokenizer is created.
    tokenizer_: the fitted tokenizer, or ``None`` until
      ``tokenizer_formation`` has been called.
  """

  def __init__(self, vocal_size):
    self.vocal_size_ = vocal_size
    self.tokenizer_ = None

  def tokenizer_formation(self, txt):
    """Create a tokenizer, fit it on ``txt`` and store it on the instance."""
    fitted = text.Tokenizer(num_words=self.vocal_size_)
    fitted.fit_on_texts(txt)
    # Stored only after fitting succeeded
    self.tokenizer_ = fitted

  def tokenizer_matrix(self, txt_list):
    """Return ``texts_to_matrix(txt_list)`` from the stored tokenizer.

    ``tokenizer_formation`` must have been called first; until then
    ``tokenizer_`` is ``None``.
    """
    return self.tokenizer_.texts_to_matrix(txt_list)

In [38]:
# Split the text column at the same boundary used for the tags, fit the
# tokenizer on the training text only, then vectorise both splits

vocal_size= 400

train_txt = data['text'].values[:train_size]
test_txt = data['text'].values[train_size:]

processor =  textpreprocessing(vocal_size)
processor.tokenizer_formation(train_txt)

train_body = processor.tokenizer_matrix(train_txt)
# Vectorise the raw test text. The previous code passed `test_body` itself,
# which is undefined on a fresh kernel (NameError) and stale on a re-run.
test_body = processor.tokenizer_matrix(test_txt)

In [39]:
# Inspect the first vectorised training row: its width, then its values
print(len(train_body[0]), train_body[0], sep='\n')

400
[0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [37]:
# Persist the fitted processor (which holds the tokenizer) to disk so the
# vocabulary does not have to be rebuilt from the training text.
# NOTE(review): pickle stores the class by reference, so `textpreprocessing`
# presumably has to be defined/importable wherever this file is loaded -- confirm.
import pickle

with open ('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)

# Creating the model and training it on the training data



In [13]:
# Model factory
def create_model(vocal_size, num_tags):
    """Build and compile a small fully connected multi-label classifier.

    Args:
        vocal_size: width of each input row (used as the input shape).
        num_tags: number of output units, one sigmoid per tag.

    Returns:
        A compiled ``Sequential`` model: Dense(50, relu) -> Dense(25, relu)
        -> Dense(num_tags, sigmoid), trained with binary cross-entropy and
        the Adam optimizer, reporting accuracy.
    """
    network = tf.keras.models.Sequential([
        layers.Dense(50, input_shape=(vocal_size,), activation='relu'),
        layers.Dense(25, activation='relu'),
        # Independent sigmoid per tag so several tags can be active at once
        layers.Dense(num_tags, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [16]:
# Build the model and show its layer summary
model = create_model(vocal_size, num_tags)
model.summary()

# Fit for 3 epochs, holding out 10% of the training rows for validation,
# then score on the test split (evaluate returns [loss, accuracy])
model.fit(train_body, train_tags, batch_size=128, epochs=3, validation_split=0.1)
model_result = model.evaluate(test_body, test_tags, batch_size=128)
print('Evaluation  Loss: {},  accuracy: {}'.format(*model_result[:2]))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 50)                20050     
_________________________________________________________________
dense_4 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 130       
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluation  Loss: 0.10187453776597977,  accuracy: 0.896785318851471


In [15]:
# Save the trained model to an HDF5 file in the working directory
model.save('stackoverflow_model.h5')

# Taking predictions from the model

Here we have two questions that are not present in our dataset; we will take the model's predictions on them.
 

In [19]:
# Two sample questions (full question text as a single string each) on which
# we will take predictions: the first is about Keras/TensorFlow, the second
# about pandas/matplotlib plotting

test_requests = [
  "How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error: You must feed a value for placeholder tensor 'input_1' with dtype string and shape [?, 1] def text_preprocess(x): strings = tf.keras.backend.eval(x) vectors = [] for string in strings: vector = string_to_one_hot(string.decode('utf-8')) vectors.append(vector) vectorTensor = tf.constant(np.array(vectors),dtype=tf.float32) return vectorTensor input_text = Input(shape=(1,), dtype=tf.string) embedding = Lambda(text_preprocess)(input_text) dense = Dense(256, activation='relu')(embedding) outputs = Dense(2, activation='softmax')(dense) model = Model(inputs=[input_text], outputs=outputs) model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) model.summary() model.save('test.h5') If I pass a string array into the input layer statically, I can compile the model, but I get the same error if I want to convert the model to tflite. #I replaced this line: input_text = Input(shape=(1,), dtype=tf.string) #by this lines: test = tf.constant(['Hello', 'World']) input_text = Input(shape=(1,), dtype=tf.string, tensor=test) #but calling this ... converter = TFLiteConverter.from_keras_model_file('string_test.h5') tfmodel = converter.convert() #... still leads to this error: InvalidArgumentError: You must feed a value for placeholder tensor 'input_3' with dtype string and shape [2] [[{{node input_3}}]] ",
  "Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]}) print (df) name  age 0    a   10 1    b   20 2    c    5 3    d   23 4    e   58 5    f    4 6    g    6 I use Pandas and matplotlib to read and plot it: import pandas as pd import numpy as np import matplotlib.pyplot as plt import os excel_file = 'test.xlsx' df = pd.read_excel(excel_file, sheet_name=0) df.plot(kind='bar') plt.show() the result shows: enter image description here it use index number as item name, how can I change it to the name, which stored in column name?"
]

In [47]:
# Prediction helper that pairs a trained model with its text processor

class CustomOutput(object):
  """Runs raw text through a processor and then a model.

  Attributes:
    model_: object exposing ``predict(matrix)``.
    processor_: object exposing ``tokenizer_matrix(texts)``.
  """

  def __init__(self, model, processor):
    self.model_ = model
    self.processor_ = processor

  def predicition(self, data):
    """Vectorise ``data`` with the processor, predict, and return a list.

    The model output is converted with ``.tolist()``, so the result is plain
    nested Python lists rather than an array.
    """
    features = self.processor_.tokenizer_matrix(data)
    return self.model_.predict(features).tolist()

In [53]:
# Score the sample questions with the trained model

classifier = CustomOutput(model, processor)
outputs = classifier.predicition(test_requests)
print(outputs)

# For every question, list each tag whose predicted score exceeds 0.7;
# score positions line up with tags_encoder.classes_
for scores in outputs:
  print('Prediction Tags: ')
  for idx, score in enumerate(scores):
    if score > 0.7:
      print(tags_encoder.classes_[idx])
  print('\n')

[[0.9738432168960571, 0.0001360476016998291, 0.0003139376640319824, 0.00040218234062194824, 0.58158278465271], [1.9019031242351048e-05, 0.5976572036743164, 0.8521859645843506, 0.0003109574317932129, 3.447831841185689e-05]]
Prediction Tags: 
keras


Prediction Tags: 
pandas




# Visualising Model

In this section we will visualise the model and how it is able to predict the tags. To do this we have to install two more libraries:

SHAP and COLOR

In [54]:
# Install the two extra packages used for the visualisation section
!pip install shap
!pip install color

Collecting shap
[?25l  Downloading https://files.pythonhosted.org/packages/b9/f4/c5b95cddae15be80f8e58b25edceca105aa83c0b8c86a1edad24a6af80d3/shap-0.39.0.tar.gz (356kB)
[K     |█                               | 10kB 12.0MB/s eta 0:00:01[K     |█▉                              | 20kB 8.8MB/s eta 0:00:01[K     |██▊                             | 30kB 7.1MB/s eta 0:00:01[K     |███▊                            | 40kB 6.1MB/s eta 0:00:01[K     |████▋                           | 51kB 4.8MB/s eta 0:00:01[K     |█████▌                          | 61kB 5.1MB/s eta 0:00:01[K     |██████▍                         | 71kB 5.2MB/s eta 0:00:01[K     |███████▍                        | 81kB 5.1MB/s eta 0:00:01[K     |████████▎                       | 92kB 5.2MB/s eta 0:00:01[K     |█████████▏                      | 102kB 4.6MB/s eta 0:00:01[K     |██████████▏                     | 112kB 4.6MB/s eta 0:00:01[K     |███████████                     | 122kB 4.6MB/s eta 0:00:01[K     |

In [55]:
import shap
import color