In [1]:
import requests
import random
import re
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords
import joblib
import nltk
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import joblib

# Tag vocabulary the label binarizer is fitted on.
TAG_CLASSES = [
    'anova', 'bayesian', 'classification', 'clustering', 'correlation',
    'distributions', 'hypothesis-testing', 'logistic',
    'machine-learning', 'mathematical-statistics', 'mixed-model',
    'multiple-regression', 'neural-networks', 'normal-distribution',
    'probability', 'r', 'regression', 'self-study',
    'statistical-significance', 'time-series',
]

# Only the fitted state of the binarizer is needed (convert_to_tags uses it
# for inverse_transform), so `fit` is enough; the encoded matrix is not used.
mlb = MultiLabelBinarizer()
mlb.fit([(tag,) for tag in TAG_CLASSES])

# Word-character tokenizer: punctuation is dropped during tokenization.
tokenizer = RegexpTokenizer(r"\w+")

# Fetch the NLTK resources (each call is a no-op message when already present).
for resource in ('stopwords', 'wordnet', 'punkt', 'words'):
    nltk.download(resource)

# Union of the NLTK, scikit-learn and spaCy English stopword lists,
# de-duplicated while keeping first-seen order.
nltk_stop_words = nltk.corpus.stopwords.words('english')
combined_stop_words = list(nltk_stop_words)
combined_stop_words += list(sklearn_stop_words)
combined_stop_words += list(spacy_stopwords)
stopwords = list(dict.fromkeys(combined_stop_words))

lemmatizer = WordNetLemmatizer()

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load "svm.joblib" from a trusted source.
pipeline = joblib.load("svm.joblib")


# Lazily matches a single tag. DOTALL lets '.' cross newlines so a tag whose
# attributes wrap onto the next line is still removed. Compiled once here
# instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def strip_html_tags(Text):
    """Remove HTML tags from ``Text`` and return the remaining text.

    Only the tags themselves are removed; the text between an opening and a
    closing tag is kept, and HTML entities (``&lt;`` etc.) are left as-is.

    Parameters
    ----------
    Text : str or None
        HTML fragment. ``None`` (e.g. a missing field in an API response) is
        treated as empty text.

    Returns
    -------
    str
        ``Text`` with every ``<...>`` span removed.
    """
    if Text is None:
        return ''
    return _HTML_TAG_RE.sub('', Text)

def _clean_document(text, stopword_set):
    """Turn one raw (possibly HTML) document into a space-joined string of lemmas."""
    text = '' if text is None else str(text)

    # Drop <code> blocks together with their contents, then any remaining tags.
    text = re.sub('<code>.*?</code>', '', text, flags=re.DOTALL)
    text = re.sub(r'<[^<]+?>', '', text)

    cleaned = []
    for token in tokenizer.tokenize(text):
        # Discard tokens containing non-ASCII characters, digits or underscores.
        if not all(ord(c) < 128 for c in token):
            continue
        if re.search(r'\d', token) or '_' in token:
            continue

        # Lowercase BEFORE the stopword lookup: the stopword lists are
        # lowercase, so checking the original casing lets "The", "This", ...
        # slip through.
        word = token.lower()
        if word in stopword_set:
            continue

        # Lemmatize under each part of speech in turn (verb, noun, adjective,
        # adverb), feeding each result into the next.
        for pos in (wordnet.VERB, wordnet.NOUN, wordnet.ADJ, wordnet.ADV):
            word = lemmatizer.lemmatize(word, pos=pos)
        cleaned.append(word)

    return ' '.join(cleaned)


def preprocessor(input_sentence):
    """Clean raw question text into the space-joined lemma strings fed to the model.

    Per document: remove ``<code>`` blocks and HTML tags, tokenize on word
    characters, drop tokens with non-ASCII characters, digits or underscores,
    lowercase, drop stopwords, and lemmatize.

    Parameters
    ----------
    input_sentence : str or iterable of str
        A single document, or several documents. Every element of an iterable
        is processed (previously only the first element was kept).

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.

    Notes
    -----
    NOTE(review): stopwords are now matched case-insensitively. If the saved
    model was fitted on text produced by the previous ordering (stopword
    filter before lowercasing), confirm predictions are unaffected or refit.
    """
    if isinstance(input_sentence, str):
        documents = [input_sentence]
    else:
        documents = list(input_sentence)

    # Set membership instead of scanning the module-level list for every token.
    stopword_set = set(stopwords)
    return [_clean_document(document, stopword_set) for document in documents]


def convert_to_tags(predicted_binaries):
    """Translate binary indicator predictions back into tag names.

    Delegates to the module-level fitted ``mlb`` and returns the result of its
    ``inverse_transform`` unchanged.
    """
    tag_tuples = mlb.inverse_transform(predicted_binaries)
    return tag_tuples


def requestResults(kw):
    """Predict tags for ``kw``.

    ``kw`` is run through ``preprocessor``, the loaded ``pipeline`` predicts
    on the cleaned text, and the prediction is mapped back to tag names with
    ``convert_to_tags``. Callers should pass un-preprocessed text, because the
    preprocessing happens here.
    """
    return convert_to_tags(pipeline.predict(preprocessor(kw)))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prabha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Prabha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prabha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Prabha\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
#pip install dill

In [15]:
# Query the Stack Exchange API for unanswered questions together with their tags
stack_url = "https://api.stackexchange.com/2.2/questions/unanswered?pagesize=10&order=desc&sort=votes&site=stats&filter=!-MOiNm40F1U6n0W(EFNR1)GdsWAepKpT_"

# Get the data in JSON format. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports HTTP errors (e.g.
# throttling) here rather than as a confusing failure further down.
response = requests.get(stack_url, timeout=10)
response.raise_for_status()
response_data = response.json()

# Fail with a clear message when the API returned no questions.
items = response_data.get('items') or []
if not items:
    raise RuntimeError("Stack Exchange API returned no questions")

# Pick one question at random; a missing title/body is treated as empty text.
data = random.choice(items)
# NOTE(review): strip_html_tags removes the <code> tags but keeps their
# contents, so preprocessor's '<code>.*?</code>' removal has nothing to match
# on this text. Confirm that is intended.
questions = (data.get('title') or '') + " " + strip_html_tags(data.get('body') or '')

# Print the question with the associated tags
print("Sample Question: ", questions)
print("Tags assigned: ", data.get('tags'))


Sample Question:  Training a neural network for regression always predicts the mean I am training a simple convolutional neural network for regression, where the task is to predict the (x,y) location of a box in an image, e.g.:











The output of the network has two nodes, one for x, and one for y. The rest of the network is a standard convolutional neural network. The loss is a standard mean squared error between the predicted position of the box, and the ground truth position. I am training on 10000 of these images, and validating on 2000.

The problem I am having, is that even after significant training, the loss does not really decrease. After observing the output of the network, I notice that the network tends to output values close to zero, for both output nodes. As such, the prediction of the box's location is always the centre of the image. There is some deviation in the predictions, but always around zero. Below shows the loss:



I have run this for many more epochs tha

In [16]:
# Check for model prediction

# Cleaned text, kept only so it can be inspected.
Processed_question = preprocessor(questions)

# requestResults() calls preprocessor() on its argument itself, so it is given
# the raw question; passing Processed_question would preprocess the text twice.
Predicted_tags = requestResults(questions)


In [17]:
# Show the tags attached to the question next to the model's prediction

for label, tags in (("Actual Tags: ", data.get('tags')),
                    ("Predicted Tags: ", Predicted_tags)):
    print(label, tags)

Actual Tags:  ['machine-learning', 'neural-networks', 'optimization', 'deep-learning', 'loss-functions']
Predicted Tags:  [('machine-learning', 'neural-networks')]
