In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)

In [2]:
# read data
df_questions = pd.read_hdf('auto_tagging_data_v2.h5')

In [3]:
df_questions.head()

Unnamed: 0,Id,Title,Body,Tags
0,6,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Mach...",[machine-learning]
1,21,Forecasting demographic census,<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\n...,[forecasting]
2,22,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n,[bayesian]
3,31,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests....","[hypothesis-testing, t-test, p-value, interpretation]"
4,36,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth ...",[correlation]


In [4]:
# combine title and body
df_questions['Text'] = df_questions["Title"] + " " + df_questions["Body"]

In [5]:
def clean_text(text):
    # remove html tags and url links
    text = re.sub(r'<.*?>', '', text)
    # remove everything alphabets
    text = re.sub("[^a-zA-Z]"," ",text)
    # remove whitespaces
    text = ' '.join(text.split())
    
    return text

In [6]:
df_questions['Text'] = df_questions['Text'].apply(lambda x: clean_text(x))
df_questions['Text'] = df_questions['Text'].str.lower()

In [7]:
df_questions[['Id', 'Text', 'Tags']].sample(5)

Unnamed: 0,Id,Text,Tags
21065,8916,are there any available implementations of density or conditional density tree learning greetings i am working on joint and conditional density trees for approximating clique potentials in bayesia...,"[bayesian, multivariate-analysis, cart]"
63629,145620,error bars on graph skewed data negative standard deviation i have only had basic stats i conducted some research where i have four different treatments and results for the data range from i don t...,[standard-deviation]
27069,99983,clustering data that has mixture of continuous and categorical variables i have data that represent some aspect of human behavior i want to cluster it unsupervised into behavioral profiles of some...,"[r, clustering, categorical-data]"
52762,52842,orthogonal sets of variables in multiple imputation separate imputation models first thanks to those who gave me useful input on this project in a previous thread on this site i ve got a new ish q...,[missing-data]
47853,94700,how to interpret the expression of ma as ar infty when ar is expressed as ma infty i can interpret it as let s say my wage this year depends only on last year s wage and a random shock my boss moo...,[time-series]


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_questions['Text'])

In [10]:
# check unique words count
len(tokenizer.word_index)

81956

In [11]:
# check unique words count
vocab_size = len(tokenizer.word_index) + 1
vocab_size

81957

In [12]:
sequences = tokenizer.texts_to_sequences(df_questions['Text'])

In [13]:
i = 0
print(df_questions['Text'][i], '\n'), print(sequences[i])

the two cultures statistics vs machine learning last year i read a blog post from brendan o connor entitled statistics vs machine learning fight that discussed some of the differences between the two fields andrew gelman responded favorably to this simon blomberg from r s fortunes package to paraphrase provocatively machine learning is statistics minus any checking of models and assumptions brian d ripley about the difference between machine learning and statistics user vienna may season s greetings andrew gelman in that case maybe we should get rid of checking of models and assumptions more often then maybe we d be able to solve some of the problems that the machine learning people can solve but we can t there was also the statistical modeling the two cultures paper by leo breiman in which argued that statisticians rely too heavily on data modeling and that machine learning techniques are making progress by instead relying on the predictive accuracy of models has the statistics field 

(None, None)

In [14]:
seq_lengths = []

for i in sequences:
    seq_lengths.append(len(i))

In [20]:
print("30th percentile: ", pd.Series(seq_lengths).quantile(0.3))
print("40th percentile: ", pd.Series(seq_lengths).quantile(0.4))
print("50th percentile: ", pd.Series(seq_lengths).quantile(0.5))
print("60th percentile: ", pd.Series(seq_lengths).quantile(0.6))
print("70th percentile: ", pd.Series(seq_lengths).quantile(0.7))
print("80th percentile: ", pd.Series(seq_lengths).quantile(0.8))
print("90th percentile: ", pd.Series(seq_lengths).quantile(0.9))
print("95th percentile: ", pd.Series(seq_lengths).quantile(0.95))
print("99th percentile: ", pd.Series(seq_lengths).quantile(0.99))

30th percentile:  97.0
40th percentile:  116.0
50th percentile:  137.0
60th percentile:  162.0
70th percentile:  193.0
80th percentile:  238.0
90th percentile:  320.0
95th percentile:  411.0
99th percentile:  678.0


In [15]:
max_length = 125

# padding
padded_seq = pad_sequences(sequences, maxlen=max_length)

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions['Tags'])
y = multilabel_binarizer.transform(df_questions['Tags'])

In [17]:
padded_seq.shape, y.shape

((76365, 125), (76365, 100))

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(padded_seq, y, 
                                                    test_size=0.2, 
                                                    random_state=9)

In [19]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [20]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [1]:
#def create_model(dropout_rate=0.0,filter_number=100,filter_size=3):
    #model=Sequential()
    #model.add(Embedding(vocab_size+1,128, input_length=max_length))
    #model.add(Dropout(dropout_rate))
    #model.add(Conv1D(filter_number,filter_size,padding = 'valid', activation = "relu", strides = 1))
    #model.add(GlobalMaxPool1D())
    #model.add(Dense(100,activation="sigmoid"))
    #model.compile(optimizer="adam",loss='binary_crossentropy', metrics=['accuracy'])
    
    #return model

In [6]:
#batch_size = [512]
#epochs = [10]
#dropout_rate = [0.1]
#filter_number=[100,300,500]
#filter_size=[5]

In [7]:
#param_grid = dict(batch_size=batch_size, epochs=epochs,dropout_rate=dropout_rate,filter_number=filter_number,filter_size=filter_size)

In [8]:
#model = KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
#grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
#grid_result = grid.fit(x_train, y_train)

In [None]:
#print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
    #print("%f (%f) with: %r" % (mean, stdev, param))

In [22]:
model = Sequential()
model.add(Embedding(vocab_size +1, 128, input_length = max_length))
model.add(Dropout(0.15))
model.add(Conv1D(300, 5, padding = 'valid', activation = "relu", strides = 1))
model.add(GlobalMaxPool1D())
model.add(Dense(100, activation = "sigmoid"))
#model.add(Activation('sigmoid'))

In [23]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 125, 128)          10490624  
_________________________________________________________________
dropout_2 (Dropout)          (None, 125, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 123, 300)          115500    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               30100     
Total params: 10,636,224
Trainable params: 10,636,224
Non-trainable params: 0
_________________________________________________________________


In [24]:
callbacks = [
             EarlyStopping(patience=3),
             ModelCheckpoint(filepath='model-conv1d_v1.h5', save_best_only=True)
            ]

In [39]:
# train model
history = model.fit(x_train, y_train,
                    epochs=15,
                    batch_size=128,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 54982 samples, validate on 6110 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15


In [None]:
# use the code below to load the saved model
# model = load_model('model-conv1d_v1.h5')

In [40]:
preds = model.predict(x_val)

In [34]:
preds.shape

(15273, 100)

In [41]:
# set threshold to 0.45
preds_int = (preds >= 0.45).astype(int)

In [42]:
from sklearn.metrics import f1_score

# calculate F1 score
f1_score(y_val, preds_int, average="micro")

0.5046885934219735

In [95]:
# model = load_model('model-conv1d_v1.h5')

In [43]:
def infer_tags(q):
    q = clean_text(q)
    q = q.lower()
    q_seq = tokenizer.texts_to_sequences([q])
    q_seq_padded = pad_sequences(q_seq, maxlen=125)
    q_pred = model.predict(q_seq_padded)
    q_pred = (q_pred >= 0.3).astype(int)
    
    return multilabel_binarizer.inverse_transform(q_pred)

In [44]:
# give new question
new_q = "Regression line in ggplot doesn't match computed regression Im using R and created a chart using ggplot2. I then create a regression so I can make some predicitions I pass my data frame of to the predict function predict(regression, Measures) I'd expect the predictions to be the same as if I used the regression line on the chart, but they aren't the same. Why would this be the case? Is there a setting in ggplot or is my expectation incorrect?"

# get tags
infer_tags(new_q)

[('prediction', 'r', 'regression')]

In [None]:
# grid search hyperparatmeters tuning 

In [None]:
hyperparameters : 

In [45]:
# give new question
new_q = "Regression line in ggplot doesn't match computed regression Im using R and created a chart using ggplot2. I then create a regression so I can make some predicitions I pass my data frame of to the predict function predict(regression, Measures) I'd expect the predictions to be the same as if I used the regression line on the chart, but they aren't the same. Why would this be the case? Is there a setting in ggplot or is my expectation incorrect?"

# get tags
infer_tags(new_q)

[('prediction', 'r', 'regression')]