In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
import spacy
import re
import string
import gc

In [2]:
robotics_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/robotics.csv')
diy_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/diy.csv')
biology_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/biology.csv')
crypto_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/crypto.csv')
travel_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/travel.csv')
cooking_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/cooking.csv')

In [3]:
data = pd.concat([robotics_data,diy_data,biology_data,travel_data,cooking_data,crypto_data])

In [4]:
data.shape

(87000, 4)

In [5]:
data.head()

Unnamed: 0,id,title,content,tags
0,1,What is the right approach to write the spin c...,<p>Imagine programming a 3 wheel soccer robot....,soccer control
1,2,How can I modify a low cost hobby servo to run...,"<p>I've got some hobby servos (<a href=""http:/...",control rcservo
2,3,What useful gaits exist for a six legged robot...,"<p><a href=""http://www.oricomtech.com/projects...",gait walk
3,4,Good Microcontrollers/SOCs for a Robotics Project,<p>I am looking for a starting point for my pr...,microcontroller arduino raspberry-pi
4,5,Nearest-neighbor data structure for non-Euclid...,<p>I'm trying to implement a nearest-neighbor ...,motion-planning rrt


In [6]:
def remove_html_tags(html):
    soup = BeautifulSoup(html,'lxml')
    text = soup.get_text()
    return text

data['content'] = data['content'].apply(lambda x : remove_html_tags(x))

In [7]:
stopwords = nltk.corpus.stopwords.words('english')
#print(stopwords)
def remove_urls(text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)
    return text

def remove_punctuation(text):
    chars = [char for char in text if char not in string.punctuation]
    text = ''.join([char for char in chars])
    return text

def remove_stopwords(text):
    text_lower = [x.lower() for x in text]
    text = ''.join([x for x in text_lower])
    tokens = nltk.word_tokenize(text)
    processed_text = [word for word in tokens if word not in stopwords]
    processed_text = ' '.join([word for word in processed_text])
    return processed_text

def lemmatize_text(text):
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    clean_text = ' '.join([word for word in lemmas])
    return clean_text

In [8]:
data['title'] = data['title'].apply(lambda x: remove_urls(x))
data['title'] = data['title'].apply(lambda x: remove_punctuation(x))
data['title'] = data['title'].apply(lambda x: remove_stopwords(x))
data['title'] = data['title'].apply(lambda x: lemmatize_text(x))

data['content'] = data['content'].apply(lambda x: remove_urls(x))
data['content'] = data['content'].apply(lambda x: remove_punctuation(x))
data['content'] = data['content'].apply(lambda x: remove_stopwords(x))
data['content'] = data['content'].apply(lambda x: lemmatize_text(x))

data['tags'] = data['tags'].apply(lambda x:x.split())

In [9]:
data.head()

Unnamed: 0,id,title,content,tags
0,1,right approach write spin controller soccer robot,imagine programming 3 wheel soccer robot type ...,"[soccer, control]"
1,2,modify low cost hobby servo run freely,ive got hobby servo power hd 1501mgs id like a...,"[control, rcservo]"
2,3,useful gait exist six legged robot pro con,tripod wave ripple improved relative pro con a...,"[gait, walk]"
3,4,good microcontrollerssocs robotics project,looking starting point project preferably usin...,"[microcontroller, arduino, raspberry-pi]"
4,5,nearestneighbor data structure noneuclidean co...,im trying implement nearestneighbor structure ...,"[motion-planning, rrt]"


In [10]:
import gensim
embeddings = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True,limit=600000)



In [11]:
embeddings.most_similar('physics')

  if np.issubdtype(vec.dtype, np.int):


[('quantum_mechanics', 0.697684645652771),
 ('Physics', 0.690711259841919),
 ('quantum_physics', 0.6846468448638916),
 ('astrophysics', 0.6702420711517334),
 ('particle_physics', 0.659159779548645),
 ('thermodynamics', 0.6546549797058105),
 ('theoretical_physics', 0.6381757259368896),
 ('physicist', 0.6186479926109314),
 ('fluid_dynamics', 0.6175768375396729),
 ('quantum_theory', 0.6106815338134766)]

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 50000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(data['content']))
sequences = tokenizer.texts_to_sequences(list(data['content']))
data_set = pad_sequences(sequences,maxlen=max_len)

one_hot = preprocessing.MultiLabelBinarizer()
one_hot_labels = one_hot.fit_transform(data['tags'])
print(one_hot_labels.shape)                    

Using TensorFlow backend.


(87000, 4268)


In [13]:
train_X = data_set[:int(-0.3*data_set.shape[0])]
val_X = data_set[int(-0.3*data_set.shape[0]):]
train_y = one_hot_labels[:int(-0.3*data_set.shape[0])]
val_y = one_hot_labels[int(-0.3*data_set.shape[0]):]

In [14]:
embeddings_index = {}
embedding_size = 300
for word in embeddings.wv.vocab:
    embeddings_index[word] = embeddings.word_vec(word)

all_embeddings = np.stack(list(embeddings_index.values()))
embed_mean,embed_std = all_embeddings.mean(),all_embeddings.std()
num_words = len(tokenizer.word_index)

embedding_matrix = np.random.normal(embed_mean,embed_std,(num_words,embedding_size))

for word,index in tokenizer.word_index.items():
    index -= 1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

del(embeddings_index)
gc.collect()

  This is separate from the ipykernel package so we can avoid doing imports until


60

In [44]:
from keras.layers import Dense,Embedding,Conv1D,Dropout,LSTM,MaxPool1D
from keras.models import Sequential

embedding_layer = Embedding(len(tokenizer.word_index),300,weights = [embedding_matrix],trainable=False)
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=512,kernel_size=7,activation='relu'))
model.add(MaxPool1D(pool_size=5))
model.add(Dropout(0.3))
model.add(Conv1D(filters=512,kernel_size=7,activation='relu'))
model.add(MaxPool1D(pool_size=5))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dense(300,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(one_hot.classes_), activation='relu'))

model.compile(loss='mean_squared_error',optimizer='adam',metrics=['accuracy'])
model.fit(train_X,train_y,validation_data=(val_X,val_y),epochs=3,batch_size=256,verbose=1)


Train on 60900 samples, validate on 26100 samples
Epoch 1/3


Epoch 2/3




Epoch 3/3




NameError: name 'os' is not defined

In [48]:
model.save('tags_model.h5')

In [15]:
from keras.models import load_model
model = load_model('tags_model.h5')

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         48883500  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 512)         1075712   
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, None, 512)         0         
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 512)         1835520   
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, None, 512)         0         
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 512)         0         
__________

In [25]:
test_data = pd.read_csv('transfer-learning-on-stack-exchange-tags/test.csv')
test_data['content'] = test_data['content'].apply(lambda x:remove_html_tags(x))
test_data['content'] = test_data['content'].apply(lambda x:remove_punctuation(x))
test_data['content'] = test_data['content'].apply(lambda x:remove_stopwords(x))
test_data['content'] = test_data['content'].apply(lambda x:lemmatize_text(x))

In [27]:
test_X = pad_sequences(tokenizer.texts_to_sequences(list(test_data['content'])),maxlen=max_len)

In [28]:
predictions = model.predict(test_X,batch_size=256,verbose=1)



In [36]:
from joblib import Parallel,delayed
from sklearn.metrics import matthews_corrcoef
threshold = np.arange(0,0.02,0.00025)
out = model.predict(val_X)
out = np.array(out)
def bestThreshold(y_prob, threshold, i):
    acc = []
    for j in threshold:
        y_pred = np.greater_equal(y_prob, j)*1
        acc.append(matthews_corrcoef(val_y[:,i], y_pred))
    acc = np.array(acc)
    index = np.where(acc==acc.max())
    return threshold[index[0][0]]
best_threshold = Parallel(n_jobs=4, verbose=1)(delayed(bestThreshold)(out[:,i], threshold, i) for i in range(out.shape[1]))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   28.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  8.0min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 12.6min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 18.7min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 26.7min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 35.3min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 44.7min
[Parallel(n_jobs=4)]: Done 4268 out of 4268 | elapsed: 47.1min finished
  .format(folder_path, RM_SUBDIRS_N_RETRY))


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\kushal\\AppData\\Local\\Temp\\joblib_memmapping_folder_16796_4664122654\\16796-3159832263704-7040d39116f94e3c8e7ba92a31a536eb.pkl'

In [37]:
best_threshold

NameError: name 'best_threshold' is not defined