In [108]:
import numpy as np
import pandas as pd
import os

In [2]:
data = pd.read_csv("cleaned_data.csv")

In [3]:
data.drop(["Unnamed: 0"],axis='columns',inplace=True)

In [45]:
y_data = pd.get_dummies(data["Tags_Filtered"])

In [46]:
y_data.head()

Unnamed: 0,android,c#,c++,html,ios,java,javascript,jquery,php,python
0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [6]:
data.head()

Unnamed: 0,Body,Tags_Filtered
0,little game write c use database back-end trad...,c#
1,work collection class use video playback recor...,c++
2,would like version property application increm...,c#
3,simplest way connect query database set record c#,c#
4,need grab base64-encoded representation viewst...,c#


In [7]:
from gensim.models import word2vec

In [29]:
sent_list = []
for sent in data["Body"].values:
    filtered_words = []

    for word in sent.split():
        if word.isalpha():
            filtered_words.append(word)
    sent_list.append(filtered_words)

In [33]:
w2v_model = word2vec.Word2Vec(sent_list,min_count=10,
                             size=300,workers=12)

In [41]:
w2v_model.save("stackoverflow_w2v_embeddings.bin")

In [35]:
len(list(w2v_model.wv.vocab))

190596

In [36]:
w2v_model.wv.most_similar('django')

[('flask', 0.6306920051574707),
 ('viewspy', 0.5938119888305664),
 ('modelspy', 0.5867741703987122),
 ('settingspy', 0.5735112428665161),
 ('pyramid', 0.5551276206970215),
 ('urlspy', 0.5454357862472534),
 ('gae', 0.5434261560440063),
 ('cherrypy', 0.5369576811790466),
 ('adminpy', 0.5316402316093445),
 ('sqlalchemy', 0.5277770757675171)]

In [57]:
w2v_model.wv.most_similar('xcode')

[('simulator', 0.6682143211364746),
 ('cocoapods', 0.6403347253799438),
 ('ios', 0.6082570552825928),
 ('testflight', 0.5963464975357056),
 ('ipa', 0.5945266485214233),
 ('monotouch', 0.5908800363540649),
 ('swift', 0.588433027267456),
 ('storyboards', 0.5620737075805664),
 ('watchos', 0.5604506134986877),
 ('monodevelop', 0.5538150668144226)]

In [42]:
from sklearn.model_selection import train_test_split

In [111]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks.callbacks import ModelCheckpoint

In [62]:
vocb_size = 50000
token = Tokenizer(num_words=vocb_size, filters= '!"$%&()*,-./:;<=>?@[\\]^_`{|}~\t\n',oov_token='<OOV>')

In [63]:
token.fit_on_texts(data['Body'])

In [64]:
sequences = token.texts_to_sequences(data['Body'])

In [81]:
len(sequences)
embedd_dim = 300

In [90]:
seq_len = 600
sequences_padded = pad_sequences(sequences,maxlen=seq_len,padding='post',truncating='post')

In [91]:
len(sequences[0]),len(sequences_padded[0])

(93, 600)

In [92]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Conv1D,Dense,Dropout,BatchNormalization,MaxPooling1D

In [93]:
x_train,x_test,y_train,y_test = train_test_split(sequences_padded,
                                                     y_data,
                                                     test_size = 0.2,
                                                    random_state = 2)

In [94]:
embedding_matrix = np.zeros((vocb_size,embedd_dim))
for word, index in token.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [95]:
embedding_matrix.shape

(50000, 300)

In [123]:
model = Sequential()

In [103]:
seq_len

600

In [124]:
model.add(Embedding(input_dim = vocb_size, output_dim = embedd_dim,
                    weights = [embedding_matrix],input_length=seq_len, trainable = False))

model.add(LSTM(200,activation = 'relu',return_sequences=True))
model.add(LSTM(100,activation = 'relu',return_sequences=False))

model.add(Dropout(0.40))

model.add(Dense(1024,activation = 'relu'))
model.add(Dense(512,activation = 'relu'))

model.add(Dropout(0.40))
model.add(BatchNormalization())

model.add(Dense(256,activation = 'relu'))
model.add(Dense(128,activation = 'relu'))

model.add(Dense(10,activation = 'softmax'))

In [125]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 600, 300)          15000000  
_________________________________________________________________
lstm_9 (LSTM)                (None, 600, 200)          400800    
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               120400    
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              103424    
_________________________________________________________________
dense_12 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)              

In [109]:
checkpoint_filepath = "./check_points/"
if os.path.isdir(checkpoint_filepath) is not True:
    print("Folder for Checkpoints doen't exists")
    os.mkdir(checkpoint_filepath)
    
if os.path.isdir(checkpoint_filepath) is True:
    print("Folder of Checkpoints exists")

Folder for Checkpoints doen't exists
Folder of Checkpoints exists


In [112]:
checkpoint = ModelCheckpoint(checkpoint_filepath + '/{epoch:03d}-{val_accuracy:.4f}.hdf5',
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             mode='max')

In [126]:
model.compile(optimizer='adam',loss = 'categorical_crossentropy',metrics=['accuracy'])

In [127]:
EPOCHS = 500
model.fit(x=x_train,
          y=y_train,
          batch_size=250,
          epochs=EPOCHS,
         callbacks=[checkpoint],
         validation_split= 0.2)

Train on 470115 samples, validate on 117529 samples
Epoch 1/500

Epoch 00001: val_accuracy improved from -inf to 0.16900, saving model to ./check_points//001-0.1690.hdf5
Epoch 2/500

Epoch 00002: val_accuracy did not improve from 0.16900
Epoch 3/500

Epoch 00003: val_accuracy did not improve from 0.16900
Epoch 4/500

KeyboardInterrupt: 

In [None]:
y.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence