In [None]:
import keras
from keras.models import Model
import pandas as pd
from keras import regularizers
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import json
import gensim
import glob
from keras.preprocessing import image
import numpy as np
import re
from keras import utils
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Load Google's pre-trained Word2Vec model.
# 'google_w2vec.bin' is a relative path, so the file is looked up in the
# notebook's working directory; binary=True selects the binary word2vec format.
# The resulting `word_vectors` object is used below for vocabulary membership
# tests and per-word vector lookups.
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('google_w2vec.bin', binary=True)

In [None]:
#Print Word2Vec Specifications
# Report, one per line: vocabulary size, the type of the vocabulary container,
# and the embedding dimensionality (also kept in `vecsize`).
vecsize = word_vectors.vector_size
for report_line in (
    ('Vocabulary Size:', len(word_vectors.vocab)),
    (type(word_vectors.vocab),),
    ('Vector Size:', vecsize),
):
    print(*report_line)

In [None]:
#Define File-Path to Users Folder
# One sub-folder per hashtag; get_df uses a folder's position in this list as
# the integer class label of every image found inside it.
folder_list = [
    "#foodporn",
    "#nightlife",
    "#cosmetics",
    "#rockclimbing",
]
# Root directory holding the hashtag sub-folders (note the trailing slash).
image_path = "/Users/kmotwani/Desktop/Me/Education/Courses/Capstone Project/Insta Images/"

In [None]:
#Helper Function to get images from path
def get_df(path, list_input, thresh):
    """Load captioned images from per-class folders into a DataFrame.

    For every folder name in `list_input` (looked up under `path`), each
    ``*.jpg`` file is paired with the ``.txt`` file of the same base name.
    Images without a caption file are skipped.

    Parameters
    ----------
    path : str
        Directory containing the class folders. A trailing slash is optional.
    list_input : list of str
        Folder names; the position of a folder in this list becomes the
        integer ``Response`` label of its images.
    thresh : int
        Stop reading a folder once exactly this many captioned images have
        been collected from it. Because the check is an equality test made
        after each accepted image, a value of 0 or less never triggers it.

    Returns
    -------
    pandas.DataFrame
        One row per captioned image with columns ``File`` (file name),
        ``Response`` (class index), ``Image`` (the image resized to 128x128,
        as a numpy array) and ``Caption`` (caption lines stripped and joined
        with single spaces). Empty if nothing was found.
    """
    import os  # local import: only this helper needs path manipulation

    final_list = []
    for ind, i in enumerate(list_input):
        temp_path = os.path.join(path, i)
        print(i,"\n")
        count = 0
        for j in glob.glob(os.path.join(temp_path, '*.jpg')):
            file_name = os.path.basename(j)
            caption_path = os.path.splitext(j)[0] + '.txt'
            # Read the caption first: an image without one is skipped, so there
            # is no point paying for decoding and resizing it beforehand.
            try:
                with open(caption_path, encoding="utf-8") as f:
                    caption = ' '.join(line.strip() for line in f)
            except FileNotFoundError:
                continue
            img = image.load_img(j, target_size=(128, 128))
            final_list.append({
                'File': file_name,
                'Response': int(ind),
                'Image': np.array(img),
                'Caption': caption,
            })
            count += 1
            if count==thresh:
                break
    return pd.DataFrame(final_list)


#Get Images from User List and Path
# Collect at most 9000 captioned images from each hashtag folder.
df = get_df(path=image_path, list_input=folder_list, thresh=9000)
print("Number of images loaded:", df.shape[0])
display(df.head())

In [None]:
#Remove Hashtags
# Strip each class hashtag from the captions, ignoring case (presumably so the
# label cannot simply be read off the caption text).
for hashtag in ('#foodporn', '#rockclimbing', '#nightlife', '#cosmetics'):
    df['Caption'] = df['Caption'].str.replace(hashtag, '', case=False)
display(df.head())

In [None]:
#Split data into Train and Test set
# The shuffle gets its own fixed random_state: previously it ran before
# np.random.seed() was called, so the row order - and therefore which rows fell
# into train/test - changed on every run even though the mask was seeded.
use_df = df.sample(frac=1, random_state=9001).reset_index(drop=True)
display(use_df.head())
# Seeded boolean mask: roughly 70% of the rows go to train, the rest to test.
np.random.seed(9001)
msk = np.random.rand(len(use_df)) < 0.7
total_data_train = use_df[msk]
total_data_test = use_df[~msk]

In [None]:
#Helper function to get text vector
def get_vector(x, limit):
    """Embed a caption as a fixed-size (limit, 300) matrix of word vectors.

    Punctuation (anything that is neither a word character nor whitespace) is
    removed and the text is split on whitespace. Row ``k`` holds the vector of
    the ``k``-th word when that word is in the embedding vocabulary; rows for
    out-of-vocabulary words, and rows past the end of the text, stay zero.
    Words at position ``limit`` or later are ignored.
    """
    embedded = np.zeros((limit, 300), dtype=float)
    tokens = re.sub(r'[^\w\s]','',x).split()
    # Rows are indexed by word position, so unknown words leave a zero row
    # rather than being squeezed out.
    for position, token in enumerate(tokens[:limit]):
        if token in word_vectors.vocab:
            embedded[position] = word_vectors.get_vector(token)
    return embedded

#Define Train and Test inputs (lists of embedded captions) and one-hot targets
# Each caption is coerced to str and embedded into a (100, 300) matrix.
x_train = [get_vector(caption, 100) for caption in total_data_train['Caption'].astype(str)]
x_test = [get_vector(caption, 100) for caption in total_data_test['Caption'].astype(str)]
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
y_train = utils.to_categorical(total_data_train['Response'].values, num_classes=4)
y_test = utils.to_categorical(total_data_test['Response'].values, num_classes=4)

In [None]:
#Change X,Y to Numpy Arrays
# Only the X lists are converted here. Each list entry comes from
# get_vector(x, 100), which returns a (100, 300) matrix, so the stacked arrays
# have shape (n_samples, 100, 300). y_train / y_test were already produced by
# utils.to_categorical in the previous cell.
x_train = np.array(x_train)
x_test = np.array(x_test)

In [None]:
#Helper function to create the LSTM model for caption (text) classification
def createModel(size, classes):
    """Build a single-layer LSTM classifier over embedded captions.

    Parameters
    ----------
    size : int
        Number of LSTM units; also the length of the feature vector the LSTM
        layer (layer 0) emits per caption.
    classes : int
        Number of output classes.

    Returns
    -------
    keras.models.Sequential
        Uncompiled model taking inputs of shape (100, 300): 100 word positions
        by 300 embedding dimensions. Its summary is printed as a side effect.
    """
    model = Sequential()
    # return_sequences=False: only the final LSTM state is passed on.
    model.add(keras.layers.LSTM(size, input_shape=(100, 300), return_sequences=False))
    # softmax, not relu: the targets are one-hot class vectors, so the output
    # should be a probability distribution. A relu output unit can also get
    # stuck at zero and stop learning.
    model.add(Dense(classes, activation='softmax'))
    model.summary()
    return model

#Helper function to run model and save intermediate weights
def run_model(model, x_train, y_train, x_test, y_test, batch_size, epochs):
    """Compile and fit `model`, checkpointing the best validation accuracy.

    The model is compiled with the Adam optimizer, mean-squared-error loss and
    an accuracy metric, then trained on (x_train, y_train) with
    (x_test, y_test) as validation data. A ModelCheckpoint callback monitoring
    ``val_acc`` (mode ``max``, ``save_best_only``, ``period=5``) writes files
    named "KRM_LSTM_weights-<epoch>-<val_acc>.hdf5".

    Returns the trained model together with the object returned by ``fit``.
    """
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    checkpoint = ModelCheckpoint(
        "KRM_LSTM_weights-{epoch:02d}-{val_acc:.2f}.hdf5",
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max',
        period=5,
    )
    training_history = model.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpoint],
    )
    return model, training_history

In [None]:
#Create and Fit Model
# 64 LSTM units and 4 output classes; 50 epochs with mini-batches of 512,
# validated on the held-out split.
model = createModel(size=64, classes=4)
model, history = run_model(
    model, x_train, y_train, x_test, y_test,
    batch_size=512,
    epochs=50,
)

In [None]:
# Reload a saved model from disk; presumably one of the checkpoints written by
# run_model, whose files are named "KRM_LSTM_weights-{epoch:02d}-{val_acc:.2f}.hdf5".
# NOTE(review): the epoch and accuracy in this file name are hard-coded, so the
# cell only works if a training run produced exactly this checkpoint - update
# the name after retraining.
from keras.models import load_model
best_model = load_model('KRM_LSTM_weights-10-0.86.hdf5')
best_model.summary()

In [None]:
#Helper function to create feature maps
def get_feature_maps(model, layer_id, input_text):
    """Return the output of layer `layer_id` of `model` for `input_text`.

    A new Model sharing `model`'s input and ending at the chosen layer's
    output is built on every call and used to run ``predict`` on `input_text`.
    """
    target_layer = model.layers[layer_id]
    extractor = Model(inputs=[model.input], outputs=[target_layer.output])
    return extractor.predict(input_text)

In [None]:
def get_array(x, y, label):
    """Turn embedded captions into a DataFrame of layer-0 features of `best_model`.

    Parameters
    ----------
    x : array-like
        Embedded captions, reshaped here to (n_samples, 100, 300).
    y : array-like
        One row of class scores / one-hot labels per sample; the arg-max of
        each row is stored as ``Response``.
    label : str
        Prefix of the feature columns, which are named ``label + "0"``,
        ``label + "1"``, ...

    Returns
    -------
    pandas.DataFrame
        One row per sample holding ``Response`` and the feature columns;
        an empty DataFrame when `x` is empty.
    """
    if len(x) == 0:
        return pd.DataFrame([])
    # One batched forward pass. The previous per-sample loop rebuilt the
    # feature-extraction model and called predict() once for every row.
    batch = np.asarray(x).reshape(-1, 100, 300)
    feature_maps = get_feature_maps(best_model, 0, batch)
    final_list = []
    for sample_ind, features in enumerate(feature_maps):
        temp_dict = {'Response': np.argmax(y[sample_ind])}
        for feature_ind, value in enumerate(features):
            temp_dict[label+str(feature_ind)] = value
        final_list.append(temp_dict)
    return pd.DataFrame(final_list)

# Extract LSTM features for the train and test splits, stack them into one
# frame, and report the shape of each.
df_vec1 = get_array(x=x_train, y=y_train, label="Feature_")
df_vec2 = get_array(x=x_test, y=y_test, label="Feature_")
main_df = pd.concat([df_vec1, df_vec2])
for frame in (df_vec1, df_vec2, main_df):
    print(frame.shape)
display(main_df.head())

In [None]:
# Persist the combined feature table to the working directory.
# NOTE(review): to_csv is called with its defaults and main_df was built with
# pd.concat without resetting the index, so the row index is presumably written
# as the first column and its labels may repeat between the two concatenated
# parts - confirm this is what downstream readers expect.
main_df.to_csv('LSTM_DF.csv')

In [None]:
%%file Text_LSTM.py

from keras.models import load_model
import numpy as np
from keras.models import Model
from gensim.models import KeyedVectors
import re 

class Main:
    """Inference wrapper around a saved Keras LSTM model and word2vec embeddings.

    Typical use: ``embedd_text`` (text -> matrices), ``predict`` (matrices ->
    layer-0 features), then ``combine`` (attach the features to a DataFrame).
    """

    #Initialize with saved model and embeddings
    def __init__(self, path_model, path_embedding):
        """Load the Keras model at `path_model` and the binary word2vec file at `path_embedding`."""
        self.model = load_model(path_model)
        self.word_vectors = KeyedVectors.load_word2vec_format(path_embedding, binary=True)

    #Helper function to create feature maps
    def get_feature_maps(self, layer_id, input_text):
        """Return the output of layer `layer_id` of the loaded model for `input_text`.

        A new Model sharing the loaded model's input and ending at the chosen
        layer is built on every call.
        """
        model_ = Model(inputs=[self.model.input], outputs=[self.model.layers[layer_id].output])
        return model_.predict(input_text)

    #Helper function to get embedding
    def embedd_text(self, x):
        """Embed each text in `x` as a (100, 300) matrix; returns a list of matrices.

        Punctuation is removed and the text is split on whitespace. Row ``k``
        holds the vector of the ``k``-th word if it is in the vocabulary;
        rows of unknown words and rows past the end of the text stay zero,
        and words beyond position 99 are ignored. Every item is coerced with
        ``str`` first, so non-string entries (e.g. None or NaN) no longer
        raise a TypeError.
        """
        final = []
        for i in x:
            sequence = np.zeros((100, 300), dtype=float)
            words = re.sub(r'[^\w\s]','',str(i)).split()
            # Rows are indexed by word position, so unknown words leave gaps.
            for position, word in enumerate(words[:100]):
                if word in self.word_vectors.vocab:
                    sequence[position] = self.word_vectors.get_vector(word)
            final.append(sequence)
        return final

    #Helper function to predict
    def predict(self, x):
        """Return, for each embedded text in `x`, the list of layer-0 outputs.

        All inputs go through the network in a single batched call; the
        previous per-item loop rebuilt the feature-extraction model and
        printed a progress line for every row. An empty `x` yields ``[]``.
        """
        print("LSTM Prediction in progress.\n")
        if len(x) == 0:
            return []
        batch = np.asarray(x).reshape(-1, 100, 300)
        return [list(row) for row in self.get_feature_maps(0, batch)]

    #Helper function to combine DF to predictions
    def combine(self, df, prediction, label):
        """Add one column per predicted feature to `df` and return it.

        Columns are named ``label + "1"``, ``label + "2"``, ... (1-based).
        `df` is modified in place. When `prediction` is empty there is nothing
        to add and `df` is returned unchanged instead of raising IndexError.
        """
        if len(prediction) == 0:
            return df
        new_cols = np.zeros((len(df),len(prediction[0])))
        for ind, i in enumerate(prediction):
            new_cols[ind,:] = i
        for i in range(len(prediction[0])):
            df[label+str(i+1)] = new_cols[:,i]
        return df

In [None]:
import Text_LSTM

#Import Model and Embedding
# Build the inference wrapper from a saved model file and the word2vec binary.
obj = Text_LSTM.Main(
    path_model='KRM_LSTM_New_weights-40-0.87.hdf5',
    path_embedding='google_w2vec.bin',
)

In [None]:
#Create DataFrame
# Two hand-written example posts used to exercise the inference wrapper.
import pandas as pd
df = pd.DataFrame(
    data=[
        ["Hey", 123, "www.google.co.in"],
        ["Who are you?", 123, "www.hotmail.co.in"],
    ],
    columns=['Post', 'ID', 'URL'],
)
display(df.head())

In [None]:
#Define Post
# Select the text column of the example frame; these are the posts to embed.
x = df['Post']

In [None]:
#Get embedding for input text
# Embed straight from the DataFrame column rather than from `x` itself:
# `x = obj.embedd_text(x)` was self-referential, so re-running the cell fed
# already-embedded arrays back into the text embedder.
x = obj.embedd_text(df['Post'])
# Print only the overall shape (posts, word positions, embedding size) instead
# of dumping the raw matrices.
print(np.shape(x))

In [None]:
#Get Predictions
# obj.predict returns one list per embedded post, holding the outputs of
# layer 0 of the loaded model for that post.
pred = obj.predict(x)
print(pred)

In [None]:
#Combine Prediction
# Attach the predicted features to the posts frame as columns named
# "LSTM_Feature_1", "LSTM_Feature_2", ...
df = obj.combine(df=df, prediction=pred, label="LSTM_Feature_")
display(df.head())