# 1st Approach - Word Embedding - keras one_hot() - cosine_similarity

In [62]:
import pandas as pd
import numpy as np

In [63]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Hitesh
[nltk_data]     khatana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
from tensorflow.keras.preprocessing.text import one_hot , Tokenizer

In [65]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [66]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [67]:
from sklearn.metrics.pairwise import cosine_similarity

In [109]:
full_data = pd.read_csv("caption_data.csv")
full_data["friends"][1] = "Dogs are real friends of a human being"
full_data.head(2)


Unnamed: 0,friends,funny,Selfie,couple,question,Winter,summer,spring,food,travel
0,I never let my friends do stupid thingsalone!,"In this section, we include funny Instagram ca...",,,Where do you travel next?,To appreciate the beauty of a snowflake it is ...,"This summer fashion? My bikini!,",April showers bring May flowers.,I like people that protect their food like it...,"Catch flights, not feelings."
1,Dogs are real friends of a human being,My excuse is that I'm young.,"I'm not perfect, I'm original.",What made you fall for him? He never asked me ...,Have you ever been to Bali?,"Give me a hot drink, and Im happy. Hot cider, ...","Less Monday, More Summer.,","O, wind, if winter comes, can spring be far be...",#foodporn,"Travel is fatal to prejudice, bigotry, and nar..."


In [231]:
# replace missing values to zero
# remove stopwords
# returns filtered data and vocabalry size
def preprocessing(data):
    for column in data.columns:
        data[column] = data[column].fillna(0)
    vocab = 0
    for column in data.columns:
        for i in range(0,len(data)):
            filter_words = []
            for words in str(data[column][i]).lower().split():
                if words not in stopwords.words('english'):
                    filter_words.append(words)
                    vocab += 1
            data[column][i] = " ".join(filter_words)
    return data , vocab

In [232]:
# one hot encoding with keras one_hot function but unicity is not guaranteed in keras one_hot function.
def one_hot_func(data , vocab):
    final = []
    for column in data.columns:
        col_rep = []
        for sent in data[column]:
            sent_rep = one_hot(sent , vocab)
            col_rep.append(sent_rep)
        final.append(col_rep)
    return final

In [233]:
# embedds the data with specific sentence length.
def embedd(final):
    sent_length=50
    embedded = []
    for col in range(0,np.array(final).shape[0]):
        c_embed = []
        for i in range(0,np.array(final).shape[1]):
            embedded_docs = pad_sequences([final[col][i]],padding='pre',maxlen=sent_length)
            c_embed.append(embedded_docs)
        embedded.append(c_embed)
    return embedded , sent_length

In [234]:
# perceptron model for predicting vectors.
#here "100" is number of features for every input.
def vector_model(embedded , vocab , sent_length):
    model=Sequential()
    model.add(Embedding(vocab,100,input_length=sent_length))
    model.compile('adam','mse')
    
    
    vectors = []
    for col in range(0, np.array(embedded).shape[0]):
        col_v = []
        for i in range(0,np.array(embedded).shape[1]):
            col_vectors = model.predict(embedded[col][i])
            col_v.append(col_vectors)
        vectors.append(col_v)
    return vectors , model

test data from object detection

In [235]:
def test_data(data , target_value):
    data["target"] = pd.Series([target_value])
    data , vocab = preprocessing(data)
    
    final = one_hot_func(data , vocab)
    
    embedded , sent_length = embedd(final)
    
    vectors , model = vector_model(embedded , vocab , sent_length)
    
    return vectors

In [236]:
target = " dogs person"
vectors = test_data(full_data.copy() , target)

In [237]:
np.array(vectors).shape

(11, 27, 1, 50, 100)

In [238]:
# returns cosine similarity between two vectors
def get_cosine_similarity(feature_vec_1, feature_vec_2):    
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

 get genre from 'friends', 'funny', 'Selfie', 'couple', 'question', 'Winter', 'summer',
       'spring', 'food', 'travel'

In [239]:
# predict cosine similarity between target vector and captions vector
# returns top 5 vectors having maximum vectors..
# input (caption data , vectors , genre provided to search in specific column)
def predict(data , vectors ,genre):
    column = data.columns.get_loc(genre)
    result = []
    test_vector = vectors[10][0]
    for i in range(0,data.shape[0]):
        result.append(get_cosine_similarity(test_vector,vectors[column][i]))
        
        
    indexes = np.argsort(np.array(result))[-5:]
    
    
    return data[genre].loc[indexes] , result

In [240]:
result , res = predict(full_data , vectors , "friends")

In [241]:
res.index(max(res))

1

In [242]:
full_data["friends"][0]

'I never let my friends do stupid thingsalone!'

In [245]:
keras_ = result
keras_

18    I and my best friend can communicate with just...
22    You had me at We'll make it look like an accid...
6     You have to be crazy to hang out with me I'll ...
5     I know what tighter, our jeans or our friendship.
1                Dogs are real friends of a human being
Name: friends, dtype: object

# 2nd Approach - Word embedding - One-hot encoding using dictionary

In [246]:
def dictionary(data):
    for column in data.columns:
        data[column] = data[column].fillna(0)
    vocab = 0
    all_words = []
    for column in data.columns:
        for i in range(0 ,len(data)):
            filter_words = []
            for words in str(data[column][i]).lower().split():
                if words not in stopwords.words('english'):
                    all_words.append(words)
                    filter_words.append(words)
                    vocab += 1
                    
            data[column][i] = " ".join(filter_words)
    all_words = " ".join(all_words)    
    t = Tokenizer()
    t.fit_on_texts([all_words])
    
    
    return t.word_index , vocab , data

In [247]:
def one_hot_encoding(dic ,data):
    e_whole = []
    for column in data.columns:
        e_column = []
        for sent in data[column]:
            e_sent = []
            for word in sent.split():
                try:
                    word = re.sub(r"[^a-zA-Z0-9]+", '', word)
                    e_sent.append(int(dic[word]))
                except:
                    continue
                   
            e_column.append(e_sent)
        e_whole.append(e_column)
            
    return e_whole

In [248]:
def embedd(final):
    sent_length=50
    embedded = []
    for column in final:
        emb_column = []
        for sentence in column:
            embedded_sent = pad_sequences([sentence],padding='pre',maxlen=sent_length)
            emb_column.append(embedded_sent)
        embedded.append(emb_column)
    return embedded , sent_length

In [249]:
def vector_model(data , embedded , vocab , sent_length):
    model=Sequential()
    model.add(Embedding(vocab,80,input_length=sent_length))
    model.compile('adam','mse')
    
    
    vectors = []
    for col in range(0,data.shape[0]):
        c_vectors = []
        for i in range(0,data.shape[1]):
            col_vectors = model.predict(embedded[i])
            c_vectors.append(col_vectors)
        vectors.append(c_vectors)
    return vectors , model

In [250]:
def test_data(data , target_value):
    data["target"] = pd.Series([target_value])
    dic , vocab ,filtered= dictionary(data)

    encoded_result = one_hot_encoding(dic , filtered)
    
    embedded , sent_length = embedd(encoded_result)
    
    vectors , model = vector_model(data , embedded , vocab , sent_length)
    
    return vectors

# ----------------------------- target from object detection -------------------------------------

In [251]:
target = "dog person"
vectors = test_data(full_data.copy() , target)

In [252]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):    
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

In [253]:
def predict(data , vectors ,genre):
    column = data.columns.get_loc(genre)
    test_vector = vectors[0][10]
    result = []
    for i in range(0,data.shape[0]):
        result.append(get_cosine_similarity(test_vector,vectors[i][column]))
        
    indexes = np.argsort(np.array(result))[-5:]
    
    
    return data[genre].loc[indexes] , result

In [254]:
np.array(vectors).shape

(27, 11, 1, 50, 80)

In [255]:
result , res = predict(full_data , vectors , "friends")

In [256]:
res.index(max(res))

0

In [257]:
res.index(max(res))

0

In [258]:
dic_ = result
dic_

3     Friendship is like peeing on yourself: everyon...
2     I like to hang out with people who make me for...
1                Dogs are real friends of a human being
12    Pay close attention to people who clap when yo...
26    walk behind me; I may not lead. walk in front ...
Name: friends, dtype: object

# 3rd Approach - spacy Library - Pretrained model(en_core_Web_leg)
## [ Model contains 1 M word vectors.]

In [207]:
import spacy
import pandas as pd
import numpy as np

import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Hitesh
[nltk_data]     khatana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [209]:
import en_core_web_lg
nlp = en_core_web_lg.load()

In [265]:
def preprocessing(data):
    vocab = 0
    for column in data.columns:
        data[column] = data[column].fillna(0)
        for i in range(0,len(data)):
            filter_words = []
            for words in str(data[column][i]).lower().split():
                if words not in stopwords.words('english'):
                    filter_words.append(words)
            
                    vocab += 1
            data[column][i] = " ".join(filter_words)
            
    return data , vocab

In [266]:
captions , vocab = preprocessing(full_data.copy())

In [267]:
captions.head(2)

Unnamed: 0,friends,funny,Selfie,couple,question,Winter,summer,spring,food,travel
0,never let friends stupid thingsalone!,"section, include funny instagram captions shor...",0,0,travel next?,appreciate beauty snowflake necessary stand co...,"summer fashion? bikini!,",april showers bring may flowers.,like people protect food like would baby.,"catch flights, feelings."
1,dogs real friends human,excuse i'm young.,"i'm perfect, i'm original.",made fall him? never asked justify past.,ever bali?,"give hot drink, im happy. hot cider, hot choco...","less monday, summer.,","o, wind, winter comes, spring far behind?",#foodporn,"travel fatal prejudice, bigotry, narrow minded..."


In [268]:
def predict(nlp , column , test):
    test_vector = nlp(test)
    result = []
    for caption in column:
        result.append(test_vector.similarity(nlp(caption)))
        
        
    indexes = np.argsort(np.array(result))[-5:]
    
    return indexes , result

In [269]:
indexes , result = predict(nlp , full_data["friends"] , "dog person")

In [270]:
indexes

array([23, 10, 13, 17,  1], dtype=int64)

In [271]:
spacy_ = full_data["friends"].loc[indexes]
spacy_

23    Walking with a friend in the dark is better th...
10    It takes a long time to grow an old friend. “ ...
13    A real friend is one who walks in when the res...
17    Friends can help each other. A true friend is ...
1                Dogs are real friends of a human being
Name: friends, dtype: object

In [272]:
result.index(max(result))

1

In [273]:
dic_

3     Friendship is like peeing on yourself: everyon...
2     I like to hang out with people who make me for...
1                Dogs are real friends of a human being
12    Pay close attention to people who clap when yo...
26    walk behind me; I may not lead. walk in front ...
Name: friends, dtype: object

In [274]:
keras_

18    I and my best friend can communicate with just...
22    You had me at We'll make it look like an accid...
6     You have to be crazy to hang out with me I'll ...
5     I know what tighter, our jeans or our friendship.
1                Dogs are real friends of a human being
Name: friends, dtype: object

In [275]:
spacy_

23    Walking with a friend in the dark is better th...
10    It takes a long time to grow an old friend. “ ...
13    A real friend is one who walks in when the res...
17    Friends can help each other. A true friend is ...
1                Dogs are real friends of a human being
Name: friends, dtype: object