In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stopword = set(stopwords.words('english') )
#stopword

# Data Preprocessing

In [4]:

## Read the file 
def readFile(file): 
    f = open(file,'r',encoding='utf-8')
    text = f.read()
    sentences = nltk.sent_tokenize(text)
    
    data = []
    for sent in sentences:
        words =  nltk.word_tokenize(sent)
        words = [w.lower() for w in words if len(w)>2 and w not in stopword]
        data.append(words)
        
    return data

In [5]:

file = "/content/Word_data_set.txt"
text = readFile(file)

print(text)

[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['the', 'wedding', 'season', 'year', 'kicked', 'deepika', 'padukone', 'ranveer', 'singh', 'flew', 'lake', 'como', 't

# Creating our model 

In [6]:
import gensim 
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

In [7]:
model = Word2Vec(text, size = 300, window=10, min_count=1 )
print(model)

Word2Vec(vocab=915, size=300, alpha=0.025)


In [8]:
vocab = list(model.wv.vocab)

print(vocab)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'deepveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'season', 'kicked', 'flew', 'lake', 'como', 'tie', 'knot', 'days', 'november', 'they', 'several', 'bengaluru', 'mumbai', 'even', 'continued', 'tied', 'american', 'singer', 'jodhpur', 'december', 'yet', 'another', 'spread', 'week', 'hosted', 'delhi', 'host', 'party', 'los', 'angeles', 'time', 'could', 'move', 'mukesh', 'nita', 'dau

# Testing Our Model

In [9]:

def predict_actor(a,b,c,word_vectors):
    """Accepts a triad of words, a,b,c and returns d such that a is to b : c is to d"""
    a,b,c = a.lower(),b.lower(),c.lower()
    max_similarity = -100 
    
    d = None
    words =  ["ranveer","deepika","padukone","singh","nick","jonas","chopra","priyanka","virat","anushka","ginni"]
    
    wa,wb,wc = word_vectors[a],word_vectors[b],word_vectors[c]
    
    #to find d s.t similarity(|b-a|,|d-c|) should be max
    
    for w in words:
        if w in [a,b,c]:
            continue
        
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa],[wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
    return d

In [10]:

triad = ("nick","priyanka","virat")
predict_actor(*triad,model)

'anushka'

In [11]:
triad = ("ranveer","padukone","jonas")
predict_actor(*triad,model)

'priyanka'

In [12]:

triad = ("priyanka","jonas","nick")
predict_actor(*triad,model)

'chopra'

In [13]:
triad = ("deepika","padukone","priyanka")
predict_actor(*triad,model)

'jonas'

In [14]:
triad = ("jonas","priyanka","virat")
predict_actor(*triad,model)

'anushka'

In [19]:
triad = ("deepika","priyanka","ranveer")
predict_actor(*triad,model)

'nick'