## Importing required libraries

In [1]:
import pandas as pd
import gensim

## Merged 2 excel sheets datasets so in total we have used 329 asanas benefits

In [2]:
df = pd.read_csv('test.csv')

In [3]:
df

Unnamed: 0,Asana,Benefits
0,PADOTTHANASANA,This asana strengthens\nthe abdominal muscles ...
1,PARVATASANA,This pose strengthens the nerves and muscles i...
2,ARDHA TITALI ASANA,This is an excellent \npreparatory practice fo...
3,GATYATMAK MERU \nVAKRASANA,This asana removes stiffness \nof the back and...
4,SIDEWAYS VIEWING,Sideways viewing relaxes the \ntension of the ...
...,...,...
321,kapilasana,"1. Helps in stretching legs, hamstrings, arms,..."
322,omkarasana,It relieves the pain of hands and feet( It is ...
323,kashyapawsana,1. This asana provides a deep muscular massage...
324,bhunamanasana,1. Bhunamanasana stretches and improves the fl...


In [4]:
df.shape

(326, 2)

## Removing stop words at first and converting it into lower cases so that all stop words can be removed which were in upper case.


In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
#  1st step for removing stop words is to use the library and remove stop words, this will make 
# sure that some basic stop words and numerical values is been removed from the Benefits column.

# 2nd step is that there might be stop words present in upper case for example "This" is a stop word
# which was not removed in 1st step so I converted the Benefits column to lower case. Moreover we can only  
# lowercase any sentence or entire column when that column is free of any numerical value.

# 3rd then further removing stopwords, in this way we get the whole benefits column free from stop words


from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces
df['Benefits'] = df['Benefits'].apply(str.lower)
df['Benefits']=df.Benefits.apply(remove_stopwords)
df['Benefits']=df.Benefits.apply(strip_non_alphanum)
df['Benefits']=df.Benefits.apply(strip_numeric)
df['Benefits']=df.Benefits.apply(strip_multiple_whitespaces)
df['Asana']=df.Asana.apply(strip_multiple_whitespaces)
df['Asana'] = df['Asana'].apply(str.lower)
df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Asana,Benefits
0,padotthanasana,asana strengthens abdominal muscles massages o...
1,parvatasana,pose strengthens nerves muscles limbs back hel...
2,ardha titali asana,excellent preparatory practice loosening knee ...
3,gatyatmak meru vakrasana,asana removes stiffness increases flexibility ...
4,sideways viewing,sideways viewing relaxes tension muscles strai...
...,...,...
321,kapilasana,helps stretching legs hamstrings arms chest b...
322,omkarasana,relieves pain hands feet helpful rheumatism gi...
323,kashyapawsana,asana provides deep muscular massage abdomen ...
324,bhunamanasana,bhunamanasana stretches improves flexibility ...


## We have further removed special characters and tokenized each row of benefits. 

In [6]:
benefits =df['Benefits'].apply(gensim.utils.simple_preprocess)
print(benefits)

0      [asana, strengthens, abdominal, muscles, massa...
1      [pose, strengthens, nerves, muscles, limbs, ba...
2      [excellent, preparatory, practice, loosening, ...
3      [asana, removes, stiffness, increases, flexibi...
4      [sideways, viewing, relaxes, tension, muscles,...
                             ...                        
321    [helps, stretching, legs, hamstrings, arms, ch...
322    [relieves, pain, hands, feet, helpful, rheumat...
323    [asana, provides, deep, muscular, massage, abd...
324    [bhunamanasana, stretches, improves, flexibili...
325    [stretches, strengthens, lengthens, mandalasan...
Name: Benefits, Length: 326, dtype: object


In [7]:
benefits[0] # printing the tokenized words of the first benefit column

['asana',
 'strengthens',
 'abdominal',
 'muscles',
 'massages',
 'organs',
 'strengthens',
 'digestive',
 'system',
 'lower',
 'back',
 'pelvic',
 'perineal',
 'muscles',
 'helps',
 'correct',
 'prolapse']

##Word2vec implementation 

In [11]:
# We have taken the window size as 5 and size of each vector embedding to be 50
model = gensim.models.Word2Vec(
    window=5,
    min_count=2,
    workers=4,
    size = 50,
)


##Building our vocabulary of unique words

In [12]:
model.build_vocab(benefits, progress_per=5)

##Training our Word2vec model

In [13]:
model.train(benefits, total_examples=model.corpus_count, epochs=2000) #epochs size is 2000

(22225305, 29294000)

## We can test our model for any word suppose 'sciatica' and then we will get all the similar words 

In [14]:
model.wv.most_similar("sciatica", topn= 100)

[('mild', 0.45928409695625305),
 ('waist', 0.39905768632888794),
 ('time', 0.3939394950866699),
 ('disc', 0.39091750979423523),
 ('slipped', 0.3907482922077179),
 ('pains', 0.3755643963813782),
 ('sleep', 0.36869293451309204),
 ('sinus', 0.35761627554893494),
 ('directions', 0.35165175795555115),
 ('sciatic', 0.3484801650047302),
 ('spondylitis', 0.34108608961105347),
 ('spinal', 0.33181777596473694),
 ('discs', 0.3316720426082611),
 ('rounded', 0.3283521831035614),
 ('injured', 0.3262965679168701),
 ('carotid', 0.3242065906524658),
 ('calves', 0.32205623388290405),
 ('chin', 0.3176332712173462),
 ('periods', 0.31546464562416077),
 ('rid', 0.3147170841693878),
 ('levels', 0.31083300709724426),
 ('certain', 0.3080895245075226),
 ('backache', 0.3064400255680084),
 ('focused', 0.2935173809528351),
 ('movement', 0.2876668870449066),
 ('heels', 0.286886990070343),
 ('heals', 0.2845405340194702),
 ('flexibility', 0.28397995233535767),
 ('titli', 0.2824893891811371),
 ('ailments', 0.275742858

## Printing the vector form of a word in our vocabulary

In [15]:
# suppose we want to know the vector embeddings of a word 'pain' so we have printed the vector embedding of size 50
import numpy as np
print((model.wv.get_vector('pain')))

[ -6.659703     8.92621      3.55505      1.9956577    1.5651033
   1.1927893    3.2711432   -0.33193162  -4.9726915   -2.0623188
   1.2079959    4.5048976   -3.4870558   -0.6522507   -1.6390252
  -0.6603125    1.054047    -4.612692     5.226313   -11.422564
  -4.5877876    1.1274526   -5.5874014  -12.315183     2.797863
  -5.1411667   -3.2106621   -0.8360376   -4.099211    -2.6873517
  -4.0472674    3.795313     3.9301326   -2.2479038   -2.440859
  -4.4600987    1.4331807    8.775812     5.9484344    5.215195
   0.31107217  -1.9708958    5.20776     -1.809946    -0.5112368
  -1.6624253    1.2809285   -3.196001    -7.817722    -0.52106464]


In [16]:
# counts total number of rows in datasets being trained
model.corpus_count

326

##Created a list of unique words from the Benefits column

In [17]:
from gensim.models import Word2Vec
# created list of unique words from the column Benefits. 
words = list(w for w in model.wv.vocab)


In [18]:
# printing unique words from the vocabulary list.
print(words)

['asana', 'strengthens', 'abdominal', 'muscles', 'massages', 'organs', 'digestive', 'system', 'lower', 'back', 'pelvic', 'helps', 'correct', 'prolapse', 'pose', 'nerves', 'limbs', 'increase', 'height', 'stretching', 'ligaments', 'enabling', 'growing', 'bones', 'grow', 'longer', 'circulation', 'stimulated', 'especially', 'upper', 'spine', 'shoulder', 'blades', 'excellent', 'preparatory', 'practice', 'loosening', 'knee', 'hip', 'joints', 'meditative', 'poses', 'people', 'sit', 'legged', 'practise', 'ardha', 'daily', 'morning', 'removes', 'stiffness', 'increases', 'flexibility', 'sideways', 'viewing', 'relaxes', 'tension', 'close', 'work', 'prevents', 'corrects', 'effective', 'suffering', 'slipped', 'disc', 'sciatica', 'certain', 'pain', 'remain', 'extended', 'periods', 'time', 'encourages', 'column', 'resume', 'normal', 'shape', 'releases', 'compression', 'spinal', 'lung', 'ailments', 'simple', 'regularly', 'breath', 'awareness', 'allows', 'air', 'lungs', 'padmasana', 'body', 'held', 'co

In [19]:
# here we have the length of unique words

print(len(words))

1330


## Created an empty dictionary at first to store the unique words as key along with its vector embeddings.

In [20]:
dict_of_word_embeddings = dict({})
for i in words:
    dict_of_word_embeddings[i] = model.wv[i]

In [21]:
# on printing the dictionary, we get the following result.
print(dict_of_word_embeddings)

{'asana': array([-1.6215265 , -2.9498246 , -0.60845935,  2.1405494 ,  4.2702017 ,
       -0.19156015,  0.19165526, -1.3814332 ,  1.0541502 ,  0.09288158,
       -0.9435468 ,  1.5437386 , -0.5238245 ,  1.6398002 ,  0.32082197,
       -2.4062645 ,  1.1405398 ,  1.5691519 ,  3.2310224 , -1.653757  ,
        0.32695675,  2.9202685 , -2.239348  , -0.9516116 , -0.3374356 ,
       -1.4075596 , -0.50712234, -1.6075026 , -1.6083134 ,  0.75188464,
       -2.9462163 ,  0.20514947, -0.73309183, -1.0344026 , -0.40075636,
        0.13980803, -0.21806501, -1.386347  , -0.59286475,  3.1481616 ,
        0.16369706, -1.5332326 , -2.522262  , -2.2598798 , -5.129776  ,
       -0.41129375, -3.4230993 ,  1.9349753 , -1.1868995 , -1.4836173 ],
      dtype=float32), 'strengthens': array([-0.538406  ,  2.4530578 , -0.50095385, -2.2654302 ,  2.110949  ,
        4.042972  , -3.992735  , -2.5465662 ,  1.7607694 , -2.250425  ,
       -1.5829527 ,  3.9945648 ,  1.4404925 , -0.8127251 ,  0.82953227,
       -1.962898

## Creating a Pandas dataframe to store the unique words and its word vectors

In [22]:
Unique_words = dict_of_word_embeddings.keys()
word_vectors  = dict_of_word_embeddings.values()
d = {'Unique_words' : Unique_words , 'Word_Vectors' : word_vectors}
dataframe = pd.DataFrame(data = d)
dataframe

Unnamed: 0,Unique_words,Word_Vectors
0,asana,"[-1.6215265, -2.9498246, -0.60845935, 2.140549..."
1,strengthens,"[-0.538406, 2.4530578, -0.50095385, -2.2654302..."
2,abdominal,"[1.4743589, 1.5471224, -0.3944402, 3.4211702, ..."
3,muscles,"[-3.0863194, -0.20473738, -3.1084447, 0.395654..."
4,massages,"[0.424353, -2.8402915, -7.7556224, -0.21870962..."
...,...,...
1325,nectar,"[-6.476326, -5.1181483, -0.45220834, 4.288335,..."
1326,trataka,"[1.3928177, -4.463665, -0.28648147, 4.0880814,..."
1327,buttock,"[-4.7493706, -1.1005787, 7.318826, -1.4793018,..."
1328,mandalasana,"[-0.7156524, -5.8169804, 1.7904868, -3.024672,..."


##We have made a list of unique asanas

In [23]:
asanas = list(df['Asana'])
print(len(asanas))
asana = []
      

for x in asanas:
  if x not in asana:
    asana.append(x)
print(len(asana)) # length before removing the recurring asanas   

print(asana) # length after removing the recurring asanas   

326
293
['padotthanasana', 'parvatasana', 'ardha titali asana', 'gatyatmak meru vakrasana', 'sideways viewing', 'makarasana', 'padmasana', 'vajrasana', 'ardha chandrasana', 'yogamudrasana', 'bhujangasana', 'saithalyasana', 'bhu namanasana', 'sarvangasana', 'natarajasana', 'poorna bhujangasana', 'koormasana', 'poorna shalabhasana', 'poorna dhanurasana', 'bandha hasta utthanasana ', 'shava udarakarshanasana ', 'chakki chalanasana ', 'kashtha takshanasana ', 'vayu nishkasana', 'ushtrasana', 'samakonasana ', 'matsyasana', 'kandharasana', ' setu asana ', 'paschimottanasana', 'meru akarshanasana', 'pada hastasana', 'seetkari pranayama', 'jalandhara bandha', 'tadagi mudra', 'maha vedha mudra', 'shashankasana', 'janu chakra', 'poorna titali asana', 'manibandha chakra', 'skandha chakra', 'greeva sanchalana', 'padachakrasana', 'pada sanchalanasana', 'supta pawanmuktasana', 'jhulana lurhakanasana', 'supta udarakarshanasana', 'naukasana', 'rajju karshanasana', 'nauka sanchalanasana', 'namaskarasan

## Found One-hot encoded form of each asana name and saving it in a dictionary

In [24]:
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(asana)
#print(integer_encoded)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

### One hot encoding
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# onehot_encoded
asan_dict={}

for i in range(len(asana)):
  asan_dict[asana[i]] = onehot_encoded[i]

print(asan_dict) 


{'padotthanasana': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

##Importing the libraries

In [25]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

##Created a list of tupples where each tupple contains the context word from the benefits column and its respective asana name as the target word

In [26]:
pair=[]

i=0
a=len(asana)
for x in benefits:
  if(i<a):
    target=asana[i]
    for y in x:
      if(y not in words):
        continue
      pair.append((y,target))
  i+=1  
print(pair)




[('asana', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('abdominal', 'padotthanasana'), ('muscles', 'padotthanasana'), ('massages', 'padotthanasana'), ('organs', 'padotthanasana'), ('strengthens', 'padotthanasana'), ('digestive', 'padotthanasana'), ('system', 'padotthanasana'), ('lower', 'padotthanasana'), ('back', 'padotthanasana'), ('pelvic', 'padotthanasana'), ('muscles', 'padotthanasana'), ('helps', 'padotthanasana'), ('correct', 'padotthanasana'), ('prolapse', 'padotthanasana'), ('pose', 'parvatasana'), ('strengthens', 'parvatasana'), ('nerves', 'parvatasana'), ('muscles', 'parvatasana'), ('limbs', 'parvatasana'), ('back', 'parvatasana'), ('helps', 'parvatasana'), ('increase', 'parvatasana'), ('height', 'parvatasana'), ('stretching', 'parvatasana'), ('muscles', 'parvatasana'), ('ligaments', 'parvatasana'), ('enabling', 'parvatasana'), ('growing', 'parvatasana'), ('bones', 'parvatasana'), ('grow', 'parvatasana'), ('longer', 'parvatasana'), ('circulation', 'parvatasana'), 

##Stacked the word embeddings of the context words in the form of a 2D array

In [27]:
contexts=[dict_of_word_embeddings[context] for context,target in pair]
contexts=np.vstack(contexts)
contexts.shape


(12085, 50)

##Stacked the one-hot encodings of target words in the form of a 2D array

In [28]:
targets=[asan_dict[target] for context,target in pair]
targets=np.vstack(targets)
targets.shape

(12085, 293)

##Building the artificial neural network

In [30]:
from tensorflow import keras
from tensorflow.keras.layers import Dense

network_input = keras.Input(shape=contexts.shape[1], name='input_layer')
                                                                                   
hidden_layer1 = Dense(units=1000, activation='sigmoid', name='hidden_layer1')(network_input) # Create a hidden layer for the network; store under 'hidden_layer'

                                                                                            
output_layer = Dense(units=targets.shape[1], activation='softmax', name='output_layer')(hidden_layer1) # Create an output layer for the network; store under 'output_layer'

                                                                        
embedding_model = keras.Model(inputs=network_input, outputs=output_layer)   # Create a Keras Model; store under 'embedding_model'


embedding_model.compile(loss='categorical_crossentropy',optimizer = 'adam', metrics = ['accuracy']) # Compile the model for training; define loss function

                                                         
embedding_model.summary()  # Print out a summary of the model

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 50)]              0         
                                                                 
 hidden_layer1 (Dense)       (None, 1000)              51000     
                                                                 
 output_layer (Dense)        (None, 293)               293293    
                                                                 
Total params: 344,293
Trainable params: 344,293
Non-trainable params: 0
_________________________________________________________________


##Fitting the model

In [34]:
# Fit a model to the data
embedding_model.fit(x=contexts,   # inputs
                    y=targets,   # outputs
                    batch_size=1024,  # how many pairs of words processed simultaneously
                    epochs=100,   # how many times we loop through the whole data
                    verbose=1   # do not print training status
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fd52a30bed0>

##Predicting top 7 asanas from the user input

In [36]:
from collections import Counter
from IPython.display import clear_output

def magic():
  predicted_asanas = []
  user_input_words= []
  final_predicted_asanas = []
  number_in_words = ['first','second', 'third', 'fourth']
  for i in range(4):
    user_input_words.append(input(f"Enter {number_in_words[i]} benefit word:  "))
  for i in user_input_words:
    if i in dict_of_word_embeddings:

      input_array = np.expand_dims(dict_of_word_embeddings[i], axis=0)
      prediction = embedding_model.predict(input_array)
      flatten_pred = prediction.flatten()
      result_indices = flatten_pred.argsort()[-10:][::-1]
    
      for result in result_indices:
        predicted_asanas.append(asana[result])
    
    
  counter_found = Counter(predicted_asanas)
  final_predicted_asanas_with_freq = counter_found.most_common(7)

  for yoga, freq in final_predicted_asanas_with_freq:
    final_predicted_asanas.append(yoga)
  
  print(final_predicted_asanas)
  choice=input("Clear output: Y/N ")
  if choice=='Y':
   clear_output()
  
  
magic()

Enter first benefit word:  pain
Enter second benefit word:  back
Enter third benefit word:  neck
Enter fourth benefit word:  abdominal
['parivritti janu sirshasana', 'mandukasana', ' koormasana', 'kurmasana', 'tiryak bhujangasana', 'hamsasana', ' eka padasana']
Clear output: Y/N N
