In [13]:
#Links:
#https://www.kaggle.com/code/ashishpatel26/word-embedding-with-beginner-to-advance

In [14]:
import pandas as pd
data = pd.read_csv('C:/BITsPilaniMTECH2123/Sem3/NLP/NLP_All_Practice/heart.csv')
print("Shape: ", data.shape)
data = data.sample(100).reset_index(drop=True)
data.head(2)

Shape:  (918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,63,F,ASY,150,407,0,LVH,154,N,4.0,Flat,1
1,57,M,ASY,130,207,0,ST,96,Y,1.0,Flat,0


### Types of categorical variables conversion

### categorical variables can be converted or transformed in various ways depending on the context and the machine learning or statistical techniques you intend to use.

### Remember that the choice of encoding method depends on the nature of your data and the requirements of your analysis or machine learning task. It's also important to handle missing values in categorical variables appropriately before applying these encoding techniques.

## 1. Label Encoding:

### This involves assigning a unique numerical label to each category. This is suitable when there is an ordinal relationship between categories.

In [15]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(data['ChestPainType'])
encoded_labels

array([0, 0, 2, 2, 3, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 2, 1, 0, 2, 2, 0, 0, 0, 3, 0, 1, 0, 0, 2, 3, 0, 2, 0, 2, 0, 2,
       0, 0, 1, 0, 2, 0, 1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       2, 0, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 1, 2,
       2, 1, 2, 1, 0, 1, 0, 0, 0, 0, 2, 0])

## 2. One-Hot Encoding:

### This method creates binary columns for each category, indicating the presence (1) or absence (0) of that category. This is suitable for nominal categorical variables.

In [16]:
one_hot_encoded = pd.get_dummies(data, columns=['ChestPainType'])
one_hot_encoded.head(2)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,63,F,150,407,0,LVH,154,N,4.0,Flat,1,1,0,0,0
1,57,M,130,207,0,ST,96,Y,1.0,Flat,0,1,0,0,0


## 3. Dummy Coding:

### Similar to one-hot encoding, but it encodes the categorical variable into (n - 1) binary columns, where n is the number of categories. This is useful to avoid multicollinearity in regression models.

In [17]:
dummy_coded = pd.get_dummies(data, columns=['ChestPainType'], drop_first=True)
dummy_coded.head(2)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,63,F,150,407,0,LVH,154,N,4.0,Flat,1,0,0,0
1,57,M,130,207,0,ST,96,Y,1.0,Flat,0,0,0,0


## 4. Ordinal Encoding:

### Similar to label encoding, but you explicitly specify the order of the categories. Useful for categorical variables with a clear ordering.

In [18]:
ordinal_mapping = {'ASY': 1, 'ATA': 2, 'NAP': 3, 'TA':4}
data['ChestPainType_OE'] = data['ChestPainType'].map(ordinal_mapping)
data[['ChestPainType','ChestPainType_OE']].sample(10)

Unnamed: 0,ChestPainType,ChestPainType_OE
55,ASY,1
59,ATA,2
89,ATA,2
69,NAP,3
24,ATA,2
31,TA,4
52,ASY,1
80,ATA,2
45,ASY,1
46,ATA,2


## 5. Frequency Encoding:

### Replace categories with their corresponding frequency of occurrence in the dataset.

In [19]:
frequency_mapping = data['ChestPainType'].value_counts().to_dict()
data['ChestPainType_FE'] = data['ChestPainType'].map(frequency_mapping)
data[['ChestPainType','ChestPainType_FE']].sample(10)

Unnamed: 0,ChestPainType,ChestPainType_FE
43,NAP,23
22,ASY,56
40,ASY,56
7,ASY,56
6,ASY,56
78,ASY,56
85,ASY,56
73,NAP,23
84,NAP,23
93,ATA,18


## 6. Target Encoding:

### Encode categories based on the mean of the target variable for each category. Useful for classification problems.

In [20]:
target_mapping = data.groupby('ChestPainType')['Age'].mean().to_dict()
data['ChestPainType_TE'] = data['ChestPainType'].map(target_mapping)
data[['ChestPainType','ChestPainType_TE']].sample(10)

Unnamed: 0,ChestPainType,ChestPainType_TE
76,ASY,55.839286
96,ASY,55.839286
38,ASY,55.839286
95,ASY,55.839286
77,ASY,55.839286
14,ASY,55.839286
8,ASY,55.839286
43,NAP,55.695652
30,ASY,55.839286
53,NAP,55.695652


## 7. Binary Encoding:

### Encode categories as binary code (represented by binary digits). This method is particularly useful for high-cardinality categorical variables

In [21]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['ChestPainType'])
encoded_data = encoder.fit_transform(data)
encoded_data.sample(10)

Unnamed: 0,Age,Sex,ChestPainType_0,ChestPainType_1,ChestPainType_2,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_OE,ChestPainType_FE,ChestPainType_TE
30,65,M,0,0,1,135,254,0,LVH,127,N,2.8,Flat,1,1,56,55.839286
40,71,F,0,0,1,112,149,0,Normal,125,N,1.6,Flat,0,1,56,55.839286
24,51,M,1,0,0,125,188,0,Normal,145,N,0.0,Up,0,2,18,50.722222
25,71,M,0,0,1,130,221,0,ST,115,Y,0.0,Flat,1,1,56,55.839286
42,45,M,0,0,1,130,219,0,ST,130,Y,1.0,Flat,1,1,56,55.839286
86,32,M,1,0,0,125,254,0,Normal,155,N,0.0,Up,0,2,18,50.722222
15,46,M,0,0,1,110,236,0,Normal,125,Y,2.0,Flat,1,1,56,55.839286
59,54,F,1,0,0,120,230,1,Normal,140,N,0.0,Up,0,2,18,50.722222
89,57,M,1,0,0,140,260,1,Normal,140,N,0.0,Up,0,2,18,50.722222
6,53,M,0,0,1,80,0,0,Normal,141,Y,2.0,Down,0,1,56,55.839286


## What are Word Embeddings?

### Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation.

### Count/frequency-based Embeddings:
#### These embeddings are based on the frequency of words in a corpus and their co-occurrence with other words. Examples include:

#### Count Vectors or Bag of Words (BoW)
#### N-Grams
#### TF-IDF
#### Co-Occurrence Matrix


### Prediction-based Embeddings:
#### These embeddings are trained by predicting words or contexts in a language modeling task. Examples include:

#### ELMo (Embeddings from Language Models): Generates contextualized embeddings by considering internal states of a bidirectional LSTM.
#### ULMFiT (Universal Language Model Fine-tuning): Fine-tunes a pre-trained language model on a specific task.

### Transformer-based Embeddings:
#### Transformer models have become popular for capturing contextual relationships in words and sentences. Examples include:

#### BERT (Bidirectional Encoder Representations from Transformers): Captures context from both left and right directions.
#### GPT (Generative Pre-trained Transformer): Generates coherent text based on input context.

### FastText Embeddings:
#### FastText embeddings are similar to Word2Vec but also consider subword information, which helps with handling rare words and morphologically rich languages.

### Contextualized Embeddings:
#### These embeddings capture the meaning of a word in its context within a sentence. Examples include:

#### ELMo: Embeddings change based on the surrounding words in a sentence.
#### BERT: Captures contextual information of words based on their surroundings.

### Pre-trained Embeddings:
#### Various pre-trained embeddings are available, trained on large corpora and transferable to various NLP tasks. Examples include:

#### Word2Vec (Google News): Pre-trained Word2Vec embeddings on a large news corpus.
#### GloVe (Common Crawl): Pre-trained GloVe embeddings on a large web crawl.

### Subword Embeddings:
#### These embeddings focus on subword units like characters or character n-grams. Examples include:

#### FastText: Utilizes subword information for word representations.
#### Byte-Pair Encoding (BPE): Segments words into subword units and assigns embeddings.

### Sense Embeddings:
#### These embeddings aim to capture different senses or meanings of a word. Examples include:

#### Word Sense Disambiguation (WSD): Embeddings trained to disambiguate word senses.
#### WordNet-based Embeddings: Incorporates WordNet senses for embeddings.

### Random Embeddings:
#### Simple embeddings generated randomly, often used as baselines for comparison.

### Each type of word embedding has its strengths and limitations, and the choice depends on the nature of your task, the amount of training data, and the resources available.

In [22]:
#Loading data
import pandas as pd 
dat = pd.read_csv('SMS_train.csv',encoding='latin1')
print("Shape",dat.shape)
dat.tail(2)

Shape (957, 3)


Unnamed: 0,S. No.,Message_body,Label
955,956,Wat time ü finish?,Non-Spam
956,957,Just glad to be talking to you.,Non-Spam


## 1. Bag of Words (BoW):
### BoW represents text as a vector where each dimension corresponds to a word in the vocabulary, and the value in each dimension indicates the frequency of that word in the text.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
#transform
Count_data = CountVec.fit_transform(dat['Message_body'])
 
#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
print("Shape: ", cv_dataframe.shape)
cv_dataframe.sample(5)

Shape:  (957, 2940)


Unnamed: 0,000,0121,02,0207,02073162414,03,04,05,06,07099833605,...,young,younger,yoville,yr,yummy,yuo,yup,zebra,zed,zeros
374,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
443,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. N-grams

### N-grams are contiguous sequences of n items (characters, words, etc.) from a given text. They are commonly used in various natural language processing tasks for analyzing and processing text data.

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

CountVec = CountVectorizer(ngram_range=(3,3), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
#transform
Count_data = CountVec.fit_transform(dat['Message_body'])
 
#create dataframe
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names_out())
print("Shape: ", cv_dataframe.shape)
cv_dataframe.sample(5)

Shape:  (957, 5440)


Unnamed: 0,000 bonus caller,000 cash await,000 prize jackpot,0121 2025050 visit,02 09 03,0207 083 6089,02073162414 costs 20p,03 2nd attempt,06 05 05,06 11 04,...,yup hey day,yup izzit raining,yup lor dun,yup lor reach,yup msg tat,yup paragon havent,zebra animation badass,zed 08701417012 profit,zed 08701417012150p logo,zeros savings checking
818,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
760,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
517,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. Term Frequency-Inverse Document Frequency (TF-IDF):

### TF-IDF is a variation of BoW that takes into account the importance of a word in a document relative to its frequency in the entire corpus.

In [25]:
#Term frequency
#TF = (Number of times term T appears in the particular row) / (number of terms in that row)

#Inverse Document Frequency
#IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dat['Message_body'])

#create dataframe
cv_dataframe=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
print("Shape: ", cv_dataframe.shape)
cv_dataframe.sample(5)

Shape:  (957, 3154)


Unnamed: 0,000,0121,02,0207,02073162414,03,04,05,06,07099833605,...,yours,yourself,yoville,yr,yummy,yuo,yup,zebra,zed,zeros
318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Co-occurrence matrix

####  A co-occurrence matrix is a tabular representation that shows the frequency of co-occurrence of items in a given context. In the context of natural language processing (NLP) and text analysis, a co-occurrence matrix is often used to analyze relationships between words within a corpus of text. It helps capture the patterns of how often certain words appear together in the same context.

In [26]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Example sentences
sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "Yet another example for illustration."
]

# Create a CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda text: text.split())

# Fit and transform the sentences
X = vectorizer.fit_transform(sentences)

# Get the vocabulary (words) and transform it into a dictionary
vocabulary = vectorizer.get_feature_names_out()
vocab_dict = {word: idx for idx, word in enumerate(vocabulary)}

# Create an empty co-occurrence matrix
co_occurrence_matrix = np.zeros((len(vocabulary), len(vocabulary)), dtype=int)

# Calculate co-occurrences
for sentence in X.toarray():
    co_occurrence_matrix += np.outer(sentence, sentence)

# Print the co-occurrence matrix
print("Co-occurrence Matrix:")
print(co_occurrence_matrix)


Co-occurrence Matrix:
[[1 0 1 0 0 1 1 1 0]
 [0 2 2 1 1 0 1 0 1]
 [1 2 3 1 1 1 2 1 1]
 [0 1 1 1 1 0 0 0 1]
 [0 1 1 1 1 0 0 0 1]
 [1 0 1 0 0 1 1 1 0]
 [1 1 2 0 0 1 2 1 0]
 [1 0 1 0 0 1 1 1 0]
 [0 1 1 1 1 0 0 0 1]]




In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sample DataFrame with a text column
df = pd.DataFrame(dat)

# Tokenization function
def tokenize(text):
    return text.split()

# Create a CountVectorizer
vectorizer = CountVectorizer(tokenizer=tokenize, lowercase=False)

# Fit and transform the text data
X = vectorizer.fit_transform(df['Message_body'])

# Get the vocabulary (words) and transform it into a dictionary
vocab = vectorizer.get_feature_names_out()
vocab_dict = {word: idx for idx, word in enumerate(vocab)}

# Co-occurrence matrix parameters
window_size = 2  # Context window size
co_occurrence_matrix = np.zeros((len(vocab), len(vocab)), dtype=int)

# Calculate co-occurrence counts
for sentence in X.toarray():
    indices = np.where(sentence > 0)[0]
    for idx in indices:
        context_indices = np.concatenate((indices[max(0, idx - window_size):idx],
                                          indices[idx + 1:min(idx + window_size + 1, len(vocab))]))
        for context_idx in context_indices:
            co_occurrence_matrix[idx][context_idx] += 1

# Print the co-occurrence matrix
print("Co-Occurrence Matrix:")
print(co_occurrence_matrix)

Co-Occurrence Matrix:
[[0 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## 4. Latent Semantic Analysis (LSA)

### LSA applies Singular Value Decomposition (SVD) to a term-document matrix to capture latent semantic relationships between words and documents.

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Example sentences
sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "Yet another example for illustration."
]

# Create a CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda text: text.split())

# Fit and transform the sentences
X = vectorizer.fit_transform(sentences)

# Apply Truncated SVD for dimensionality reduction (LSA)
num_dimensions = 2  # Number of dimensions for embeddings
svd = TruncatedSVD(n_components=num_dimensions)
embedding_matrix = svd.fit_transform(X)

# Normalize embeddings
embedding_matrix = normalize(embedding_matrix, norm='l2')

# Print the LSA embeddings
print("LSA Embeddings:")
print(embedding_matrix)

LSA Embeddings:
[[ 7.65313604e-01  6.43657586e-01]
 [ 1.00000000e+00  3.49225053e-16]
 [ 7.65313604e-01 -6.43657586e-01]]


## 5. Latent Dirichlet Allocation (LDA)

### LDA is a topic modeling technique that models documents as mixtures of topics and words as mixtures of topics.

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Example sentences
sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "Yet another example for illustration."
]

# Create a CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda text: text.split())

# Fit and transform the sentences
X = vectorizer.fit_transform(sentences)

# Apply Latent Dirichlet Allocation (LDA)
num_topics = 2
lda = LatentDirichletAllocation(n_components=num_topics)
topic_matrix = lda.fit_transform(X)

# Print the LDA topic matrix
print("LDA Topic Matrix:")
print(topic_matrix)

LDA Topic Matrix:
[[0.90898827 0.09101173]
 [0.79601909 0.20398091]
 [0.09909064 0.90090936]]




## 6. Word2Vec

### Word2Vec is a popular technique for generating word embeddings that capture semantic relationships between words.



## Objective: Word2Vec aims to learn word representations that encode semantic information by predicting the likelihood of words based on their context in a given text corpus.

## Two Main Architectures: Word2Vec consists of two main architectures: Continuous Bag of Words (CBOW) and Skip-gram.

## Continuous Bag of Words (CBOW):
### Given a context of surrounding words, CBOW predicts the target word at the center.
### Suited for smaller datasets and frequent words.
### Can be faster to train.

## Skip-gram:
### Given a target word, Skip-gram predicts the context words surrounding it.
### Performs well on larger datasets and captures rare words and phrases better.
### Might take longer to train compared to CBOW.

## Training Process: Word2Vec trains using a neural network that learns word embeddings by optimizing a specific objective function. The process involves adjusting the embeddings to minimize the difference between predicted probabilities and actual co-occurrence frequencies of words.

## Cosine Similarity: Word embeddings generated by Word2Vec can be used to measure semantic relationships between words using cosine similarity. Words with similar meanings have higher cosine similarity values.

## Transferable Representations: Word2Vec embeddings can be pre-trained on a large corpus of text and then fine-tuned for specific downstream tasks, making them transferable and useful in various natural language processing tasks such as sentiment analysis, text classification, and machine translation.

## Advantages:
### Word2Vec embeddings capture semantic relationships and similarities between words.
### They are computationally efficient and can be trained on large corpora.
### Word2Vec embeddings can improve the performance of various NLP tasks by providing meaningful feature representations.

## Limitations:
### Word2Vec embeddings might not capture more complex linguistic nuances.
### They may not fully capture polysemy (multiple meanings of a word) and context-dependent semantics.

## Extensions: Word2Vec has inspired several extensions and variations, including Paragraph Vectors (Doc2Vec), FastText, and more advanced transformer-based models that capture contextual information beyond a fixed context window.

### Overall, Word2Vec has played a significant role in revolutionizing natural language processing and continues to be a foundation for many downstream NLP applications.

In [30]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Sample DataFrame with a text column
df = pd.DataFrame(dat)

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in df['Message_body']]

# Train Word2Vec model
model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create embeddings DataFrame
embeddings_df = pd.DataFrame({word: model.wv[word] for word in model.wv.index_to_key})

# Print the embeddings DataFrame
print("Embeddings DataFrame:")
embeddings_df

[nltk_data] Downloading package punkt to C:\Users\Abhishek
[nltk_data]     Jaiswal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Embeddings DataFrame:


Unnamed: 0,.,to,I,",",you,?,!,a,...,the,....1,opener,Hmmm.still,guides,italian,watched,juz,feet,stomps,pouts,Rofl
0,-0.260241,-0.184705,-0.141463,-0.185331,-0.146918,-0.143935,-0.148871,-0.168189,-0.147365,-0.168180,...,0.004253,-0.004446,0.006991,-0.009215,-0.013162,-0.005259,0.004478,-0.010735,-0.003218,0.001046
1,0.330585,0.225669,0.182007,0.233356,0.178182,0.172191,0.193586,0.218974,0.183304,0.199434,...,0.007279,0.012195,0.001013,-0.004308,-0.003971,-0.002628,-0.001929,-0.007150,0.011417,-0.005483
2,0.151565,0.104375,0.073177,0.099744,0.071688,0.074905,0.086944,0.108158,0.082048,0.084181,...,-0.006485,0.005079,0.003622,0.002214,0.001848,0.008305,-0.005860,0.003714,-0.006668,-0.008439
3,0.107617,0.071794,0.051866,0.063776,0.050614,0.040548,0.059845,0.075576,0.060102,0.070251,...,-0.008191,0.010719,0.000556,-0.001132,-0.006380,-0.003179,-0.009278,0.002900,0.004133,-0.000363
4,0.103690,0.084002,0.070620,0.081569,0.064598,0.049474,0.068099,0.072170,0.067135,0.069099,...,-0.004749,0.010826,-0.004759,0.000559,0.011944,0.006252,0.010228,0.009595,-0.006099,-0.004658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.651061,0.439977,0.359018,0.440142,0.340350,0.341025,0.390632,0.437750,0.352413,0.396978,...,-0.003718,0.000246,0.005350,0.009691,0.010988,0.017906,-0.001196,0.002459,0.006076,0.004159
96,0.195104,0.143057,0.116410,0.144240,0.116007,0.103716,0.131356,0.136990,0.111393,0.118466,...,-0.007176,0.011037,-0.005843,-0.003936,0.005959,0.002440,0.010297,0.004398,-0.007847,-0.007085
97,-0.340622,-0.227685,-0.181646,-0.234294,-0.174667,-0.181799,-0.207724,-0.231350,-0.169893,-0.202841,...,-0.002185,-0.009514,-0.007604,-0.005928,-0.005730,-0.006529,0.001803,0.002963,-0.008534,0.007389
98,0.202992,0.126123,0.118021,0.134580,0.109448,0.106854,0.119228,0.141092,0.112944,0.135602,...,0.004673,-0.005454,0.009300,0.010899,0.009094,0.008665,-0.006660,-0.000157,-0.002958,-0.005518


## 7. Doc2Vec 

### Doc2Vec is an extension of Word2Vec that can generate embeddings for entire documents or paragraphs, in addition to word embeddings.

### Doc2Vec is an algorithm that learns fixed-size vector representations (embeddings) for documents and words. It extends Word2Vec to include documents, enabling semantically meaningful representations. By capturing context and relationships, Doc2Vec embeddings find use in tasks like document similarity, classification, and recommendation.

In [31]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Sample DataFrame with a text column
df = pd.DataFrame(dat)

# Tokenize the sentences and create TaggedDocument objects
tagged_data = [TaggedDocument(words=word_tokenize(sentence), tags=[str(idx)]) for idx, sentence in enumerate(df['Message_body'])]

# Train Doc2Vec model
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Create embeddings DataFrame
embeddings_df = pd.DataFrame({str(idx): model.docvecs[idx] for idx in range(len(df))})

# Print the embeddings DataFrame
print("Embeddings DataFrame:")
embeddings_df

Embeddings DataFrame:


  embeddings_df = pd.DataFrame({str(idx): model.docvecs[idx] for idx in range(len(df))})


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,947,948,949,950,951,952,953,954,955,956
0,-0.031567,-0.059751,-0.027610,-0.042169,-0.086009,-0.081894,-0.037417,-0.019961,-0.030798,-0.065788,...,-0.023154,-0.036480,-0.033404,-0.031302,-0.106025,-0.088547,-0.008997,-0.046669,-0.017060,-0.012377
1,0.025320,0.045612,0.010906,0.044745,0.072388,0.069750,0.025755,0.027330,0.025934,0.067905,...,0.028361,0.024692,0.003500,0.047040,0.118113,0.094837,0.020902,0.053910,0.013421,0.001678
2,0.014537,0.042150,0.021698,0.036123,0.062557,0.073554,0.020879,0.016052,0.027648,0.054718,...,0.020790,0.028290,0.025006,0.030523,0.104940,0.086808,0.006028,0.034868,0.011726,0.016482
3,0.013849,0.009285,-0.007726,0.010804,0.012727,0.014795,-0.003924,-0.000646,0.002379,-0.000132,...,0.001874,-0.004609,-0.016156,0.009243,0.020514,0.001217,0.009222,-0.000339,0.006964,-0.008173
4,0.016083,0.024623,0.024941,0.018025,0.028489,0.047477,0.007757,0.015637,0.008763,0.034764,...,0.010600,0.019599,0.017423,0.029698,0.064239,0.050218,0.013799,0.034453,0.015430,0.000310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.074319,0.133505,0.049024,0.093803,0.179916,0.178907,0.055158,0.064538,0.060891,0.167507,...,0.052342,0.091582,0.056398,0.089533,0.263612,0.230674,0.048042,0.113836,0.046020,0.021781
96,0.024610,0.044793,0.012162,0.025050,0.055366,0.057988,0.029720,0.028517,0.020086,0.046339,...,0.006595,0.033978,0.009254,0.030034,0.099197,0.078012,0.012842,0.043641,0.012406,0.014640
97,-0.039214,-0.084640,-0.028508,-0.044174,-0.101424,-0.117242,-0.043578,-0.044309,-0.032762,-0.099960,...,-0.025500,-0.051152,-0.035633,-0.044713,-0.155970,-0.139767,-0.016257,-0.075131,-0.029292,-0.013140
98,0.010583,0.002565,0.009765,0.000507,-0.000328,-0.003422,-0.004914,0.010902,0.004846,-0.007397,...,-0.002383,-0.011661,-0.010289,0.007974,-0.002557,0.009662,0.007475,-0.002610,0.006279,-0.007244


## 8. GloVe (Global Vectors for Word Representation)

### Generate word embeddings by aggregating global word co-occurrence matrices from a given corpus.

### The basic idea behind the GloVe word embedding is to derive the relationship between the words from statistics. Unlike the occurrence matrix, the co-occurrence matrix tells you how often a particular word pair occurs together. Each value in the co-occurrence matrix represents a pair of words occurring together.

In [32]:
## Making a dictionary of the words and their vector representation

embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
with open('C:/BITsPilaniMTECH2123/Sem3/NLP/NLP_All_Practice/glove/glove.6B.100d.txt', encoding='utf-8') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

In [33]:
glove_df = pd.DataFrame()
for word in df['Message_body']:
    tt = word.split()
    temp_df = pd.DataFrame()
    for i in range(len(tt)):
        try:
            emd = embeddings_dictionary[tt[i]]
            temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
        except:
            pass
    temp_df = pd.DataFrame(temp_df)
    temp_df = pd.DataFrame(temp_df.mean(axis=0)).T
    glove_df = glove_df.append(temp_df,ignore_index=True)

print(glove_df.shape)    
glove_df   

  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)


  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)


  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_d

  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_df.append(temp_df,ignore_index=True)
  temp_df = temp_df.append(pd.DataFrame([np.array(emd)]),ignore_index=True)
  glove_df = glove_d

(957, 100)


  glove_df = glove_df.append(temp_df,ignore_index=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.071022,-0.038099,0.542177,-0.082492,-0.074202,0.146242,-0.250226,-0.235238,-0.125797,0.216427,...,-0.167291,0.215297,-0.094608,0.137313,-0.493887,-0.474463,-0.669091,-0.586375,0.505684,0.281640
1,0.063427,0.190200,0.401351,-0.246480,-0.033387,0.155975,-0.092402,0.141156,-0.034176,-0.132180,...,-0.016823,-0.270445,-0.051522,-0.073071,-0.559999,0.047039,-0.121020,-0.152188,0.438008,0.269668
2,-0.189746,0.119212,0.318143,-0.164913,0.163019,0.122932,-0.035974,-0.032679,-0.372527,0.105850,...,-0.067655,-0.173700,-0.028307,0.378635,-0.441012,0.277825,-0.090779,-0.364147,0.366227,0.022004
3,0.064375,0.344141,0.083043,0.003813,-0.130680,0.273443,-0.078488,0.054380,-0.158981,-0.073605,...,-0.306002,-0.085749,-0.043410,0.077880,-0.040081,-0.092847,0.185838,-0.226807,-0.021342,-0.063997
4,-0.081304,0.237346,0.342350,-0.070389,-0.049059,0.073196,0.135293,0.247306,-0.215001,-0.090901,...,-0.199220,0.038115,-0.256244,0.126924,-0.544771,-0.053150,-0.088921,-0.261988,0.475009,-0.039638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,0.032300,0.079386,0.291049,0.136844,-0.211392,0.211803,0.129338,0.278136,-0.041441,0.087964,...,-0.090498,0.353701,0.201247,0.378081,-0.347803,-0.140241,-0.070122,-0.134809,0.231225,-0.004141
953,-0.190704,0.410400,0.360252,-0.401491,-0.144118,0.101135,-0.218552,0.311394,-0.086395,-0.039126,...,0.046044,-0.249549,0.178922,-0.150883,-0.487425,-0.225261,0.042887,0.036536,0.621147,0.379457
954,0.054601,0.289558,0.345137,-0.276380,-0.068597,0.109301,-0.188043,0.118917,0.030473,-0.234016,...,-0.086368,-0.172834,-0.237566,0.021708,-0.473080,-0.113162,-0.231089,-0.155028,0.333280,0.091269
955,0.075080,0.246672,0.404400,0.041565,-0.230231,0.357940,-0.133101,-0.219795,0.270625,-0.239820,...,-0.092225,0.142246,0.376095,0.209665,0.124885,-0.151020,0.289370,-0.001859,-0.104055,-0.173673


## 9. FastText

### FastText is a word embedding technique that extends Word2Vec by considering subword information, making it effective for handling out-of-vocabulary words and morphologically rich languages.

In [34]:
import pandas as pd
import fasttext
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Sample DataFrame with a text column
data = {'text_column': [
    "This is an example sentence.",
    "Another example sentence.",
    "Yet another example for illustration."
]}
df = pd.DataFrame(data)

# Load pre-trained FastText word vectors
fasttext_path = "path/to/cc.en.100.vec"
fasttext_model = fasttext.load_model(fasttext_path)

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in df['text_column']]

# Function to calculate sentence embeddings
def get_sentence_embedding(sentence):
    embeddings = [fasttext_model.get_word_vector(word) for word in sentence]
    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(fasttext_model.get_dimension())
    return avg_embedding

# Create embeddings for each sentence
document_embeddings = [get_sentence_embedding(sentence) for sentence in tokenized_sentences]

# Convert document embeddings to DataFrame
document_embeddings_df = pd.DataFrame(document_embeddings)

# Print the document embeddings DataFrame
print("Document Embeddings DataFrame:")
print(document_embeddings_df)

ModuleNotFoundError: No module named 'fasttext'

## 10. BERT (Bidirectional Encoder Representations from Transformers):

### BERT is a transformer-based model that generates contextualized word embeddings by considering both left and right context in a sentence.

In [42]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
#nltk.download('punkt')

# Sample DataFrame with a text column
df = pd.DataFrame(dat)

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in df['Message_body']]

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to calculate sentence embeddings
def get_sentence_embedding(sentence):
    tokens = [tokenizer.cls_token] + sentence + [tokenizer.sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(input_ids)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    
    return embedding

# Create embeddings for each sentence
document_embeddings = [get_sentence_embedding(sentence) for sentence in tokenized_sentences]

# Convert document embeddings to DataFrame
document_embeddings_df = pd.DataFrame(document_embeddings)

# Print the document embeddings DataFrame
print("Document Embeddings DataFrame:")
document_embeddings_df


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Document Embeddings DataFrame:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.230156,-0.001655,0.045540,-0.451545,-0.354539,-0.152481,0.410077,0.384985,-0.038723,-0.090199,...,0.446943,0.017718,0.187562,-0.262093,0.267582,-0.284557,0.219058,-0.243326,0.591780,-0.052901
1,0.166876,-0.217059,0.298765,0.145865,-0.043955,-0.155840,0.362898,0.574956,0.188933,-0.122718,...,-0.151777,-0.338585,0.192610,-0.079918,-0.017437,0.264356,0.033202,-0.239178,0.175996,-0.122160
2,0.137682,0.112664,0.414475,-0.292609,-0.117741,-0.227691,0.100809,0.510741,-0.123243,-0.308917,...,-0.001668,-0.219368,0.127058,-0.221272,-0.009120,-0.120362,0.025500,0.091550,-0.141216,0.025827
3,0.069796,-0.063211,0.406007,-0.547982,0.050551,-0.316418,0.796259,0.059501,-0.248179,0.091626,...,0.385107,-0.127852,0.109649,-0.255704,0.212574,-0.172890,0.023392,-0.178733,0.250864,0.034019
4,0.025411,-0.093361,0.584318,-0.411056,0.057137,-0.195322,0.731716,0.157253,-0.076707,-0.257002,...,0.355832,-0.119119,0.086498,-0.147929,-0.218717,0.148177,0.198564,-0.133088,0.243483,-0.016523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,0.096376,-0.169405,0.611839,-0.679978,-0.208810,0.048761,0.631471,0.298822,-0.186654,-0.128506,...,0.718843,-0.340358,0.047913,-0.260238,0.051883,0.152651,0.240626,-0.145109,-0.024751,-0.025373
953,0.397734,0.149761,0.143669,0.023736,-0.236546,0.005862,0.290763,0.263088,-0.368923,-0.223800,...,0.024272,-0.208017,-0.068236,0.134138,0.263355,0.132620,-0.065179,-0.035586,0.017270,0.220061
954,0.080858,0.093744,0.218561,-0.561989,0.272511,0.030808,0.500007,0.228707,-0.001304,-0.422268,...,0.188593,-0.215229,-0.225334,-0.250760,0.102208,-0.266416,-0.034582,-0.389865,0.138667,-0.212834
955,0.039313,-0.124646,-0.027155,-0.537480,0.415335,0.069320,0.806425,-0.049148,-0.324421,-0.360009,...,0.511260,-0.205628,0.056870,-0.482350,0.255969,0.140929,0.636315,-0.190935,-0.072299,-0.270672


## 11. Universal Sentence Encoder:

### The Universal Sentence Encoder (USE) is a pre-trained model developed by Google that generates high-quality embeddings for sentences or short texts.

In [43]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from nltk.tokenize import sent_tokenize
import nltk

# Sample DataFrame with a text column
data = {'text_column': [
    "This is an example sentence.",
    "Another example sentence.",
    "Yet another example for illustration."
]}
df = pd.DataFrame(data)

# Load Universal Sentence Encoder model
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(model_url)

# Function to calculate sentence embeddings
def get_sentence_embedding(sentence):
    embeddings = use_model([sentence])
    return embeddings.numpy()

# Create embeddings for each sentence
document_embeddings = [get_sentence_embedding(sentence) for sentence in df['text_column']]

# Convert document embeddings to DataFrame
document_embeddings_df = pd.DataFrame(np.concatenate(document_embeddings))

# Print the document embeddings DataFrame
print("Document Embeddings DataFrame:")
document_embeddings_df


[nltk_data] Downloading package punkt to C:\Users\Abhishek
[nltk_data]     Jaiswal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Document Embeddings DataFrame:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.062383,0.020822,0.003805,0.029249,-0.073227,-0.001973,-0.01665,0.033024,0.006928,0.052622,...,-0.020329,-0.002827,0.100481,-0.003972,0.003707,-0.025104,0.012932,-0.013812,-0.003456,0.076498
1,0.033285,0.012929,-0.000192,0.006394,-0.067875,0.026197,-0.032756,-0.001669,-0.003281,0.041999,...,2e-06,-0.06651,0.073783,-0.003603,0.040417,-0.031705,0.038806,0.006194,-0.039641,0.066813
2,-0.041194,0.001034,0.023589,-0.012045,-0.083658,-0.000794,-0.001071,0.070373,-0.04561,0.073808,...,0.026453,-0.070751,0.028483,-0.004705,0.049323,0.04468,0.004537,-0.012118,0.012535,0.027863


## 12. ELMo (Embeddings from Language Models) 

### ELMo is a deep contextualized word representation model that captures contextual information of words within sentences.

In [51]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.tokenize import sent_tokenize
import nltk

# Sample DataFrame with a text column
df = pd.DataFrame(dat)

# Load pre-trained ELMo-style model
model_name = "bert-base-nli-mean-tokens"  # You can explore other models as well
model = SentenceTransformer(model_name)

# Function to calculate sentence embeddings
def get_sentence_embedding(sentence):
    return model.encode([sentence], convert_to_tensor=True).numpy()

# Create embeddings for each sentence
document_embeddings = [get_sentence_embedding(sentence) for sentence in df['Message_body']]

# Convert document embeddings to DataFrame
document_embeddings_df = pd.DataFrame(np.concatenate(document_embeddings))

# Print the document embeddings DataFrame
print("Document Embeddings DataFrame:")
document_embeddings_df


Document Embeddings DataFrame:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.254408,-0.226065,1.739215,0.266294,0.432405,-0.253913,-0.491477,0.728188,0.524518,-0.134457,...,-0.512184,-0.618225,0.851644,-1.148151,0.135786,-0.461314,-0.418899,0.344768,-0.212410,0.386700
1,-0.305741,-0.206765,0.952002,0.401451,-0.380246,-0.479462,0.285839,0.387108,0.457585,-0.128602,...,-0.695168,-0.617728,-0.808544,-1.434583,0.081434,-0.258319,-0.210425,0.816292,-0.065558,0.376639
2,-0.056299,0.026331,1.803162,0.236934,0.088542,0.108257,-0.297613,0.784055,-0.001806,-0.305349,...,-0.192036,-0.257273,0.154882,-1.217317,0.047915,-0.333446,-1.020759,0.125009,-0.339734,0.173443
3,0.267183,-0.449553,3.082050,-0.001047,0.723336,0.020067,-0.004679,-0.067268,0.790365,-0.186334,...,0.168275,-1.352041,0.309605,0.288319,0.131835,-0.733855,-0.588562,-0.425120,-1.015332,0.428708
4,-0.780729,0.906983,-0.561059,-0.124744,0.304318,-0.313436,0.020478,0.712407,-0.222117,-0.452324,...,-0.155448,-0.448353,-0.632674,-0.159844,0.441140,-0.469092,-0.301177,0.199395,1.320480,0.368582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,0.166255,0.438176,1.593802,0.076920,-0.438946,-0.061294,0.288123,0.299034,-0.075605,0.012411,...,0.035283,-0.717672,-0.372183,-1.283176,0.320815,-0.613741,-0.320582,0.088251,0.903624,0.360206
953,0.795283,-0.243166,1.641898,0.726404,-0.210473,0.422270,0.171682,0.206328,-0.463834,0.044044,...,0.355463,-0.605451,0.296877,-0.782540,-0.318650,0.135810,-0.395177,0.306452,-0.492432,0.178519
954,0.522895,0.393499,1.716469,0.039209,-0.414545,0.086871,0.690120,0.114986,0.232044,-0.328985,...,0.641495,0.095413,-0.406608,-1.762340,-0.020004,-0.822489,-0.203198,-0.332588,-0.239773,0.530740
955,-0.152890,-0.538673,2.503942,0.222969,0.720362,0.725466,0.661138,0.403712,0.407480,-0.389586,...,0.432074,-0.351038,0.503954,-0.597221,-0.023548,-0.009346,-0.054139,-0.587019,-0.405061,0.221047


### Semantic and contextual meanings are both important concepts in the field of natural language processing and word embeddings. Let me explain the difference between them:

### Semantic Meaning: Semantic meaning refers to the intrinsic meaning of a word or a phrase, irrespective of its context. It's the core, dictionary-like definition of a word. In the context of word embeddings, semantic meaning is typically captured by the relationships between words based on their usage patterns across a wide range of texts. Words that have similar semantic meanings are likely to be close to each other in a high-dimensional vector space.

### Contextual Meaning: Contextual meaning, on the other hand, is all about how the meaning of a word can change based on its surrounding words in a specific sentence or piece of text. Words can have different meanings depending on the context in which they are used. For example, the word "bank" can refer to a financial institution or the side of a river depending on the context.

### Word Embeddings: Word embeddings are numerical representations of words that aim to capture their meanings in a vector space. These embeddings are learned from large corpora of text data using techniques like Word2Vec, GloVe, and contextual models like BERT, GPT, etc.

### Traditional Models (Word2Vec, GloVe): These models capture mainly semantic meanings. Words that are used in similar contexts across a broad range of text data end up being close to each other in the embedding space, indicating their semantic similarity.

### Contextual Models (BERT, GPT, etc.): These models take into account the contextual meanings of words. They generate embeddings based on the entire context of a word within a sentence or passage. This allows them to capture the nuances of word meanings that depend on the surrounding words.

### For instance, consider the word "bat." In a semantic embedding space, "bat" might be close to words like "ball," "racket," and "sports." In a contextual embedding space, however, "bat" could be associated with "flew" or "cave" depending on whether the context is sports-related or related to animals.

### In summary, semantic meaning is the inherent definition of a word, while contextual meaning is the meaning that arises from the specific context in which the word is used. Both types of meaning are important for understanding and processing language, and modern NLP models aim to capture both aspects through sophisticated embedding techniques.

### One-Hot Encoding:
#### Represents words as binary vectors.
#### Simple and interpretable.
#### No semantic understanding or context captured.
#### Suitable for simple classification tasks where word order doesn't matter.

### Bag of Words (BoW):
#### Represents documents with word frequency vectors.
#### Ignores word order and context.
#### Simple and interpretable, suitable for basic text classification.
#### Doesn't capture semantic relationships.

### TF-IDF (Term Frequency-Inverse Document Frequency):
#### Enhances BoW by weighting words based on their importance.
#### Takes into account both term frequency and importance across documents.
#### Useful for information retrieval, ranking, and simple text classification.

### N-Gram:
#### Considers sequences of N consecutive words.
#### Captures local context and word relationships.
#### Suitable for tasks where word order matters, like language modeling and sentiment analysis.

### Word2Vec:
#### Generates dense word embeddings by predicting surrounding words.
#### Captures semantic meaning and relationships.
#### Suitable for tasks like word similarity, analogy, and downstream NLP tasks.

### Doc2Vec:
#### Extends Word2Vec to generate embeddings for entire documents.
#### Captures document semantics and relationships.
#### Useful for document-level classification, recommendation, and similarity tasks.

### GloVe (Global Vectors for Word Representation):
#### Combines global word co-occurrence statistics with local context.
#### Generates word embeddings based on word similarities.
#### Suitable for word-level tasks and capturing word relationships.

### BERT (Bidirectional Encoder Representations from Transformers):
#### Pre-trained transformer model for generating contextualized embeddings.
#### Captures complex linguistic nuances and contextual information.
#### State-of-the-art model for various NLP tasks.

### Universal Sentence Encoder:
#### Generates embeddings for sentences and short texts.
#### Captures sentence-level semantics and relationships.
#### Useful for semantic similarity, clustering, and classification tasks.

### ELMo (Embeddings from Language Models):
#### Deep contextualized word representations.
#### Captures contextual information of words in sentences.
#### Useful for complex tasks that require contextual understanding.

#### Choose the method based on your task requirements. Simple tasks may benefit from BoW or TF-IDF, while tasks requiring semantic understanding might benefit from Word2Vec, BERT, or ELMo. Document-level tasks can benefit from Doc2Vec or Universal Sentence Encoder. Always consider the trade-offs between complexity, interpretability, and performance.