In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import math
import itertools
from collections import Counter


In [3]:
df = pd.read_csv("bbc-text.csv")
df.shape


(2225, 2)

In [4]:


nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Lemmatizer = WordNetLemmatizer()
corpus = []
for i in range(len(df['text'])):
    review = re.sub('[^a-zA-Z]',' ',df['text'][i])
    review =  review.lower()
    review = review.split()
    review =[Lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)
df_new = pd.DataFrame(corpus,columns = ["Text"])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:


class SimpleTfidfVectorizer:
    def __init__(self, min_df=1.0):
        self.min_df = min_df
        self.vocabulary_ = {}
        self.idf_ = {}

    def fit_transform(self, documents):
        term_document_matrix = self._create_term_document_matrix(documents)
        self.idf_ = self._calculate_idf(term_document_matrix)
        tfidf_matrix = self._calculate_tfidf(term_document_matrix)
        return tfidf_matrix

    def _create_term_document_matrix(self, documents):
        term_document_matrix = []
        document_frequency = Counter()

        for doc in documents:
            terms = doc.split()  # Split the document into terms (you might want a more sophisticated tokenizer)
            term_document_matrix.append(terms)
            document_frequency.update(set(terms))

        # Filter terms based on min_df as a percentage
        min_df_threshold = self.min_df * len(documents)
        self.vocabulary_ = {term: index for index, (term, freq) in enumerate(document_frequency.items()) if freq >= min_df_threshold}
        term_document_matrix = [[term for term in terms if term in self.vocabulary_] for terms in term_document_matrix]

        return term_document_matrix

    def _calculate_idf(self, term_document_matrix):
        document_frequency = Counter()

        for terms in term_document_matrix:
            document_frequency.update(set(terms))

        idf = {term: math.log((1 + len(term_document_matrix)) / (1 + document_frequency[term])) + 1
               for term in document_frequency}

        return idf

    def _calculate_tfidf(self, term_document_matrix):
        tfidf_matrix = []
        for terms in term_document_matrix:
            term_counts = Counter(terms)
            tfidf_vector = {term: (term_counts[term] / len(terms)) * self.idf_[term]
                            for term in term_counts}
            tfidf_matrix.append(tfidf_vector)

        return tfidf_matrix

    def get_feature_names(self):
        return list(self.vocabulary_.keys())


documents = corpus

vectorizer = SimpleTfidfVectorizer(min_df=0.008)
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names()

# Display the feature names and TF-IDF matrix
for i in feature_names:
  if(i in stopwords.words('english')):
    feature_names.remove(i)

print("Feature Names:", len(feature_names))


Feature Names: 3254


In [6]:
def by_indexes(iterable):
    output = {}
    for index, key in enumerate(iterable):
        output.setdefault(key, []).append(index)
    return output


def co_occurrence_matrix(corpus, vocabulary, window_size=5):
    def split_tokens(tokens):
        for token in tokens:
            indexs = vocabulary_indexes.get(token)
            if indexs is not None:
                yield token, indexs[0]

    matrix = np.zeros((len(vocabulary), len(vocabulary)), np.float64)
    vocabulary_indexes = by_indexes(vocabulary)

    for sent in corpus:
        tokens = by_indexes(split_tokens(sent.split())).items()
        for ((word_1, x), indexes_1), ((word_2, y), indexes_2) in itertools.permutations(tokens, 2):
            for k in indexes_1:
                for l in indexes_2:
                    if abs(l - k) <= window_size:
                        matrix[x, y] += 1
    return matrix


df_co = pd.DataFrame((co_occurrence_matrix(df_new["Text"].to_numpy(),feature_names )),columns = feature_names)

In [7]:
df_co
# df_co.to_csv("df_co.csv",index = False)
for i in feature_names:
  if(i in stopwords.words('english')):
    feature_names

In [8]:
df_co
# df_co.to_csv("df_co.csv",index = False)
for i in feature_names:
  if(i in stopwords.words('english')):
    feature_names.remove(i)
len(feature_names)

3254

In [9]:
def convert_to_ppmi(co_occurrence_matrix):
    # Calculate sum of elements in the co-occurrence matrix
    total_occurrences = np.sum(co_occurrence_matrix)

    # Calculate row and column sums
    row_sums = np.sum(co_occurrence_matrix, axis=1)
    col_sums = np.sum(co_occurrence_matrix, axis=0)

    # Calculate Positive Pointwise Mutual Information (PPMI)
    pmi_matrix = np.zeros_like(co_occurrence_matrix, dtype=float)

    for i in range(co_occurrence_matrix.shape[0]):
        for j in range(co_occurrence_matrix.shape[1]):
            observed_count = co_occurrence_matrix[i, j]
            row_prob = row_sums[i] / total_occurrences
            col_prob = col_sums[j] / total_occurrences
            expected_count = row_prob * col_prob * total_occurrences

            if expected_count>0:
                pmi = max(0, np.log2(observed_count / expected_count))
                pmi_matrix[i, j] = pmi
            else:
              pmi_matrix[i, j] = 0


    return pmi_matrix

# Example usage:

ppmi_matrix = convert_to_ppmi(df_co.to_numpy())
df_co_ppmi = pd.DataFrame(ppmi_matrix,columns = feature_names)
df_co_ppmi

  pmi = max(0, np.log2(observed_count / expected_count))


Unnamed: 0,electronics,moment,guide,issue,u,suggested,portable,set,box,ipod,...,moody,recommended,upset,exist,detained,tear,grip,stress,intent,spyware
0,0.000000,1.621022,0.000000,0.000000,0.794905,0.000000,2.306021,0.855396,1.366573,0.000000,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
1,1.621022,0.000000,0.000000,0.721327,0.000000,0.000000,0.000000,0.000000,1.424234,2.323707,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,1.281049,0.000000,0.000000,0.000000,3.075110,0.000000,...,0.0,6.077461,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3,0.000000,0.721327,0.000000,0.000000,0.000000,0.655614,0.000000,0.000000,0.000000,0.000000,...,0.0,1.884266,0.0,0.0,0.0000,3.036269,0.000000,0.000000,0.0,0.0
4,0.794905,0.000000,1.281049,0.000000,0.000000,0.371815,0.000000,0.086939,1.878224,0.000000,...,0.0,0.000000,0.0,0.0,1.4146,1.945115,0.091707,0.154786,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,0.000000,0.000000,0.000000,3.036269,1.945115,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3250,0.000000,0.000000,0.000000,0.000000,0.091707,0.000000,0.000000,1.374591,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3251,0.000000,0.000000,0.000000,0.000000,0.154786,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0
3252,0.000000,0.000000,0.000000,0.000000,0.000000,4.221468,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.000000,0.0,0.0


In [10]:
df_co

Unnamed: 0,electronics,moment,guide,issue,u,suggested,portable,set,box,ipod,...,moody,recommended,upset,exist,detained,tear,grip,stress,intent,spyware
0,0.0,1.0,0.0,0.0,7.0,0.0,1.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,3.0,5.0,0.0,0.0,3.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,3.0,0.0,0.0,9.0,2.0,0.0,5.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,7.0,5.0,3.0,9.0,0.0,7.0,2.0,42.0,34.0,1.0,...,0.0,1.0,0.0,0.0,4.0,4.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,0.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3250,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3251,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3252,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def truncated_svd(matrix, k):
    """
    Perform truncated Singular Value Decomposition (SVD) on a given matrix.

    Parameters:
    - matrix: The input matrix to decompose.
    - k: The number of singular values and vectors to retain.

    Returns:
    U, Sigma, Vt: The truncated SVD components.
    """
    # Step 1: Compute the covariance matrix
    covariance_matrix = np.dot(matrix.T, matrix)

    # Step 2: Compute the eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

    # Step 3: Sort the eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Step 4: Select the top k eigenvectors
    top_eigenvectors = eigenvectors[:, :k]

    # Step 5: Compute the singular values and vectors
    singular_values = np.sqrt(eigenvalues[:k])
    U = matrix.dot(top_eigenvectors)
    Vt = top_eigenvectors.T

    return U, singular_values, Vt

# Example usage:
# Create a random matrix
np.random.seed(42)

A = df_co_ppmi.to_numpy()
A = np.nan_to_num(A, nan=0)

U, Sigma, Vt = truncated_svd(A, k=300)

# Reconstruct the original matrix
A_reconstructed = np.dot(U, np.dot(np.diag(Sigma), Vt))
df_co_new = pd.DataFrame(A_reconstructed,columns = feature_names)
# print("Original matrix:")
# print(A)
# print("\nTruncated SVD components:")
# print("U:")
# print(U)
# print("\nSigma:")
# print(np.diag(Sigma))
# print("\nVt:")
# print(Vt)
# print("\nReconstructed matrix:")
# print(A_reconstructed)
df_co_new

Unnamed: 0,electronics,moment,guide,issue,u,suggested,portable,set,box,ipod,...,moody,recommended,upset,exist,detained,tear,grip,stress,intent,spyware
0,910.390649,302.011538,273.291092,239.498147,286.384004,275.346000,496.729934,319.219196,352.023568,346.305695,...,141.686944,121.198773,91.078685,161.279686,114.767826,135.924259,157.432200,146.807316,111.482693,259.291293
1,302.011538,881.541139,226.612824,420.981321,272.510707,384.170062,325.369720,443.435640,445.257450,339.902433,...,394.124347,236.566184,406.287359,192.493605,223.041016,294.913267,218.808267,213.804709,304.494765,231.202703
2,273.291092,226.612824,673.449033,316.411652,180.037588,216.209328,229.517831,262.346194,298.133268,201.535565,...,157.686733,255.300712,112.373965,134.434438,105.441296,105.043603,171.767914,91.626590,87.477964,72.543287
3,239.498147,420.981321,316.411652,986.272523,297.110272,508.443856,209.890895,299.599107,225.320797,247.461457,...,142.586525,339.721369,268.709440,298.065600,271.137962,303.543282,239.331938,319.874486,236.864813,283.635453
4,286.384004,272.510707,180.037588,297.110272,586.509212,311.463175,213.534218,248.320443,365.434021,251.434496,...,146.360763,214.576811,191.252087,174.948002,223.259238,225.617946,150.911746,205.449002,162.670950,255.682659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,135.924259,294.913267,105.043603,303.543282,225.617946,107.272982,101.178209,259.978168,252.184127,157.552244,...,71.298467,85.478776,166.795703,84.356284,105.547861,602.688909,100.375815,136.893585,139.503161,99.844460
3250,157.432200,218.808267,171.767914,239.331938,150.911746,200.743925,130.465236,187.422138,171.798668,165.814418,...,-2.357257,112.535921,110.857557,23.271960,74.734682,100.375815,335.974104,63.235315,101.955124,99.857867
3251,146.807316,213.804709,91.626590,319.874486,205.449002,238.197232,112.233886,239.609945,172.229888,123.184061,...,95.342215,95.466871,147.445605,95.616929,21.863686,136.893585,63.235315,382.376845,83.619715,66.666967
3252,111.482693,304.494765,87.477964,236.864813,162.670950,367.579356,73.501089,221.864311,148.678669,71.066224,...,119.634904,135.551103,131.426207,58.494701,38.530343,139.503161,101.955124,83.619715,438.833095,63.194362


In [12]:
# I am taking here U vector as my Word vector
df_svd  = pd.DataFrame(U)
df_svd
def rms_normalize(row):
    rms_value = np.sqrt(np.mean(np.square(row)))
    return row / rms_value if rms_value != 0 else row

# Apply RMS normalization function to each row
df_svd = df_svd.apply(rms_normalize, axis=0)
df_svd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.886402,0.861556,1.626613,1.240278,0.155198,0.553120,0.432701,0.634562,-0.090697,0.788849,...,-1.039510,0.120705,1.148060,1.560173,-1.294518,-1.049759,-0.540123,-0.181462,0.581130,0.491934
1,-1.344424,-1.148996,0.330978,0.138221,1.435915,-1.119131,-0.725651,-0.356198,0.303466,-0.191695,...,-0.102726,1.984205,-0.563851,-0.773651,1.520311,-0.654480,0.342524,0.709109,-0.194458,-0.329935
2,-0.717304,0.074887,0.908573,-0.609118,0.028004,0.637978,-0.525119,0.219843,0.031709,0.404327,...,0.600687,-1.945669,1.451733,-0.660580,0.580075,0.260480,0.468689,0.462840,-0.396998,0.228873
3,-1.332490,1.309603,-1.430819,-2.641728,-0.174157,-2.024473,-1.959949,-0.283013,-0.186330,0.140906,...,-1.146848,0.279705,1.489608,-0.130224,-0.184515,0.514922,-1.370145,0.977204,0.397306,-0.130708
4,-0.996555,0.723427,-0.306079,1.248816,-1.607930,-1.739381,1.506920,-0.358101,-0.777798,-0.663208,...,-0.300453,-0.194080,0.151264,-0.270356,-0.312743,-0.714342,0.253866,0.044179,0.382210,0.160421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3249,-0.688503,-0.441396,0.001201,-0.171675,-0.048685,0.532386,-0.287857,-0.851329,-0.577856,0.549965,...,0.466619,-1.119413,-0.348728,-1.183785,-0.261618,0.322894,1.590205,0.216578,0.689742,0.307559
3250,-0.543749,0.485287,-0.022617,0.065935,0.308622,0.991755,-0.586673,-0.121605,-0.137739,0.241433,...,1.049017,0.026628,-0.255477,-1.374012,-0.883866,-0.736975,2.061690,0.209864,0.405819,0.670536
3251,-0.600240,0.266668,-0.795783,0.030973,0.309906,1.247758,-0.702829,-0.314766,0.223497,-0.257166,...,-1.986251,-0.549986,0.420335,-0.106474,-1.571122,-0.600891,-1.733069,-0.147002,-1.000879,0.945081
3252,-0.623305,-0.148459,-0.339792,-0.491391,0.355114,1.240355,-0.058591,-0.178118,-0.328614,-0.245164,...,0.494287,-1.093519,-1.627087,0.504154,-1.756125,0.190346,-0.132808,0.034401,-2.030696,-0.465552


In [13]:
df_svd['words'] = df_co_new.columns

In [14]:
df_svd.set_index("words",inplace = True)

In [15]:
df_svd


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
electronics,-0.886402,0.861556,1.626613,1.240278,0.155198,0.553120,0.432701,0.634562,-0.090697,0.788849,...,-1.039510,0.120705,1.148060,1.560173,-1.294518,-1.049759,-0.540123,-0.181462,0.581130,0.491934
moment,-1.344424,-1.148996,0.330978,0.138221,1.435915,-1.119131,-0.725651,-0.356198,0.303466,-0.191695,...,-0.102726,1.984205,-0.563851,-0.773651,1.520311,-0.654480,0.342524,0.709109,-0.194458,-0.329935
guide,-0.717304,0.074887,0.908573,-0.609118,0.028004,0.637978,-0.525119,0.219843,0.031709,0.404327,...,0.600687,-1.945669,1.451733,-0.660580,0.580075,0.260480,0.468689,0.462840,-0.396998,0.228873
issue,-1.332490,1.309603,-1.430819,-2.641728,-0.174157,-2.024473,-1.959949,-0.283013,-0.186330,0.140906,...,-1.146848,0.279705,1.489608,-0.130224,-0.184515,0.514922,-1.370145,0.977204,0.397306,-0.130708
u,-0.996555,0.723427,-0.306079,1.248816,-1.607930,-1.739381,1.506920,-0.358101,-0.777798,-0.663208,...,-0.300453,-0.194080,0.151264,-0.270356,-0.312743,-0.714342,0.253866,0.044179,0.382210,0.160421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tear,-0.688503,-0.441396,0.001201,-0.171675,-0.048685,0.532386,-0.287857,-0.851329,-0.577856,0.549965,...,0.466619,-1.119413,-0.348728,-1.183785,-0.261618,0.322894,1.590205,0.216578,0.689742,0.307559
grip,-0.543749,0.485287,-0.022617,0.065935,0.308622,0.991755,-0.586673,-0.121605,-0.137739,0.241433,...,1.049017,0.026628,-0.255477,-1.374012,-0.883866,-0.736975,2.061690,0.209864,0.405819,0.670536
stress,-0.600240,0.266668,-0.795783,0.030973,0.309906,1.247758,-0.702829,-0.314766,0.223497,-0.257166,...,-1.986251,-0.549986,0.420335,-0.106474,-1.571122,-0.600891,-1.733069,-0.147002,-1.000879,0.945081
intent,-0.623305,-0.148459,-0.339792,-0.491391,0.355114,1.240355,-0.058591,-0.178118,-0.328614,-0.245164,...,0.494287,-1.093519,-1.627087,0.504154,-1.756125,0.190346,-0.132808,0.034401,-2.030696,-0.465552


In [17]:
df_svd = pd.read_csv('models/lsa.csv')
df_svd = df_svd.set_index("words")
def k_nearest_words_with_word(embedding_df, target_word, k):
    # Extract the embedding for the target word
    target_embedding = embedding_df.loc[target_word].values.reshape(1, -1)

    # Calculate cosine similarity with all other words
    similarities = cosine_similarity(embedding_df.values, target_embedding)

    # Get indices of k-nearest words (excluding the target word itself)
    nearest_word_indices = similarities.flatten().argsort()[:(-k-1)-1:-1][1:]

    # Get the k-nearest words
    nearest_words = [(i,round(j[0],4)) for i,j in zip(embedding_df.index[nearest_word_indices].tolist(), similarities[nearest_word_indices])]

    return nearest_words
target_word = 'watching'
k_nearest = k_nearest_words_with_word(df_svd, target_word, k=5)
print(k_nearest)


[('tv', 0.2251), ('pundit', 0.2161), ('television', 0.2138), ('recorder', 0.2114), ('viewer', 0.2031)]


In [18]:
df_svd.to_csv("models/lsa.csv",index = True)

In [19]:
def get_word_embedding(string):
    return df_svd.loc[string].to_numpy()

In [20]:
len(get_word_embedding("life"))

300

In [21]:

df_svd = pd.read_csv("models/lsa.csv")
df_svd = df_svd.set_index("words")

def k_nearest_words_with_embedding(embedding_df, target_embedding, k=5):
    # Extract the embedding for the target word
    target_embedding = target_embedding.reshape(1, -1)

    # Calculate cosine similarity with all other words
    similarities = cosine_similarity(embedding_df.values, target_embedding)
#     print(sorted(similarities))
    # Get indices of k-nearest words (excluding the target word itself)
    nearest_word_indices = similarities.flatten().argsort()[:-k-1:-1]
    print(similarities.flatten())
    print(nearest_word_indices)
    # Get the k-nearest words
    nearest_words = [(i,round(j[0],4)) for i,j in zip(embedding_df.index[nearest_word_indices].tolist(), similarities[nearest_word_indices])]

    return nearest_words


In [22]:
list  = []

for i in range(300):
    list.append(1)

list = np.asarray(list)
k_nearest = k_nearest_words_with_embedding(df_svd, list, k=10)
print(k_nearest)


[ 0.01606344  0.05708776  0.14492659 ... -0.12341199 -0.02159511
 -0.03296379]
[ 993  828 2392 2735 2677 2799 3049 2564 1794  724]
[('energy', 0.1733), ('fuel', 0.1722), ('weak', 0.1693), ('currency', 0.1638), ('innovative', 0.1615), ('carmaker', 0.1578), ('material', 0.1567), ('dangerous', 0.1512), ('suspicion', 0.149), ('heineken', 0.1486)]


In [23]:
k_nearest = k_nearest_words_with_word(df_svd, 'queen', k=10)
k_nearest

[('mary', 0.2721),
 ('scot', 0.258),
 ('life', 0.2568),
 ('sky', 0.2437),
 ('prince', 0.2373),
 ('fail', 0.2212),
 ('arrangement', 0.2166),
 ('bruce', 0.2137),
 ('diary', 0.2077),
 ('account', 0.1978)]

In [None]:
# Get word embeddings for each word in the vocabulary, write to file
f = open(f'models/vectors_lsa_300.txt', 'w')

# Create columns for the words and the values in the matrix, makes it easier to read as dataframe
columns = ["word"] + [f"value_{i+1}" for i in range(df_svd.shape[1])]

# Start writing to the file, start with the column names
f.write(" ".join(columns))
f.write("\n")

for i in range(df_svd.shape[0]):
    f.write(df_svd.index[i])
    f.write(" ")
    f.write(" ".join(map(str, list(df_svd.iloc[i,]))))
    f.write("\n")
f.close()