In [1]:
import gc
import os
import shutil
import subprocess

import dask.dataframe as dd
import kagglehub
import numpy as np
import pandas as pd

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Datasets download

Requires a [Kaggle API](https://www.kaggle.com/docs/api#authentication) token stored at one of these paths:
```
~/.kaggle/kaggle.json

~/.config/kaggle/kaggle.json
```

Used datasets:
- [Main OSes terminal commands](https://www.kaggle.com/datasets/vaibhavdlights/linuxcmdmacos-commands)
- [Wikipedia sentences](https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences)
- [Wikipedia plaintext 2023](https://www.kaggle.com/datasets/jjinho/wikipedia-20230701)

In [None]:
# Data storage
# os.makedirs(..., exist_ok=True) is portable and safe to re-run, unlike a bare `mkdir`.
os.makedirs("data", exist_ok=True)


def _fetch_dataset(handle, destination):
    """Download a Kaggle dataset and place its files at `destination`.

    The download and move are skipped when `destination` already exists, so
    re-running the cell neither re-downloads the data nor nests a second copy
    inside the existing directory.

    Parameters
    ----------
    handle : str
        Kaggle dataset handle in "<owner>/<dataset>" form.
    destination : str
        Directory that should hold the dataset files.

    Returns
    -------
    str
        `destination`, i.e. where the dataset lives after the call.

    Raises
    ------
    Any error from `kagglehub.dataset_download` or `shutil.move` propagates,
    instead of being silently ignored like a failing unchecked `mv`.
    """
    if not os.path.isdir(destination):
        # `destination` does not exist here, so the downloaded directory is
        # renamed to it rather than moved inside it.
        shutil.move(kagglehub.dataset_download(handle), destination)
    return destination


# CLI commands dataset
pathCommands = _fetch_dataset("vaibhavdlights/linuxcmdmacos-commands", "data/commands")

# Wikipedia sentences dataset
pathWiki = _fetch_dataset("mikeortman/wikipedia-sentences", "data/wikiSen")

# Wikipedia plaintext 2023 dataset
pathWikiPlain = _fetch_dataset("jjinho/wikipedia-20230701", "data/wikiPlain")

# Data manipulation

Removing punctuation and formatting the text, then joining the separate files into a single `.txt` file so that only plain text remains, ready for unsupervised learning.

In [2]:
# Read the per-shell command tables and stack them into one frame.
# The individual frames are never bound to names, so nothing has to be
# deleted afterwards; only the combined frame stays alive.
commandsDf = pd.concat(
    [
        pd.read_csv(csvPath)
        for csvPath in (
            'data/commands/linux_commands.csv',
            'data/commands/cmd_commands.csv',
            'data/commands/macos_commands.csv',
            'data/commands/vbscript_commands.csv',
        )
    ],
    ignore_index=True,
)

# Release the temporary per-file frames
gc.collect()

25

In [None]:
# Joining data frames

# exist_ok=True keeps the cell re-runnable (a bare `mkdir` fails the second time).
os.makedirs("data/clean", exist_ok=True)

# Commands dataset: one CSV per shell/OS, stacked into a single frame
commandsDf = pd.concat(
    [
        pd.read_csv(csvPath)
        for csvPath in (
            'data/commands/linux_commands.csv',
            'data/commands/cmd_commands.csv',
            'data/commands/macos_commands.csv',
            'data/commands/vbscript_commands.csv',
        )
    ],
    ignore_index=True,
)
gc.collect()


# Removing duplicate columns (indexes).
# NOTE: 'description' is deliberately NOT dropped here. It is cleaned right
# below, and dropping it first made that step fail with a KeyError.
# NOTE(review): if another column was meant to be removed, drop it here.
commandsDf = commandsDf.drop(columns=['Unnamed: 0'])

# Removing unwanted parts of strings (bullet markers, with or without a
# leading space). regex=False makes the literal match explicit.
commandsDf['description'] = (
    commandsDf['description']
    .str.replace(' •', '', regex=False)
    .str.replace('•', '', regex=False)
)

# Saving the data frame to a plain text file
commandsDf.to_csv('data/clean/commandsDf.txt', sep='\t', index=False, header=False)

del commandsDf
gc.collect()


# wikiPlain dataset
wikiPlainDf = dd.read_parquet('data/wikiPlain/*.parquet')

# Processing the wikiPlain dataset in chunks: each partition is materialised
# on its own via compute(), written out and released before the next one.
#
# The output file is opened once in 'w' mode, so a re-run starts from an empty
# file. Appending with mode='a' on every chunk kept the previous run's content
# and duplicated the corpus on each re-execution.
# newline='' leaves line-terminator handling to the CSV writer.
with open('data/clean/wikiPlain.txt', 'w', encoding='utf-8', newline='') as wikiPlainOut:
    for chunk in wikiPlainDf.to_delayed():
        chunk_df = chunk.compute()

        chunk_df = chunk_df.drop(columns=['id', 'categories'])

        chunk_df.to_csv(wikiPlainOut, sep='\t', index=False, header=False)

        del chunk_df
        gc.collect()

del wikiPlainDf
gc.collect()


# Joining all the text files
#
# The three parts are streamed into the full dataset file one after another,
# in the same order as before (commands, Wikipedia sentences, Wikipedia
# plaintext). Copying in binary mode keeps the bytes exactly as they are on
# disk and avoids holding the large inputs in memory as Python strings.
with open('data/clean/dataFull.txt', 'wb') as dataFullFile:
    for partPath in (
        'data/clean/commandsDf.txt',
        'data/wikiSen/wikisent2.txt',
        'data/clean/wikiPlain.txt',
    ):
        with open(partPath, 'rb') as partFile:
            shutil.copyfileobj(partFile, dataFullFile)

# Training word vectors

Word representation

Data split - 80% training 20% test

CBOW vs skipgrams

In [None]:
from numpy.linalg import norm
import fasttext
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Splitting the data

# Read the merged corpus written by the data-manipulation step
# ('data/clean/dataFull.txt'). No cell in this notebook creates 'data/data.txt',
# so reading that path fails on a fresh run.
with open('data/clean/dataFull.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()

# 80% training / 20% test, shuffled with a fixed seed for reproducibility
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

# Saving the split data to new files
with open('data/train_data.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(train_data)

with open('data/test_data.txt', 'w', encoding='utf-8') as test_file:
    test_file.writelines(test_data)

# TODO keep commandsDf.txt in the train data

Read 127M words
Number of words:  486699
Number of labels: 0
Progress: 100.0% words/sec/thread:   52007 lr:  0.000000 avg.loss:  0.485565 ETA:   0h 0m 0s53s  70494 lr:  0.048440 avg.loss:  1.697119 ETA:   0h20m51s  70493 lr:  0.048086 avg.loss:  1.669816 ETA:   0h20m42s  6.6% words/sec/thread:   69553 lr:  0.046714 avg.loss:  1.605509 ETA:   0h20m23s 1.518650 ETA:   0h19m50s 0.045232 avg.loss:  1.483721 ETA:   0h19m42s 10.2% words/sec/thread:   69605 lr:  0.044902 avg.loss:  1.464940 ETA:   0h19m35s lr:  0.043324 avg.loss:  1.413436 ETA:   0h18m55s 13.6% words/sec/thread:   69489 lr:  0.043212 avg.loss:  1.411198 ETA:   0h18m52s 1.410845 ETA:   0h18m52s 13.6% words/sec/thread:   69462 lr:  0.043187 avg.loss:  1.410705 ETA:   0h18m52s 14.2% words/sec/thread:   69289 lr:  0.042906 avg.loss:  1.405271 ETA:   0h18m48s 18.9% words/sec/thread:   67672 lr:  0.040567 avg.loss:  1.279591 ETA:   0h18m12ss 0.040246 avg.loss:  1.247219 ETA:   0h18m 7s 1.210666 ETA:   0h18m 2s 0.039320 avg.loss:  1

In [None]:
# Train the CBOW and skip-gram variants on the training split.
# These calls must run in this cell: with them commented out, `modelCbow` and
# `modelSkipgram` are undefined on a fresh kernel and the save calls below
# raise NameError.
# modelDef = fasttext.train_unsupervised('data/train_data.txt')
modelCbow = fasttext.train_unsupervised('data/train_data.txt', 'cbow')
modelSkipgram = fasttext.train_unsupervised('data/train_data.txt', 'skipgram')

# Saving different model versions to binary file
# exist_ok=True keeps the cell re-runnable once the directory exists.
os.makedirs("result", exist_ok=True)

# modelDef.save_model("result/model1.bin")
modelCbow.save_model("result/modelCbow.bin")
modelSkipgram.save_model("result/modelSkipgram.bin")

Read 127M words
Number of words:  486699
Number of labels: 0
Progress: 100.0% words/sec/thread:   99146 lr:  0.000000 avg.loss:  1.046694 ETA:   0h 0m 0s lr:  0.047926 avg.loss:  1.859780 ETA:   0h 9m40s 9m27s words/sec/thread:  149067 lr:  0.046768 avg.loss:  1.750912 ETA:   0h 9m31s32s 16.2% words/sec/thread:  145456 lr:  0.041901 avg.loss:  1.568963 ETA:   0h 8m44s avg.loss:  1.555797 ETA:   0h 8m44s% words/sec/thread:  134246 lr:  0.038837 avg.loss:  1.503370 ETA:   0h 8m47s 0.038825 avg.loss:  1.502928 ETA:   0h 8m47s avg.loss:  1.473101 ETA:   0h 8m46s% words/sec/thread:  123724 lr:  0.034045 avg.loss:  1.394181 ETA:   0h 8m21s words/sec/thread:  122907 lr:  0.033567 avg.loss:  1.385990 ETA:   0h 8m17s 122863 lr:  0.033545 avg.loss:  1.385620 ETA:   0h 8m17s 121384 lr:  0.032575 avg.loss:  1.370889 ETA:   0h 8m 8s 41.5% words/sec/thread:  117372 lr:  0.029262 avg.loss:  1.327538 ETA:   0h 7m34s 0.029032 avg.loss:  1.321859 ETA:   0h 7m32s22s ETA:   0h 7m19s% words/sec/thread:  11

In [2]:
# Full dataset training

# The merged corpus is written to 'data/clean/dataFull.txt' by the
# data-manipulation step; no cell creates 'data/data.txt'.
modelCbowFull = fasttext.train_unsupervised('data/clean/dataFull.txt', 'cbow')
modelSkipgramFull = fasttext.train_unsupervised('data/clean/dataFull.txt', 'skipgram')

# Make sure the output directory exists even when this cell runs on its own
os.makedirs('result', exist_ok=True)

modelCbowFull.save_model('result/modelCbowFull.bin')
modelSkipgramFull.save_model('result/modelSkipgramFull.bin')

Read 159M words
Number of words:  570697
Number of labels: 0
Progress: 100.0% words/sec/thread:   97950 lr:  0.000000 avg.loss:  0.935774 ETA:   0h 0m 0s 13.6% words/sec/thread:  153391 lr:  0.043181 avg.loss:  1.432496 ETA:   0h10m41s 14.6% words/sec/thread:  151756 lr:  0.042692 avg.loss:  1.423745 ETA:   0h10m40s words/sec/thread:  150811 lr:  0.042458 avg.loss:  1.417306 ETA:   0h10m41s words/sec/thread:  148730 lr:  0.041952 avg.loss:  1.397673 ETA:   0h10m42s lr:  0.040701 avg.loss:  1.364990 ETA:   0h10m42s words/sec/thread:  144092 lr:  0.040676 avg.loss:  1.364731 ETA:   0h10m42s10m43s 0.039385 avg.loss:  1.338430 ETA:   0h10m47sh10m41s 35.7% words/sec/thread:  121680 lr:  0.032137 avg.loss:  1.214525 ETA:   0h10m 1sh 9m59s 36.5% words/sec/thread:  120972 lr:  0.031733 avg.loss:  1.208570 ETA:   0h 9m57s 119215 lr:  0.030757 avg.loss:  1.196144 ETA:   0h 9m47ss 40.4% words/sec/thread:  117932 lr:  0.029808 avg.loss:  1.190069 ETA:   0h 9m35s lr:  0.029716 avg.loss:  1.189825 E

In [68]:
# CommandsDf dataset training

# The cleaned commands text is saved under 'data/clean/', not directly in 'data/'.
modelSkipgramCommands = fasttext.train_unsupervised('data/clean/commandsDf.txt', 'skipgram')

# Make sure the output directory exists even when this cell runs on its own
os.makedirs('result', exist_ok=True)

modelSkipgramCommands.save_model('result/modelSkipgramCommands.bin')

Read 0M words
Number of words:  202
Number of labels: 0
Progress: 100.0% words/sec/thread:   49649 lr:  0.000000 avg.loss:  4.123011 ETA:   0h 0m 0s


In [69]:
# modelSkipgram = fasttext.load_model("result/modelSkipgram.bin")
# Restore the saved skip-gram models (full corpus first, then commands-only)
modelSkipgramFull, modelSkipgramCommands = map(
    fasttext.load_model,
    ("result/modelSkipgramFull.bin", 'result/modelSkipgramCommands.bin'),
)

In [None]:
# Convert fasttext to tensorflow

# Vocabulary and vector size of the full-corpus skip-gram model
words = modelSkipgramFull.get_words()
embedding_dim = modelSkipgramFull.get_dimension()


# Lookup table: word -> row number in the embedding matrix
word_index = dict(zip(words, range(len(words))))

# Row i of the matrix holds the fastText vector of the word mapped to i
embedding_matrix = np.zeros((len(words), embedding_dim))
for token, row in word_index.items():
    embedding_matrix[row] = modelSkipgramFull.get_word_vector(token)


# TensorFlow embedding layer initialised from the matrix and kept non-trainable
embedding_layer = tf.keras.layers.Embedding(
    len(words),
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
)


'''
def get_word_embedding(word):
    word_id = word_index.get(word)
    if word_id is None:
        raise ValueError(f"Word '{word}' not in vocabulary.")
    return embedding_layer(tf.constant([word_id]))[0].numpy()
'''

def get_word_embedding(word):
    """Return the embedding vector for `word`.

    Words present in `word_index` are looked up in `embedding_layer`; any
    other word is handed to `modelSkipgramFull.get_word_vector`, so
    out-of-vocabulary input still yields a vector.
    """
    word_id = word_index.get(word)
    if word_id is None:
        # OOV: fall back to FastText's subword information
        return modelSkipgramFull.get_word_vector(word)
    # In vocabulary: single-row lookup in the embedding layer
    return embedding_layer(tf.constant([word_id]))[0].numpy()

def get_multiple_embeddings(words):
    """Look up the embeddings of several words in one layer call.

    Words missing from `word_index` are skipped, so the returned array can
    have fewer rows than `words` has entries.
    """
    word_ids = []
    for word in words:
        if word in word_index:
            word_ids.append(word_index[word])
    return embedding_layer(tf.constant(word_ids)).numpy()


def most_similar_word(input_word):
    """Find the vocabulary word whose embedding is closest to `input_word`.

    Closeness is cosine similarity. The whole vocabulary is scored with one
    batched embedding lookup and one `cosine_similarity` call, instead of one
    layer call and one similarity call per vocabulary word.

    Parameters
    ----------
    input_word : str
        Query word; resolved through `get_word_embedding`.

    Returns
    -------
    tuple
        `(word, similarity)` for the best match. When several words tie, the
        one that comes first in `word_index` wins. Returns `(None, -1)` when
        the vocabulary is empty or no similarity exceeds -1.

    Notes
    -----
    The query itself is not excluded: if `input_word` is in `word_index`, it
    is scored like every other entry.
    """
    # Get the embedding of the input word as a (1, dim) row for sklearn
    input_vector = get_word_embedding(input_word).reshape(1, -1)

    if not word_index:
        return None, -1

    vocab_words = list(word_index.keys())
    vocab_ids = list(word_index.values())

    # One batched lookup for every vocabulary entry, aligned with vocab_words
    vocab_vectors = embedding_layer(tf.constant(vocab_ids)).numpy()

    # Similarity of the query against every vocabulary vector at once
    similarities = cosine_similarity(input_vector, vocab_vectors)[0]

    # argmax returns the first maximum, like a strict '>' scan
    best = int(np.argmax(similarities))
    max_similarity = similarities[best]
    if not max_similarity > -1:
        return None, -1

    return vocab_words[best], max_similarity

# similar_word, similarity_score = most_similar_word("bot")
# print(f"The most similar word to 'example' is '{similar_word}' with a cosine similarity of {similarity_score:.4f}")

In [74]:
def most_similar_word_optimized(input_word):
    """Find the vocabulary word closest to `input_word` by cosine similarity.

    All vocabulary embeddings are read from `embedding_layer` at once and
    scored with a single matrix-vector product.

    Parameters
    ----------
    input_word : str
        Query word; resolved through `get_word_embedding`.

    Returns
    -------
    tuple
        `(word, similarity)` where `word` is the key of `word_index` at the
        position of the highest similarity.

    Notes
    -----
    Zero-length vectors are left unscaled instead of being divided by zero.
    Their similarity is then 0 rather than NaN, and a NaN would otherwise be
    picked by `np.argmax` as the "best" match.
    """
    # Get the embedding of the input word
    input_vector = get_word_embedding(input_word)

    # Normalize the input vector (skip when its norm is 0 to avoid NaN)
    input_norm = norm(input_vector)
    if input_norm > 0:
        input_vector = input_vector / input_norm

    # Extract all word embeddings in the vocabulary from the embedding layer
    all_embeddings = embedding_layer.weights[0].numpy()

    # Normalize the embedding matrix row-wise; zero rows keep a divisor of 1
    row_norms = np.linalg.norm(all_embeddings, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    all_embeddings = all_embeddings / row_norms

    # Cosine similarities as a dot product between all embeddings and the input
    similarities = np.dot(all_embeddings, input_vector)

    # Find the index of the maximum similarity and map it back to its word
    most_similar_idx = int(np.argmax(similarities))
    best_word = list(word_index.keys())[most_similar_idx]
    max_similarity = similarities[most_similar_idx]

    return best_word, max_similarity

# Demo: nearest vocabulary entry for a shell command name
similar_word, similarity_score = most_similar_word_optimized("mkdir")
print(
    f"The most similar word is '{similar_word}' with a cosine similarity of {similarity_score:.4f}"
)

The most similar word is 'job' with a cosine similarity of 0.3667


In [None]:
# Cosine similarity between word vectors

# Embed both words and shape each vector as a (1, dim) row, which is the
# 2-D input that cosine_similarity expects
word1_embedding, word2_embedding = (
    get_word_embedding(token).reshape(1, -1) for token in ("mkdir", "mikdr")
)

# The result is a 1x1 matrix; take its single entry
similarity = cosine_similarity(word1_embedding, word2_embedding)[0][0]
print(similarity)

0.6302575


In [70]:
# Convert fasttext to tensorflow

# Vocabulary and vector size of the commands-only skip-gram model
words = modelSkipgramCommands.get_words()
embedding_dim = modelSkipgramCommands.get_dimension()


# Lookup table: word -> row number in the embedding matrix
word_index = dict(zip(words, range(len(words))))

# Row i of the matrix holds the fastText vector of the word mapped to i
embedding_matrix = np.zeros((len(words), embedding_dim))
for token, row in word_index.items():
    embedding_matrix[row] = modelSkipgramCommands.get_word_vector(token)


# TensorFlow embedding layer initialised from the matrix and kept non-trainable
embedding_layer = tf.keras.layers.Embedding(
    len(words),
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
)


'''
def get_word_embedding(word):
    word_id = word_index.get(word)
    if word_id is None:
        raise ValueError(f"Word '{word}' not in vocabulary.")
    return embedding_layer(tf.constant([word_id]))[0].numpy()
'''

def get_word_embedding(word):
    """Return the embedding vector for `word` from the commands model.

    Words present in `word_index` are looked up in `embedding_layer`; any
    other word is handed to `modelSkipgramCommands.get_word_vector`, so
    out-of-vocabulary input still yields a vector.
    """
    word_id = word_index.get(word)
    if word_id is None:
        # OOV: fall back to FastText's subword information
        return modelSkipgramCommands.get_word_vector(word)
    # In vocabulary: single-row lookup in the embedding layer
    return embedding_layer(tf.constant([word_id]))[0].numpy()

def get_multiple_embeddings(words):
    """Return the embeddings of the given words via one layer call.

    Only words found in `word_index` contribute a row; unknown words are
    silently left out of the result.
    """
    known_ids = []
    for candidate in words:
        if candidate in word_index:
            known_ids.append(word_index[candidate])
    return embedding_layer(tf.constant(known_ids)).numpy()

