In [11]:
import nltk
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
import os
import numpy as np
import keyword
import re

# Define a function to extract Python files from a directory
def extract_python_files(directory):
    python_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".py"):
                python_files.append(os.path.join(root, file))
    return python_files

# Define a function to preprocess Python code and handle variable names
def preprocess_python_code(file_paths):
    code_corpus = ""
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            code = file.read()
            code_corpus += code
    return code_corpus

# Tokenize the code while handling keywords, variables, and special characters
def tokenize_code(code):
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+|\s+')
    tokens = tokenizer.tokenize(code)
    
    keywords = set(keyword.kwlist)
    tokenized_code = []
    variable_counter = {}
    variable_id = 0
    
    for token in tokens:
        if token in keywords:
            tokenized_code.append(f"<KEYWORD:{token}>")
        elif token.isidentifier() and not token.isnumeric():
            if token not in variable_counter:
                variable_counter[token] = f"<VAR{variable_id}>"
                variable_id += 1
            tokenized_code.append(variable_counter[token])
        elif re.match(r'\s+', token):  # Preserve whitespace
            tokenized_code.append(token)
        else:
            tokenized_code.append(token)
    
    return tokenized_code

# Extract Python files from the numpy directory
directory = "/Users/krishpatel/Desktop/Skipgram_Implementation/numpy"
numpy_files = extract_python_files(directory)

# Preprocess Python code from numpy
numpy_code = preprocess_python_code(numpy_files)

# Tokenize the code
dictionary_tokens = tokenize_code(numpy_code)

# Reduce vocabulary size here
vocab_size = 500000  # 500k words
word_freq = nltk.FreqDist(dictionary_tokens)
top_words = [word for word, _ in word_freq.most_common(vocab_size)]
training_data = [word if word in top_words else "<UNK>" for word in dictionary_tokens]

# Group tokens into sentences based on newline characters
lines = ' '.join(training_data).split('\n')

# Create context windows of 2-3 sentences
context_windows = []
window_size = 3
for i in range(len(lines) - window_size + 1):
    window = lines[i:i + window_size]
    context_windows.append(' '.join(window))

print("Number of context windows:", len(context_windows))

# Split context windows into tokens
sentences = [window.split() for window in context_windows]

# Build vocabulary
model = Word2Vec(vector_size=20, window=2, min_count=1, workers=4, sg=1)
model.build_vocab(sentences)

# Training the Word2Vec model with the following hyperparameters
epochs = 5
for epoch in range(epochs):
    print(f"Training epoch {epoch + 1}/{epochs}")
    model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=1)

# Provided sequence to start with
starting_sequence = ['<KEYWORD:if>', '<VAR1275>', '=', '<KEYWORD:else>', '<VAR5>']

# Generating a sequence starting with the provided tokens
sequence = starting_sequence.copy()
num_words_to_generate = 50

for i in range(num_words_to_generate):
    last_word = sequence[-1]
    if last_word in model.wv:
        predicted_contexts = [predicted[0] for predicted in model.wv.most_similar(last_word)]
        next_word = np.random.choice(predicted_contexts)
        sequence.append(next_word)
    else:
        print(f"Word '{last_word}' not in vocabulary")
        break

# Post-process generated sequence to format as code
generated_code = ' '.join(sequence).replace('<KEYWORD:', '').replace('>', '').replace('<VAR', 'var').replace('var', 'var_')
formatted_code = generated_code.replace(' <', '<').replace('> ', '>').replace('{ ', '{\n').replace('} ', '}\n').replace(' ;', ';\n')

print("Generated Sequence:")
print(formatted_code)


Number of context windows: 272525
Training epoch 1/5
Training epoch 2/5
Training epoch 3/5
Training epoch 4/5
Training epoch 5/5
Generated Sequence:
if var_1275 else var_5 var_64 var_1017 var_64 var_9313 var_13737 var_1434 var_33456 var_1434 var_28403 var_18088 var_1554 var_18883 var_18880 var_16909 var_29678 var_16226 var_29322 var_30840 var_18770 var_28338 var_29780 var_29291 var_15736 var_24992 var_29907 var_29953 var_28979 var_33455 var_14994 var_28338 var_1554 var_22006 var_19074 var_29578 var_30562 var_18483 var_28615 var_29832 var_28615 var_28979 var_28401 var_22024 var_23125 var_22024 var_33446 var_5331 var_28375 var_13482 var_16370 var_29807


In [14]:
# Provided sequence to start with
starting_sequence = ['<KEYWORD:if>', '<VAR1275>', '=', '<VAR1275>', ":"]

# Generating a sequence starting with the provided tokens
sequence = starting_sequence.copy()
num_words_to_generate = 50

for i in range(num_words_to_generate):
    last_word = sequence[-1]
    if last_word in model.wv:
        predicted_contexts = [predicted[0] for predicted in model.wv.most_similar(last_word)]
        next_word = np.random.choice(predicted_contexts)
        sequence.append(next_word)
    else:
        print(f"Word '{last_word}' not in vocabulary")
        break

# Post-process generated sequence to format as code
generated_code = ' '.join(sequence).replace('<KEYWORD:', '').replace('>', '').replace('<VAR', 'var').replace('var', 'var_')
formatted_code = generated_code.replace(' <', '<').replace('> ', '>').replace('{ ', '{\n').replace('} ', '}\n').replace(' ;', ';\n')

print("Generated Sequence:")
print(formatted_code)
print("First 10 words in the vocabulary:")
print(list(model.wv.index_to_key)[:10])

Generated Sequence:
if var_1275 = var_1275 : var_10256 ^+` var_2489 var_19283 var_11604 var_11818 var_25948 var_13767 var_9473 var_13767 var_27810 var_10754 var_13767 var_27810 var_1764 var_3080 var_20281 var_29028 var_29678 var_16909 var_13941 var_15615 var_23864 var_9416 var_33456 var_33397 var_18088 var_22882 var_23072 var_24301 var_20850 var_30108 var_28872 var_16695 var_14498 var_28198 var_29094 var_28410 var_18088 var_23972 var_13767 var_18088 var_22006 var_18088 var_22006 var_28442 var_30150 var_19103 var_22986 var_23955
First 10 words in the vocabulary:
[',', '.', '(', '=', ')', '<VAR584>', '1', '0', ':', '<VAR106>']


In [8]:
print("First 10 words in the vocabulary:")
print(list(model.wv.index_to_key)[:1000])

random_word = '<KEYWORD:if>'
num_words = 20
sequence = [random_word]
for i in range(num_words):
    last_word = sequence[-1]
    predicted_contexts = [predicted[0] for predicted in model.wv.most_similar(last_word)]
    next_word = np.random.choice(predicted_contexts)
    sequence.append(next_word)

# Printing this sequence separated with spaces
print("Generated Sequence:")
print(" ".join(sequence))


First 10 words in the vocabulary:
[' ', ',', '.', '(', '\n        ', '=', '\n    ', ')', '<VAR584>', '1', '0', ':', '<VAR106>', "'", '\n            ', '<VAR1293>', '[', '\n\n    ', '2', '#', '<VAR7>', '-', '):', '<VAR1286>', '<VAR1278>', "',", '<KEYWORD:def>', '3', '"', '<VAR396>', '<KEYWORD:is>', '\n', '`', '<KEYWORD:in>', '),', '<KEYWORD:if>', '<VAR58>', '],', '<KEYWORD:for>', '\n                ', '<VAR1275>', '\n\n        ', '<VAR5>', ']', '))', '5', '<KEYWORD:None>', '+', "('", '<VAR274>', '  ', '4', '([', '<KEYWORD:return>', '",', '])', '>>>', '<KEYWORD:and>', '"""', '*', '<KEYWORD:not>', '==', '<KEYWORD:True>', '<KEYWORD:with>', '<VAR618>', '<VAR1325>', '``', '<VAR145>', '<KEYWORD:False>', '<VAR206>', '<VAR2144>', '\n\n', '\n\n\n', '.,', '<VAR207>', '<VAR30>', '()', "')", '<VAR397>', '<KEYWORD:import>', '<VAR96>', '<KEYWORD:as>', "['", '<VAR7841>', "':", '10', '("', '<VAR205>', '<VAR880>', ').', '6', '<KEYWORD:from>', '/', '<VAR1573>', '<KEYWORD:or>', '<VAR298>', '\n            

In [44]:
import nltk
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
import os
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Define a function to extract Python files from a directory
def extract_python_files():
    python_files = []
    directory = "/Users/krishpatel/Desktop/Skipgram_Implementation/numpy"
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".py"):
                python_files.append(os.path.join(root, file))
    return python_files

# Define a function to preprocess Python code and handle variable names
def preprocess_python_code(file_paths):
    code_corpus = ""
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            code = file.read()
            code_corpus += code
    return code_corpus

# Extract Python files from the numpy directory
numpy_files = extract_python_files()

# Preprocess Python code from numpy
numpy_code = preprocess_python_code(numpy_files)

# Tokenize the code while keeping special characters
tokenizer = RegexpTokenizer(r'[\w\s]+|\S')

# Tokenize the code
dictionary_tokens = tokenizer.tokenize(numpy_code)

# Reduce vocabulary size here
vocab_size = 500000  # 500k words
word_freq = nltk.FreqDist(dictionary_tokens)
top_words = [word for word, _ in word_freq.most_common(vocab_size)]
training_data = [word if word in top_words else "<UNK>" for word in dictionary_tokens]

# Convert tokens into sentences
sentences = [[word] for word in training_data]
print("Number of sentences:", len(sentences))

# Build vocabulary
model = Word2Vec(vector_size=20, window=2, min_count=1, workers=4, sg=1)
model.build_vocab(sentences)

# Convert words to integers
word2idx = {word: idx for idx, word in enumerate(model.wv.index_to_key)}
idx2word = {idx: word for word, idx in word2idx.items()}
sequences = [[word2idx[word] for word in sent if word in word2idx] for sent in sentences]

# Padding sequences
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = np.array([seq + [0]*(max_seq_length-len(seq)) for seq in sequences])

# Define the LSTM model
embedding_dim = 20
model_lstm = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=embedding_dim, input_length=max_seq_length),
    LSTM(128),
    Dense(len(word2idx), activation='softmax')
])

# Compile the model
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
X = padded_sequences[:, :-1]
y = np.expand_dims(padded_sequences[:, 1:], axis=-1)
y = np.reshape(y, (y.shape[0], y.shape[1], 1))  # Ensure y has 3 dimensions
model_lstm.fit(X, y, epochs=10, batch_size=128)
random_word = 'if'
num_words = 20
sequence = [random_word]
for i in range(num_words): 
    last_word = sequence[-1]
    predicted_contexts = model.wv.most_similar(last_word)  # Adjust this line based on the model used
    next_word = np.random.choice(predicted_contexts)
    sequence.append(next_word)

# Printing this sequence separated with spaces
print("Generated Sequence:")
print(" ".join(sequence))



Number of sentences: 2187441
Epoch 1/10


ValueError: in user code:

    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/backend.py", line 4979, in <listcomp>
        inputs, [inp[0] for inp in flatted_inputs]

    ValueError: Exception encountered when calling layer 'lstm_1' (type LSTM).
    
    slice index 0 of dimension 0 out of bounds. for '{{node strided_slice_1}} = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1](transpose, strided_slice_1/stack, strided_slice_1/stack_1, strided_slice_1/stack_2)' with input shapes: [0,?,20], [1], [1], [1] and with computed input tensors: input[1] = <0>, input[2] = <1>, input[3] = <1>.
    
    Call arguments received by layer 'lstm_1' (type LSTM):
      • inputs=tf.Tensor(shape=(None, 0, 20), dtype=float32)
      • mask=None
      • training=True
      • initial_state=None


In [9]:
#intial code: 

import nltk
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
import os
import numpy as np

# Define a function to extract Python files from a directory
def extract_python_files():
    python_files = []
    directory = "/Users/krishpatel/Desktop/Skipgram_Implementation/numpy"
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".py"):
                python_files.append(os.path.join(root, file))
    return python_files

# Define a function to preprocess Python code and handle variable names
def preprocess_python_code(file_paths):
    code_corpus = ""
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            code = file.read()
            code_corpus += code
    return code_corpus

# Extract Python files from the numpy directory
numpy_files = extract_python_files()

# Preprocess Python code from numpy
numpy_code = preprocess_python_code(numpy_files)

# Tokenize the code while keeping special characters
tokenizer = RegexpTokenizer(r'[\w\s]+|\S')

# Tokenize the code
dictionary_tokens = tokenizer.tokenize(numpy_code)

# Reduce vocabulary size here
vocab_size = 500000  # 500k words
word_freq = nltk.FreqDist(dictionary_tokens)
top_words = [word for word, _ in word_freq.most_common(vocab_size)]
training_data = [word if word in top_words else "<UNK>" for word in dictionary_tokens]

# Convert tokens into sentences
sentences = [[word] for word in training_data]
print("Number of sentences:", len(sentences))

# Build vocabulary
model = Word2Vec(vector_size=20, window=2, min_count=1, workers=4, sg=1)
model.build_vocab(sentences)

# Training the Word2Vec model with the following hyperparameters
epochs = 5
for epoch in range(epochs):
    print(f"Training epoch {epoch + 1}/{epochs}")
    model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=1)

# Example output
random_word = 'if'
num_words = 20
sequence = [random_word]
for i in range(num_words): 
    last_word = sequence[-1]
    predicted_contexts = [predicted[0] for predicted in model.wv.most_similar(last_word)]
    next_word = np.random.choice(predicted_contexts)
    sequence.append(next_word)

# Printing this sequence separated with spaces
print("Generated Sequence:")
print(" ".join(sequence))

Number of sentences: 2187441
Training epoch 1/5
Training epoch 2/5
Training epoch 3/5
Training epoch 4/5
Training epoch 5/5
Generated Sequence:
if 
            f77flags  
subtract  
        return fd 
            version   for a in dividend 
            version  NpyIter_GetInnerFixedStrideArray 
        sub_class  dtype
        assert_array_equal 9989
    fname  
    s_medium  9989
    fname  
                case NPY_SHORT 
            ediff1d 
                case NPY_SHORT sysconfig  state
        int_2  
        if arg is None 
                if not success or args  for a in dividend


In [15]:
random_word = 'if'
num_words = 20
sequence = [random_word]
for i in range(num_words): 
    last_word = sequence[-1]
    predicted_contexts = [predicted[0] for predicted in model.wv.most_similar(last_word)]
    next_word = np.random.choice(predicted_contexts)
    sequence.append(next_word)

# Printing this sequence separated with spaces
print("Generated Sequence:")
print(" ".join(sequence))

KeyError: "Key 'if' not present in vocabulary"