In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Function to scrape code snippets from GeeksforGeeks page in batches
def scrape_gfg_code_batch(url_list, batch_size, timeout=10):
    """Scrape code snippets from GeeksforGeeks pages, walking the URLs in batches.

    Every page is downloaded with ``requests`` and parsed with BeautifulSoup.
    The stripped text of each ``<div class="code-container">`` element is
    collected after removing ``# ... This code is contributed by ...``
    attribution comments.

    Args:
        url_list: Sequence of page URLs to fetch.
        batch_size: Number of URLs taken per batch; must be at least 1.
            Batches are processed one after another, so this only controls
            how the URL list is sliced, not the returned result.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A flat list of snippet strings, ordered by URL and then by position
        in the page.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A non-positive step would either crash inside range() or silently
    # scrape nothing, so reject it up front with a clear message.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {!r}".format(batch_size))

    # Compiled once; MULTILINE lets `$` match at the end of every line so the
    # attribution comment is removed wherever it appears in a snippet.
    contributed_comment_pattern = re.compile(
        r'#.*?This code is contributed by.*?$', flags=re.MULTILINE
    )

    all_code_snippets = []
    for batch_start in range(0, len(url_list), batch_size):
        for url in url_list[batch_start:batch_start + batch_size]:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            for container in soup.find_all('div', {'class': 'code-container'}):
                snippet = contributed_comment_pattern.sub('', container.text.strip())
                all_code_snippets.append(snippet)

    return all_code_snippets


In [None]:
# GeeksforGeeks tutorial pages that make up the snippet corpus
urls = [
    'https://www.geeksforgeeks.org/python-basics/',
    'https://www.geeksforgeeks.org/python-string/',
    'https://www.geeksforgeeks.org/python-lists/',
]

# Number of URLs the scraper takes per batch
batch_size = 2

# Fetch every page and show the raw snippets that were collected
code_snippets = scrape_gfg_code_batch(url_list=urls, batch_size=batch_size)
print(code_snippets)

In [None]:
# Build the vocabulary from the scraped snippets, then encode every snippet
# as a list of integer token ids.
# NOTE(review): Tokenizer() runs with its default settings here; confirm that
# the default text filtering / lower-casing is what is wanted for source code.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(code_snippets)
code_snippets_seq = list(tokenizer.texts_to_sequences_generator(code_snippets))


In [None]:
# Pad sequences
# All snippets must share one length before they can be batched, so each one
# is right-padded with zeros up to the longest encoded snippet.
# len() is used (not truthiness) so the check also works when this cell is
# re-run and code_snippets_seq is already a padded NumPy array.
if len(code_snippets_seq) == 0:
    raise ValueError(
        "No code snippets available: the scraper returned nothing, "
        "so there is no data to pad or train on."
    )

max_sequence_len = max(len(seq) for seq in code_snippets_seq)
if max_sequence_len == 0:
    raise ValueError(
        "Every code snippet was encoded to an empty sequence; "
        "check the scraped text and the tokenizer settings."
    )

code_snippets_seq = pad_sequences(code_snippets_seq, maxlen=max_sequence_len, padding='post')

# Define model architecture
embedding_dim = 100  # Dimension of word embeddings
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding token

In [None]:
# Sequence model: token embedding -> two stacked LSTMs -> dense projection
# with one output unit per sequence position.
# NOTE(review): the output layer uses tanh (values in [-1, 1]) and the loss is
# KL divergence, while the targets passed to fit() are the padded integer
# token-id sequences. Confirm this pairing is intended; a per-token softmax
# over vocab_size trained with sparse categorical cross-entropy is the usual
# setup, but it would change the output shape that generate_output consumes.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_len),
    LSTM(256, return_sequences=True),
    LSTM(64, activation="softmax"),
    Dense(max_sequence_len, activation='tanh'),
])

# Configure the optimizer and the training objective
model.compile(optimizer='adam', loss='kullback_leibler_divergence')

# Fit the network; the padded sequences serve as both inputs and targets
model.fit(x=code_snippets_seq, y=code_snippets_seq, batch_size=64, epochs=350)

In [None]:
# Save the model (currently disabled; uncomment the line below to persist the trained model)
#model.save('lstm_code_generation_model')

In [None]:
# Function to generate output based on prompt
def generate_output(model, tokenizer, prompt_text, max_output_length=100):
    """Run the model on a text prompt and decode its prediction back into text.

    Args:
        model: Trained model; ``model.predict`` is called on the padded prompt.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        prompt_text: Prompt string to encode and feed to the model.
        max_output_length: Maximum number of predicted token ids to decode
            (``None`` keeps all of them).

    Returns:
        The decoded text for the prompt. It can be an empty string when no
        predicted id corresponds to a vocabulary entry.
    """
    # The prompt is padded to the notebook-level ``max_sequence_len`` so that
    # it has the same width as the training sequences.
    prompt_sequence = tokenizer.texts_to_sequences([prompt_text])
    prompt_sequence_padded = pad_sequences(prompt_sequence, maxlen=max_sequence_len, padding='post')

    prediction = np.asarray(model.predict(prompt_sequence_padded))

    # The tokenizer's vocabulary lookup is keyed by integers, so the raw float
    # prediction has to be turned into integer token ids before decoding.
    if prediction.ndim == 3:
        # (batch, position, vocabulary) scores: best-scoring id per position.
        candidate_ids = prediction[0].argmax(axis=-1)
    else:
        # (batch, position) real-valued output: snap to the nearest integer.
        candidate_ids = np.rint(prediction[0])

    # Drop non-finite values and padding/negative ids (id 0 is the pad slot),
    # then honour the requested output length.
    token_ids = [int(value) for value in candidate_ids if np.isfinite(value) and value > 0]
    token_ids = token_ids[:max_output_length]

    return tokenizer.sequences_to_texts([token_ids])[0]

# Natural-language request used to exercise the trained model
prompt = "Write a Python function to add two numbers."

# Decode the model's response to the prompt
generated_output = generate_output(model, tokenizer, prompt_text=prompt)

# Show the result under a heading
print("Generated Python code based on the prompt:", generated_output, sep="\n")