In [None]:
import requests
import re
from bs4 import BeautifulSoup
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [None]:
# Function to scrape code snippets from GeeksforGeeks pages in batches
def scrape_gfg_code_batch(url_list, batch_size, timeout=10, output_path='code_snippets.txt'):
    """Scrape code snippets from a list of article URLs.

    Every ``<div class="code-container">`` on each page is treated as one
    snippet. ``#`` comments containing the phrase
    'This code is contributed by' are stripped from the snippet text. Each
    snippet is appended to ``output_path`` followed by a blank line, and all
    snippets are also returned.

    Args:
        url_list: Sequence of page URLs to fetch.
        batch_size: Number of URLs per slice of ``url_list``. Batching only
            controls how the list is sliced; pages are still fetched one at
            a time, in order.
        timeout: Seconds to wait for each HTTP request before giving up, so
            an unresponsive server cannot hang the notebook.
        output_path: Text file the snippets are appended to (UTF-8).

    Returns:
        A flat list of snippet strings, in page order.

    Raises:
        ValueError: If ``batch_size`` is 0 (raised by ``range``).
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # Matches a '#' comment containing the attribution phrase, through end of line.
    contributed_comment_pattern = re.compile(
        r'#.*?This code is contributed by.*?$', flags=re.MULTILINE
    )

    all_code_snippets = []

    # Iterate over the URL list one batch (slice) at a time
    for i in range(0, len(url_list), batch_size):
        batch_urls = url_list[i:i + batch_size]

        for url in batch_urls:
            response = requests.get(url, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of silently parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            code_containers = soup.find_all('div', {'class': 'code-container'})

            # Clean every snippet found on this page
            code_snippets = [
                contributed_comment_pattern.sub('', container.text.strip())
                for container in code_containers
            ]

            # Open the output file once per page, and only if there is something
            # to write, so no empty file is created for pages without snippets.
            if code_snippets:
                with open(output_path, 'a', encoding='utf-8') as file:
                    for code_snippet in code_snippets:
                        file.write(code_snippet)
                        file.write('\n\n')  # Blank line between snippets

            all_code_snippets.extend(code_snippets)

    return all_code_snippets

In [None]:
#urls = ['https://www.geeksforgeeks.org/maximum-of-two-numbers-in-python/','https://www.geeksforgeeks.org/python-program-to-find-sum-of-array/']
urls = ['https://www.geeksforgeeks.org/maximum-of-two-numbers-in-python/']
# Batch size for scraping
batch_size = 4

# Call the scrape_gfg_code_batch function with the URL list and batch size
code_snippets = scrape_gfg_code_batch(urls, batch_size)

# scrape_gfg_code_batch returns a flat list of snippet strings (not a dict),
# so iterate over it directly. The loop variable is deliberately NOT named
# `code_snippets`: rebinding that name would clobber the list that the
# training-data cell joins afterwards.
for code_snippet in code_snippets:
    print(code_snippet)

In [None]:
# Training data preparation
# Fail fast when the scrape produced nothing: joining an empty list yields '',
# which would only surface later as an obscure tokenizer/model error.
if not code_snippets:
    raise ValueError("No code snippets were scraped; cannot build training data.")

# Concatenate all snippets into one training string, one snippet per line break.
training_data = '\n'.join(code_snippets)

In [None]:
# Initialize GPT-2 tokenizer and model.
# No `max_length` override is passed: the 'gpt2' checkpoint has a fixed context
# window (GPT2Config.n_positions, 1024 for this checkpoint), so asking for 2000
# cannot enlarge it. On the model, the kwarg would only set a generation default
# on the config, and the generation cell passes max_length explicitly anyway.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [None]:
# Tokenize the training data.
# Truncate to the model's context window (model.config.n_positions): a longer
# sequence would index past the position-embedding table in the forward pass.
# NOTE: any text beyond the window is dropped; split the corpus into several
# windows if all of it must be used for training.
input_ids = tokenizer.encode(
    training_data,
    return_tensors='pt',
    truncation=True,
    max_length=model.config.n_positions,
)


In [None]:
# Fine-tune the GPT-2 model
NUM_EPOCHS = 75        # Number of passes over the single training sequence
LEARNING_RATE = 1e-5   # AdamW step size
SEED = 42

# Training mode enables dropout, so seed the RNG to make a
# Restart & Run All reproduce the same loss curve.
torch.manual_seed(SEED)

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    # Causal-LM objective: the inputs double as labels (the model shifts them internally).
    outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


In [None]:
# Save the fine-tuned model, and the tokenizer alongside it so the
# 'fine_tuned_gpt2_model' directory is a self-contained checkpoint.
model.save_pretrained('fine_tuned_gpt2_model')
# Trailing ';' hides the tuple of written file paths from the cell output.
tokenizer.save_pretrained('fine_tuned_gpt2_model');

In [None]:
# Reload the fine-tuned weights from the local 'fine_tuned_gpt2_model' directory.
# output_hidden_states=True is forwarded to from_pretrained as a config override.
# NOTE(review): no visible cell reads the hidden states; confirm the flag is still needed.
model = GPT2LMHeadModel.from_pretrained(
    'fine_tuned_gpt2_model',
    output_hidden_states=True,
)

In [None]:
# Load the stock pretrained 'gpt2' tokenizer from its checkpoint name.
# Note: this does not read anything from the fine-tuned model directory;
# the tokenizer is simply re-created from the base 'gpt2' files.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Set the model to evaluation mode (disables dropout for inference).
# eval() returns the module itself; the trailing ';' keeps the notebook from
# dumping the full model architecture repr as the cell output.
model.eval();

In [None]:
# Candidate prompts for generation; prompt1 is the one tokenized in the next cell.
prompt1, prompt2, prompt3 = (
    "Write a program to find the maximum of two numbers in python?",
    "Write a program to find the sum of array in python?",
    "Write a program to find the length of a list in python?",
)


In [None]:
# Encode the selected prompt as a PyTorch tensor of token ids.
# This rebinds `input_ids`, replacing the tensor used for training.
input_ids = tokenizer.encode(
    prompt1,
    return_tensors='pt',
)

In [None]:
# Generate output.
# do_sample=True is required for `temperature` to have any effect; without it
# generate() decodes greedily and ignores the temperature setting.
# GPT-2 has no dedicated pad token, so pad_token_id is set to EOS explicitly
# instead of relying on generate()'s implicit fallback (and its warning).
output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Turn the first (and only) generated sequence back into text, dropping
# special tokens, then show it.
generated_code = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
)
print(generated_code)