In [43]:
# For T5 Models

# %pip install sentencepiece

# from transformers import RobertaTokenizer, T5Tokenizer, T5ForConditionalGeneration

# # Load T5
# tokenizer = T5Tokenizer.from_pretrained('t5-base')
# model = T5ForConditionalGeneration.from_pretrained('t5-base')

# # Load Flan-T5
# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# # Load CodeT5
# tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
# model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')


In [44]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [45]:
import json


In [46]:
# Load JSON data
with open('sample_problem.json', 'r') as file:
    problem_data = json.load(file)


In [47]:
# Sample user submission
user_submission = '''
    Create a count for each character in the string.
    For each character:
    Convert it to lowercase.
    Count how many times it appears.
    Set total_length to 0 and odd_count to 0.
    For each character count:
    Add the even part of the count to total_length.
    If the count is odd, add 1 to odd_count.
    If odd_count is greater than 0, add 1 to total_length.
    Return total_length.
'''


In [48]:
# Define the feedback template
feedback_template = '''
    {
        "title": "Analysis of User Submission",
        "analysis": "Detailed analysis of the user's approach.",
        "summary": "Concise summary of the user's approach and correctness.",
        "score": "An integer score: 1 (incorrect), 2 (partially correct), 3 (correct)."
    }
'''

# Define the prompt
prompt = f'''
    You are a professor who specializes in solving code problems.
    The user will submit, in words, their approach to solving the problem.
    Analyze the user's submission and provide feedback.

    Output your feedback in JSON format with the following structure:
    {feedback_template}

    Here are the problem details:

    Category: {problem_data['category']}
    Subcategory: {problem_data['subcategory']}
    Difficulty: {problem_data['difficulty']}
    Title: {problem_data['title']}
    Description: {problem_data['description']}
    Constraints: 
    {'\n'.join([f'{constraint}' for constraint in problem_data['constraints']])}

    Examples:
    {'\n'.join([f'Input: {example["input"]}, Output: {example["output"]}, Explanation: {example["explanation"]}' for example in problem_data['examples']])}

    Here is the user's submission:
    {user_submission}

    Ensure your response is coherent and follows the JSON structure provided.

    For example:
    Output = {
        "title": "User Submission Feedback",
        "analysis": "The user's approach correctly identifies the need to count character occurrences. However, the submission lacks specific details on how to handle both uppercase and lowercase characters separately, which is crucial for the problem. Additionally, it should mention how to use the counts to form the palindrome.",
        "summary": "The user has a good foundational understanding of the problem but misses some critical details in handling case sensitivity and palindrome construction.",
        "score": 2
    }
'''

# prompt = "Can you summarize the capabilities of language models, including tasks such as summarization, question answering, and text generation?"

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the prompt and set attention mask
encoded_input = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)


In [49]:
# Generate feedback with attention mask
output_ids = model.generate(
    input_ids=encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    max_length=250,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.5,
    top_k=50,
    top_p=0.95,
    repetition_penalty = 5.0
)

# Decode the output
feedback_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(feedback_response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



    You are a professor who specializes in solving code problems.
    The user will submit, in words, their approach to solving the problem.
    Analyze the user's submission and provide feedback.

    Output your feedback in JSON format with the following structure:
    
    {
        "title": "Analysis of User Submission",
        "analysis": "Detailed analysis of the user's approach.",
        "summary": "Concise summary of the user's approach and correctness.",
        "score": "An integer score: 1 (incorrect), 2 (partially correct), 3 (correct)."
    }


    Here are the problem details:

    Category: manipulations
    Subcategory: strings
    Difficulty: easy
    Title: Longest Palindrome
    Description: Given a string s which consists of lowercase or uppercase letters, return the length of the longest palindrome that can be built with those letters. Letters are case sensitive, for example, 'Aa' is not considered a palindrome.
    Constraints: 
    - 1 <= s.length <= 2000
- s 