In [1]:
import os
import csv
import re
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import textstat
from collections import Counter

# Load the English language model
nlp = spacy.load("en_core_web_sm")


  Moves tiles left and merges them if possible.

  Args:
      grid: A list of lists representing the 4x4 grid.

  Returns:
      A new grid with the updated state after moving left.
  

  Moves tiles right and merges them if possible.

  Args:
      grid: A list of lists representing the 4x4 grid.

  Returns:
      A new grid with the updated state after moving right.
  

  Moves tiles up and merges them if possible.

  Args:
      grid: A list of lists representing the 4x4 grid.

  Returns:
      A new grid with the updated state after moving up.
  

  Moves tiles down and merges them if possible.

  Args:
      grid: A list of lists representing the 4x4 grid.

  Returns:
      A new grid with the updated state after moving down.
  

  Transposes the grid (swaps rows and columns).

  Args:
      grid: A list of lists representing the 4x4 grid.

  Returns:
      The transposed grid.
  

  Checks if the grid has any empty cells.

  Args:
      grid: A list of lists representing the 4x4 grid.

In [None]:
# Collect the leading comment lines of every gpt.py file and save the parsed fields to a CSV that can be loaded as a DataFrame
def get_comments(filepath):
    """Return the block of leading ``#`` comment lines at the top of a file.

    Lines are read in order and stripped of surrounding whitespace. Collection
    stops at the first line that does not start with ``#`` (a blank line
    included), so only the header comment block is returned.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of the stripped header comment lines, in file order. The list
        is empty when the file is empty or its first line is not a comment.
    """
    comments = []
    # Decode explicitly as UTF-8 so the result does not depend on the platform
    # locale. Undecodable bytes are replaced instead of raising: only the
    # header matters here, and the file is decoded in buffered chunks that can
    # reach past the header into the rest of the file.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line.startswith('#'):
                break
            comments.append(line)
    return comments

def extract_info(comment):
    """Parse one header comment into its difficulty, result and tests fields.

    Args:
        comment: A comment line such as one returned by ``get_comments``.

    Returns:
        A 3-tuple of strings ``(difficulty, result, tests)`` when the comment
        starts with ``#``, whitespace, a decimal number, some result text and
        a ``digits/digits`` pair; otherwise ``None``.
    """
    # Anchored at the start of the string: "#", the decimal difficulty, the
    # lazily matched result text, then the "n/m" test counts.
    found = re.match(r"#\s+(\d+\.\d+)\s+(.+?)\s+(\d+/\d+)", comment)
    return found.groups() if found else None

def find_gpt_files(directory):
    """Recursively collect the paths of all files named exactly ``gpt.py``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of paths (the walked folder joined with the file name), in the
        order ``os.walk`` visits them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name == 'gpt.py'
    ]

def save_comments_to_csv(comments, filename):
    """Write the parsed fields of each comment to a CSV file.

    The file is (over)written with a ``Difficulty, Result, Tests`` header row
    followed by one row per comment that ``extract_info`` can parse; comments
    it cannot parse are skipped.

    Args:
        comments: Iterable of comment strings.
        filename: Path of the CSV file to create.
    """
    with open(filename, 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(["Difficulty", "Result", "Tests"])
        parsed_rows = (extract_info(entry) for entry in comments)
        sheet.writerows(row for row in parsed_rows if row)

def main():
    """Gather header comments from every gpt.py under ./kattis into a CSV.

    Walks the ``kattis`` folder of the current working directory, collects the
    leading comments of each ``gpt.py`` file and, if any were found, writes
    them to ``kattis_comments.csv``. Prints a one-line status either way.
    """
    search_root = os.path.join(os.getcwd(), 'kattis')

    collected = []
    for source_path in find_gpt_files(search_root):
        collected += get_comments(source_path)

    # Nothing to write: report it and leave any existing CSV untouched.
    if not collected:
        print("No comments found in any gpt.py files.")
        return

    output_name = 'kattis_comments.csv'
    save_comments_to_csv(collected, output_name)
    print(f"Comments saved to CSV file: {output_name}")

# Run the extraction when this code is executed directly; the guard keeps
# main() from running if the code is imported as a module instead.
if __name__ == "__main__":
    main()

In [None]:
# Load the parsed comments and lower-case the result labels so that labels
# differing only in case are counted together.
df_kattis = pd.read_csv("./kattis_comments.csv")
df_kattis['Result'] = df_kattis['Result'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# Split the "correct/total" test string into two integer columns.
df_kattis[['Correct', 'Total']] = df_kattis['Tests'].str.split('/',expand=True).astype(int)

# A result whose last word is one of these is treated as a generic error.
_ERROR_LAST_WORDS = ('error', 'exceeded', 'exception')


def _plot_outcome_counts(frame, title='Outcome Count by Difficulty Level'):
    """Draw a stacked bar chart of how often each result occurs per difficulty.

    Args:
        frame: DataFrame with ``Difficulty`` and ``Result`` columns.
        title: Title placed above the chart.

    Returns:
        A ``(counts, ax)`` pair: the difficulty-by-result count table that was
        plotted and the matplotlib Axes it was drawn on.
    """
    counts = frame.groupby(['Difficulty', 'Result']).size().unstack(fill_value=0)
    # rot=45 tilts the difficulty tick labels.
    ax = counts.plot(kind='bar', stacked=True, figsize=(10, 6), rot=45)
    ax.set_xlabel('Difficulty')
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(title='Outcome')
    plt.show()
    return counts, ax


def _collapse_error_result(result):
    """Map a result to ``'error'`` when its last word marks a failure.

    Non-string values (e.g. NaN from an empty CSV field) and all other labels
    are returned unchanged.
    """
    if not isinstance(result, str):
        return result
    words = result.lower().split()
    # `words` is empty for a blank or whitespace-only label; leave it as is
    # instead of indexing into an empty list.
    if words and words[-1] in _ERROR_LAST_WORDS:
        return 'error'
    return result


# First chart: every distinct result label at each difficulty.
grouped, ax = _plot_outcome_counts(df_kattis)

# Second chart: the same counts after merging all error-like results into a
# single 'error' label.
df_kattis['Result'] = df_kattis['Result'].apply(_collapse_error_result)
grouped, ax = _plot_outcome_counts(df_kattis)

# TODO: decide which features to use as inputs to a prediction model (feature
# engineering). Candidates to derive from the problem statements: readability,
# word frequencies, etc. Think of meaningful features, and perform unsupervised
# learning to understand them.

In [None]:
# Inspect the results frame, including the derived Correct/Total columns.
df_kattis

In [None]:
# Functions to get all problems
def find_prompt_files(directory):
    """Recursively collect the paths of files whose name contains ``gpt_prompt``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of paths (the walked folder joined with the file name), in the
        order ``os.walk`` visits them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if 'gpt_prompt' in name  # substring match, not an exact file name
    ]

def get_problem(filepath):
    """Read a whole file and return its stripped text wrapped in a list.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A one-element list containing the file content with leading and
        trailing whitespace removed.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters are read the same way
    # on every platform instead of depending on the locale default.
    with open(filepath, 'r', encoding='utf-8') as f:
        problem = f.read().strip()
    return [problem]

kattis_dir = os.path.join(os.getcwd(), 'kattis')
prompt_files = find_prompt_files(kattis_dir)

# get_problem() returns a one-element list per file, so extending flattens the
# results into one statement string per prompt file, in discovery order.
all_problems = []
for filepath in prompt_files:
    all_problems.extend(get_problem(filepath))

# Preview the first statement; show a message instead of raising IndexError
# when no prompt files were found.
all_problems[0] if all_problems else f"No gpt_prompt files found under {kattis_dir}"


In [None]:
# TODO: look into another filter for selecting the words that are important
# Only the first KEYWORD_SAMPLE_SIZE problems are processed here.
# NOTE(review): presumably a prototyping limit — confirm before modelling.
KEYWORD_SAMPLE_SIZE = 20

# Build the vocabulary of programming terms. Each line of the file holds
# comma-separated terms; they are lower-cased and stripped before being stored.
terms = set()
with open("programming_terms.txt", "r", encoding="utf-8") as file:
    for line in file:
        for term in line.lower().split(','):
            term = term.strip()
            if term:  # skip blanks left by trailing commas or empty lines
                terms.add(term)

# One Counter per problem: how often each vocabulary term occurs in it.
# NOTE(review): matching is done token by token, so multi-word terms in the
# file will presumably never match — confirm against the term list.
data = []
for problem in all_problems[:KEYWORD_SAMPLE_SIZE]:
    doc = nlp(problem)
    keywords = []
    for token in doc:
        word = token.text.lower()
        # Keep vocabulary terms only, excluding stop words and tokens tagged
        # as symbol, punctuation, whitespace or "other".
        if (word not in STOP_WORDS
                and word in terms
                and token.pos_ not in ['SYM', 'PUNCT', 'SPACE', 'X']):
            keywords.append(word)
    data.append(Counter(keywords))

# Rows are problems, columns are terms; a term absent from a problem becomes 0.
df = pd.DataFrame(data)
df = df.fillna(0)
df

In [None]:
# Compute every textstat readability score for each problem statement and store
# them in output.csv, so the most useful ones can be picked as features later.
def _readability_row(text):
    """Return the textstat scores for ``text`` in the CSV header's column order."""
    return (
        textstat.automated_readability_index(text),
        textstat.dale_chall_readability_score(text),
        textstat.dale_chall_readability_score_v2(text),
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.smog_index(text),
        textstat.coleman_liau_index(text),
        textstat.linsear_write_formula(text),
        textstat.gunning_fog(text),
        textstat.text_standard(text, float_output=False),
        textstat.lexicon_count(text, removepunct=True),
        textstat.difficult_words(text),
    )


with open("output.csv", mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ARI", "DCR", "DCR_V2", "FRE", "FKG", "SMOG", "CLI", "LINSEAR", "GF", "Textstd", "LexCount", "Difficult Words"])

    # One CSV row per problem, in the same order as all_problems.
    for problem in all_problems:
        writer.writerow(_readability_row(problem))
        



In [None]:
# Planned next steps:
#   - adopt one-hot encoding for the y values: wrong, accepted, error
#   - cross-validation techniques
#   - grid search: model training

features = pd.read_csv("./output.csv")

# Assigning a Series as a column aligns on the row index, so row i of the
# readability table receives the difficulty of row i of df_kattis. The two
# tables are built from separate file scans, so make a size mismatch visible
# instead of silently producing NaN or shifted labels.
# NOTE(review): equal lengths still do not prove that both tables list the
# problems in the same order — confirm, or join on a shared problem key.
if len(features) != len(df_kattis):
    print(f"WARNING: {len(features)} feature rows but {len(df_kattis)} result rows; "
          "'Difficulty' is matched by row position and may be misaligned or NaN.")

features['Difficulty'] = df_kattis['Difficulty']
features