In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
import zipfile

# Extract the competition archive into the working directory
with zipfile.ZipFile('/kaggle/input/konwinski-prize/data.a_zip', mode='r') as archive:
    archive.extractall(path='/kaggle/working/')

# Read the extracted parquet file and preview its first five rows
train_data = pd.read_parquet('/kaggle/working/data/data.parquet')
print(train_data.head(n=5))

                instance_id                repo  \
0  pylint-dev__astroid-2496  pylint-dev/astroid   
1  pylint-dev__astroid-2468  pylint-dev/astroid   
2    astropy__astropy-17048     astropy/astropy   
3    astropy__astropy-16898     astropy/astropy   
4    astropy__astropy-16830     astropy/astropy   

                                   problem_statement  \
0  TypeError: unsupported format string passed to...   
1  Pylint checks against incorrect type with prop...   
2  QTable cannot take `dimensionless_unscaled` wh...   
3  BUG: tables do not deal well with zero-sized s...   
4  KeyError: 'version_1_3_or_later' when parsing ...   

                                               patch  \
0  diff --git a/ChangeLog b/ChangeLog\nindex 4560...   
1  diff --git a/ChangeLog b/ChangeLog\nindex fdbb...   
2  diff --git a/astropy/table/table.py b/astropy/...   
3  diff --git a/astropy/io/registry/core.py b/ast...   
4  diff --git a/astropy/io/votable/tree.py b/astr...   

                   

In [2]:
# Data preprocessing
# Report the number of missing values in each column
print(train_data.isnull().sum())

# Fill missing patches with an empty string. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the column selection: that
# chained in-place form operates on a temporary object, emits a FutureWarning
# on pandas 2.x, and does not update train_data under Copy-on-Write.
train_data['patch'] = train_data['patch'].fillna('')

# Replace repository names with their integer category codes
train_data['repo'] = train_data['repo'].astype('category').cat.codes

instance_id          0
repo                 0
problem_statement    0
patch                0
test_patch           0
pull_number          0
base_commit          0
PASS_TO_PASS         0
FAIL_TO_PASS         0
issue_numbers        0
dtype: int64


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features and target. Only the `problem_statement` text is vectorized below;
# `repo` is selected into X but is not fed to the model.
X = train_data[['repo', 'problem_statement']]  # Features
y = train_data['patch']  # Target variable

# Split the raw text BEFORE fitting the vectorizer, so the TF-IDF vocabulary
# and IDF weights are learned from the training rows only and the validation
# score is not influenced by validation text.
text_train, text_val, y_train, y_val = train_test_split(
    X['problem_statement'], y, test_size=0.2, random_state=42
)

# Convert text data to numerical features using TF-IDF (fit on train only)
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_val = tfidf.transform(text_val)

# Full-dataset matrix, kept under its original name for any later cell that
# still refers to it. It uses the train-fitted vocabulary.
X_tfidf = tfidf.transform(X['problem_statement'])

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model.
# NOTE(review): the target is the full patch text, so every distinct patch is
# its own class and the classifier can only return patches seen in training.
# Exact-match accuracy is therefore expected to be ~0 unless patches repeat
# across rows -- confirm whether classification is the intended formulation.
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

Validation Accuracy: 0.0


In [4]:
# Function to predict patches for new issues
def predict_patch(problem_statements):
    """Predict a patch for each problem statement.

    Vectorizes the input with the module-level ``tfidf`` object and passes the
    result to the module-level ``model``; both must already be defined (and
    fitted) before this function is called.

    Parameters
    ----------
    problem_statements : str or iterable of str
        One issue text, or an iterable of issue texts. A single string is
        treated as one document rather than being iterated character by
        character.

    Returns
    -------
    array-like
        The value returned by ``model.predict``, one entry per input text.
        For empty input an empty object-dtype NumPy array is returned without
        calling the vectorizer or the model.
    """
    # Normalize the input to a concrete list so that a bare string becomes a
    # single document and generators / Series can be checked for emptiness.
    if isinstance(problem_statements, str):
        problem_statements = [problem_statements]
    else:
        problem_statements = list(problem_statements)

    # Nothing to predict: return early instead of handing the model a
    # zero-row matrix.
    if not problem_statements:
        return np.array([], dtype=object)

    problem_tfidf = tfidf.transform(problem_statements)
    return model.predict(problem_tfidf)

# Demo: run the predictor on two placeholder issue texts and show the result
new_issues = [
    "Example problem statement 1",
    "Example problem statement 2",
]
predicted_patches = predict_patch(problem_statements=new_issues)
print(predicted_patches)

['diff --git a/astropy/io/registry/core.py b/astropy/io/registry/core.py\nindex 9147101f8b7..b7325c4beb1 100644\n--- a/astropy/io/registry/core.py\n+++ b/astropy/io/registry/core.py\n@@ -225,7 +225,7 @@ def read(self, cls, *args, format=None, cache=False, **kwargs):\n                 # registered.  This returns the parent class, so try coercing\n                 # to desired subclass.\n                 try:\n-                    data = cls(data)\n+                    data = cls(data, copy=False)\n                 except Exception:\n                     raise TypeError(\n                         f"could not convert reader output to {cls.__name__} class."\ndiff --git a/astropy/table/pprint.py b/astropy/table/pprint.py\nindex a910753a2e0..56485717769 100644\n--- a/astropy/table/pprint.py\n+++ b/astropy/table/pprint.py\n@@ -449,7 +449,10 @@ def _pformat_col_iter(\n             i_centers.append(n_header)\n             n_header += 1\n             if dtype is not None:\n-                col_d

In [5]:
# Pair each issue text with its predicted patch and write the table to CSV
# (without the index column)
output_df = pd.DataFrame(
    data={
        'problem_statement': new_issues,
        'predicted_patch': predicted_patches,
    }
)
output_df.to_csv(path_or_buf='/kaggle/working/predictions.csv', index=False)