In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
train_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/train.csv")


In [3]:
train_df_augmented = pd.concat([train_df] * 10, ignore_index=True)


In [4]:
# Step 3: Train a model
X_text = train_df_augmented["original_text"]
y = train_df_augmented["rewritten_text"]


In [5]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust max_features as needed



In [6]:
# Transform text data into TF-IDF features
X_text_tfidf = tfidf_vectorizer.fit_transform(X_text)

In [7]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_text_tfidf, y, test_size=0.2, random_state=42)


In [8]:
# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [9]:
# Evaluate the model
val_predictions = model.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 1.0


In [10]:
# Step 4: Read the test data
test_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/test.csv")



In [11]:
# Step 5: Transform test data using TF-IDF vectorizer
test_text_tfidf = tfidf_vectorizer.transform(test_df["original_text"])



In [12]:
# Step 6: Make predictions on the test data
test_predictions = model.predict(test_text_tfidf)


In [13]:
# Step 7: Create submission file
submission_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/sample_submission.csv")
submission_df["rewritten_text"] = test_predictions


In [14]:
# Save submission file
submission_df.to_csv("submission.csv", index=False)


In [15]:
submission_df.head()

Unnamed: 0,id,rewrite_prompt,rewritten_text
0,9559194,Improve that text.,Here is your shanty: (Verse 1) The text is rew...
