In [2]:
import pandas as pd

df = pd.read_csv("loan.csv", low_memory=False)

cols = [
    "loan_amnt", "term", "int_rate", "annual_inc", "grade",
    "dti", "emp_length", "desc", "loan_status"
]
df = df[cols].dropna()

df = df[df["loan_status"].isin(["Fully Paid", "Charged Off", "Default"])]

df = df[~df["desc"].str.strip().str.lower().isin(["", "no description provided."])]

df["term"] = df["term"].str.extract(r"(\d+)").astype(int)
df["int_rate"] = df["int_rate"].str.rstrip("%").astype(float)

# Map loan status to risk
df["risk"] = df["loan_status"].apply(lambda x: "Low Risk" if x == "Fully Paid" else "High Risk")

# Balance dataset: 500 Low Risk + 500 High Risk
low_risk = df[df["risk"] == "Low Risk"].sample(n=500, random_state=42)
high_risk = df[df["risk"] == "High Risk"].sample(n=500, random_state=42)
balanced_df = pd.concat([low_risk, high_risk]).sample(frac=1, random_state=42).reset_index(drop=True)

def generate_reason(row):
    reasons = []

    if row["grade"] in ["A", "B"]:
        reasons.append("good credit rating")
    elif row["grade"] in ["F", "G"]:
        reasons.append("poor credit rating")
    else:
        reasons.append("moderate credit rating")

    if row["dti"] > 30:
        reasons.append("high debt-to-income ratio")
    else:
        reasons.append("acceptable DTI")

    if isinstance(row["emp_length"], str) and "10" in row["emp_length"]:
        reasons.append("long employment history")

    if row["annual_inc"] >= 100000:
        reasons.append("high income")
    elif row["annual_inc"] < 40000:
        reasons.append("low income")

    return f"{row['risk']}. Reason: " + ", ".join(reasons).capitalize() + "."

# Function to generate prompt and response
def make_prompt_response(row):
    prompt = (
        f"Loan Application:\n"
        f"Amount: ${int(row['loan_amnt'])}\n"
        f"Term: {row['term']} months\n"
        f"Interest Rate: {row['int_rate']}%\n"
        f"Annual Income: ${int(row['annual_inc'])}\n"
        f"Credit Rating: {row['grade']}\n"
        f"DTI: {row['dti']}\n"
        f"Employment Length: {row['emp_length']}\n"
        f"Description: {row['desc'].strip()}\n\n"
        f"Is this a high-risk or low-risk borrower? Explain why."
    )
    response = generate_reason(row)
    return pd.Series({"prompt": prompt, "response": response})

formatted = balanced_df.apply(make_prompt_response, axis=1)

formatted.to_json("loan_instruct_1000.jsonl", orient="records", lines=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["term"] = df["term"].str.extract(r"(\d+)").astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["int_rate"] = df["int_rate"].str.rstrip("%").astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["risk"] = df["loan_status"].apply(lambda x: "Low Risk" if x == "Fully Paid" else "