# In [None]:
# === 1. Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from datasets import load_dataset

# === 2. Load Dataset ===
# The clauses are read from "data/CUADv1" (presumably a local copy of the
# CUAD dataset -- confirm the path); only its 'train' split is used.
dataset = load_dataset("data/CUADv1")
data = dataset['train']

# Flatten the records into two parallel lists: one entry per answer text,
# labelled with the question that answer belongs to. A record may carry
# several answers; a record with no answers contributes no rows.
clause_texts = []
clause_types = []

for record in data:
    answers = record['answers']['text']
    clause_texts.extend(answers)
    clause_types.extend([record['question']] * len(answers))

df = pd.DataFrame({
    'clause_text': clause_texts,
    'clause_type': clause_types
})

print(df.head())

# === 3. Basic EDA ===
print(df.head())

# Count each clause type once; the result is reused to order the plot below.
class_counts = df['clause_type'].value_counts()
print(class_counts)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df.isnull().sum())

# Optional: Visualize class distribution
# The labels are full question strings, so they go on the y-axis (horizontal
# bars) where they stay readable; bars are ordered most frequent first.
sns.countplot(data=df, y='clause_type', order=class_counts.index)
plt.title("Distribution of Clause Types")
plt.tight_layout()
plt.show()

# === 4. Preprocessing Text ===
# Rows missing either the clause text or its label cannot be used.
df.dropna(subset=['clause_text', 'clause_type'], inplace=True)

X = df['clause_text']
y = df['clause_type']

# Encode target labels as integers; `le` maps them back to the original
# strings via le.inverse_transform().
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# === 5. Split Dataset ===
# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the whole corpus
# would let test-set vocabulary and IDF statistics leak into the training
# features and inflate evaluation scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Vectorize clause text with TF-IDF: learn the vocabulary and IDF weights
# from the training rows only, then apply that fitted transform to the test
# rows.
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full-corpus matrix under its original name, for any later code that still
# references X_tfidf. It is produced by the train-fitted vectorizer, so it
# carries no test-set statistics.
X_tfidf = tfidf.transform(X)

# Save preprocessed data if needed.
# The payload is a single tuple in the order
# (X_train, y_train, X_test, y_test, le); note this differs from the order
# train_test_split returns, so loaders must unpack it in this exact order.
# NOTE(review): the fitted TF-IDF vectorizer is not part of this payload, so
# new text cannot be vectorized from this file alone -- confirm whether it
# should be persisted separately.
from joblib import dump

processed_bundle = (X_train, y_train, X_test, y_test, le)
dump(processed_bundle, 'processed_data.joblib')
