# 03 â€“ Data Preprocessing and Preparation

**Run order:** Third (after 00_download_cuad and 02_eda). Requires CUAD data.

This notebook cleans the CUAD clause data, encodes labels, splits train/test, and prepares TF-IDF and sequence features.
**Output:** Clean dataset and model-ready arrays under `data/processed/`. Next: run **04_legal_clause_classification.ipynb** to train models.


## 1. Imports and load clause data


In [1]:
import pandas as pd
import numpy as np
import json
import os
import pickle
from pathlib import Path

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

# Paths: run from project root or Notebooks/
project_root = Path.cwd() if (Path.cwd() / "scripts").exists() else Path.cwd().parent
import sys
sys.path.insert(0, str(project_root))
from scripts.download_cuad import ensure_cuad_data
from scripts.load_cuad_clauses import get_clauses_df, extract_clauses_from_cuadv1

cuad_path, data_path = ensure_cuad_data(project_root)
clauses_df = get_clauses_df(data_path)
print(f"Loaded {len(clauses_df)} clauses. Using data at: {data_path.absolute()}")


CUAD dataset directory already exists.

Dataset found at: /Users/khajamoinuddinmohammed/Documents/MSDS/FALL 2025/BUAN 5312 ADVANCED ML/final project/cuad/data

Contents of data directory:
   CUADv1.json
   test.json
   train_separate_questions.json
EXTRACTING CLAUSES FROM CUADv1.json
Processing 510 contracts...

[OK] Extracted 13823 clauses from 510 contracts
   Unique categories: 41
   Unique contracts: 510
Loaded 13823 clauses. Using data at: /Users/khajamoinuddinmohammed/Documents/MSDS/FALL 2025/BUAN 5312 ADVANCED ML/final project/cuad/data


## 2. Data Preprocessing and Preparation

Prepare data for **multi-class classification** (softmax output) for both models:

**Model 1: Feedforward Neural Network (MLP) with TF-IDF**
- Convert clauses to TF-IDF vectors
- 1-2 hidden layers with ReLU
- Softmax output for multi-class classification
- Sparse Categorical Crossentropy loss, Adam optimizer

**Model 2: LSTM-Based Text Classifier**
- Tokenize into sequences of word IDs
- Random embeddings (can use pretrained later)
- LSTM/BiLSTM to capture word order
- Softmax output for multi-class classification
- Sparse Categorical Crossentropy loss, Adam optimizer


### 2.1 Initial Data Cleaning


In [2]:
print("DATA CLEANING AND PREPARATION")

df_clean = clauses_df.copy()

print(f"\nInitial dataset size: {len(df_clean)} clauses")

initial_size = len(df_clean)
df_clean = df_clean[df_clean['clause_text'].str.strip() != '']
df_clean = df_clean[df_clean['word_count'] >= 2]
print(f"After removing empty/very short clauses: {len(df_clean)} clauses (removed {initial_size - len(df_clean)})")

df_clean = df_clean.drop_duplicates(subset=['clause_text'], keep='first')
print(f"After removing duplicate clause texts: {len(df_clean)} clauses")

min_samples_per_category = 5
category_counts_clean = df_clean['category'].value_counts()
rare_categories = category_counts_clean[category_counts_clean < min_samples_per_category].index

if len(rare_categories) > 0:
    print(f"\n[WARNING]  Found {len(rare_categories)} categories with < {min_samples_per_category} samples:")
    for cat in rare_categories:
        print(f"   - {cat}: {category_counts_clean[cat]} samples")
    print(f"   -> Keeping rare categories (will use class weights in models)")
else:
    print(f"\n[OK] All categories have >= {min_samples_per_category} samples")

print(f"\nFinal cleaned dataset size: {len(df_clean)} clauses")
print(f"Unique categories: {df_clean['category'].nunique()}")


print("\nCATEGORY DISTRIBUTION (After Cleaning):")
print(df_clean['category'].value_counts().head(15))


DATA CLEANING AND PREPARATION

Initial dataset size: 13823 clauses
After removing empty/very short clauses: 12883 clauses (removed 940)
After removing duplicate clause texts: 11047 clauses

[OK] All categories have >= 5 samples

Final cleaned dataset size: 11047 clauses
Unique categories: 41

CATEGORY DISTRIBUTION (After Cleaning):
category
Parties                      1577
License Grant                 658
Audit Rights                  633
Anti-Assignment               614
Insurance                     550
Cap On Liability              545
Governing Law                 453
Agreement Date                434
Expiration Date               415
Revenue/Profit Sharing        414
Post-Termination Services     410
Exclusivity                   402
Minimum Commitment            398
Rofr/Rofo/Rofn                353
Ip Ownership Assignment       310
Name: count, dtype: int64


### 2.2 Text Preprocessing Functions


In [3]:
def clean_text(text, remove_numbers=False, remove_punctuation=False):
    """Clean text for preprocessing."""
    if pd.isna(text) or text == '':
        return ''

    text = str(text).strip()
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)

    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text)

    return text.strip()

def preprocess_text_for_tfidf(text):
    """Preprocess text for TF-IDF vectorization (Feedforward model)."""
    return clean_text(text, remove_numbers=False, remove_punctuation=False)

def preprocess_text_for_lstm(text):
    """Preprocess text for LSTM model."""
    return clean_text(text, remove_numbers=False, remove_punctuation=False)


print("APPLYING TEXT PREPROCESSING")


df_clean['text_processed'] = df_clean['clause_text'].apply(preprocess_text_for_tfidf)

print(f"\n[OK] Preprocessing applied to {len(df_clean)} clauses")
print(f"\nSample original vs processed:")
for i in range(min(3, len(df_clean))):
    orig = df_clean.iloc[i]['clause_text'][:100]
    proc = df_clean.iloc[i]['text_processed'][:100]
    print(f"\n  Original:  {orig}...")
    print(f"  Processed: {proc}...")


APPLYING TEXT PREPROCESSING

[OK] Preprocessing applied to 11047 clauses

Sample original vs processed:

  Original:  DISTRIBUTOR AGREEMENT...
  Processed: distributor agreement...

  Original:  Electric City Corp....
  Processed: electric city corp....

  Original:  Electric City of Illinois L.L.C....
  Processed: electric city of illinois l.l.c....


### 2.3 Label Encoding and Category Selection


In [4]:
print("LABEL ENCODING AND CATEGORY PREPARATION")
print("\nStrategy: Select 8-10 common categories, group remaining as 'Other'")

top_n_categories = 9
category_counts_sorted = df_clean['category'].value_counts()
top_categories = category_counts_sorted.head(top_n_categories)

print(f"\nSelected top {top_n_categories} common categories:")
for i, (cat, count) in enumerate(top_categories.items(), 1):
    pct = (count / len(df_clean)) * 100
    print(f"  {i:2d}. {cat:40s} : {count:5d} clauses ({pct:5.2f}%)")

remaining_categories = category_counts_sorted.iloc[top_n_categories:]
remaining_count = remaining_categories.sum()
remaining_pct = (remaining_count / len(df_clean)) * 100

print(f"\n  Remaining categories: {len(remaining_categories)} categories")
print(f"  Total clauses in remaining: {remaining_count} ({remaining_pct:.2f}%)")
print(f"  -> Will be grouped as 'Other' category")

df_final = df_clean.copy()

df_final['category_grouped'] = df_final['category'].apply(
    lambda x: x if x in top_categories.index else 'Other'
)

print(f"\n[OK] Category grouping completed:")
print(f"  Total categories after grouping: {df_final['category_grouped'].nunique()}")
print(f"  Category distribution:")
category_grouped_counts = df_final['category_grouped'].value_counts()
for cat, count in category_grouped_counts.items():
    pct = (count / len(df_final)) * 100
    print(f"    {cat:40s} : {count:5d} clauses ({pct:5.2f}%)")

label_encoder = LabelEncoder()
df_final['label_encoded'] = label_encoder.fit_transform(df_final['category_grouped'])

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

print(f"\n[OK] Label encoding completed:")
print(f"  {len(label_mapping)} categories encoded to integers 0-{len(label_mapping)-1}")
print(f"\n  Label mapping:")
for label, cat in sorted(reverse_label_mapping.items()):
    count = (df_final['category_grouped'] == cat).sum()
    print(f"    {label:2d} -> {cat:40s} ({count:5d} samples)")

num_classes = len(label_mapping)
print(f"\n[OK] Number of classes for models: {num_classes} (including 'Other')")


LABEL ENCODING AND CATEGORY PREPARATION

Strategy: Select 8-10 common categories, group remaining as 'Other'

Selected top 9 common categories:
   1. Parties                                  :  1577 clauses (14.28%)
   2. License Grant                            :   658 clauses ( 5.96%)
   3. Audit Rights                             :   633 clauses ( 5.73%)
   4. Anti-Assignment                          :   614 clauses ( 5.56%)
   5. Insurance                                :   550 clauses ( 4.98%)
   6. Cap On Liability                         :   545 clauses ( 4.93%)
   7. Governing Law                            :   453 clauses ( 4.10%)
   8. Agreement Date                           :   434 clauses ( 3.93%)
   9. Expiration Date                          :   415 clauses ( 3.76%)

  Remaining categories: 32 categories
  Total clauses in remaining: 5168 (46.78%)
  -> Will be grouped as 'Other' category

[OK] Category grouping completed:
  Total categories after grouping: 10
  Category 

### 2.4 Train/Test Split


In [5]:
print("TRAIN/TEST DATA PREPARATION")
print("\nStrategy:")
print("  - Training data: CUADv1.json (all data, validation split will be used during training)")
print("  - Test data: test.json (separate holdout set for final unbiased evaluation)")

print(f"\nTraining data from CUADv1.json:")
print(f"  Total clauses: {len(df_final)}")

X_train = df_final['text_processed'].values
y_train = df_final['label_encoded'].values

print(f"  X_train: {len(X_train)} clauses")
print(f"  y_train: {len(y_train)} labels")

print(f"\nExtracting test data from test.json...")
test_file = data_path / "test.json"

if test_file.exists():
    test_clauses_df = extract_clauses_from_cuadv1(test_file)

    if not test_clauses_df.empty:
        print(f"  Raw test clauses extracted: {len(test_clauses_df)}")

        test_clauses_df = test_clauses_df[test_clauses_df['clause_text'].str.strip() != '']
        test_clauses_df = test_clauses_df[test_clauses_df['word_count'] >= 2]
        test_clauses_df = test_clauses_df.drop_duplicates(subset=['clause_text'], keep='first')

        test_clauses_df['text_processed'] = test_clauses_df['clause_text'].apply(preprocess_text_for_tfidf)

        test_clauses_df['category_grouped'] = test_clauses_df['category'].apply(
            lambda x: x if x in top_categories.index else 'Other'
        )

        test_clauses_df = test_clauses_df[test_clauses_df['category_grouped'].isin(label_encoder.classes_)]

        test_clauses_df['label_encoded'] = label_encoder.transform(test_clauses_df['category_grouped'])

        X_test = test_clauses_df['text_processed'].values
        y_test = test_clauses_df['label_encoded'].values

        print(f"  X_test: {len(X_test)} clauses (after filtering and grouping)")
        print(f"  y_test: {len(y_test)} labels")

        print(f"\nTest set category distribution:")
        test_dist = pd.Series(y_test).value_counts().sort_index()
        for label, count in test_dist.items():
            cat_name = reverse_label_mapping[label]
            pct = (count / len(y_test)) * 100 if len(y_test) > 0 else 0
            print(f"  {label:2d} ({cat_name[:35]:35s}): {count:4d} samples ({pct:5.2f}%)")

        print(f"\n[OK] Using test.json as final test set")

    else:
        print("  WARNING: No clauses extracted from test.json")
        print("  Falling back to train/test split from CUADv1.json")
        X_train, X_test, y_train, y_test = train_test_split(
            X_train, y_train,
            test_size=0.2,
            random_state=42,
            stratify=y_train,
            shuffle=True
        )
        print(f"  Split: {len(X_train)} train, {len(X_test)} test")
else:
    print(f"  WARNING: test.json not found at {test_file}")
    print("  Falling back to train/test split from CUADv1.json")
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train,
        shuffle=True
    )
    print(f"  Split: {len(X_train)} train, {len(X_test)} test")

print(f"\nFinal data split:")
print(f"  Training set: {len(X_train)} clauses")
print(f"  Test set: {len(X_test)} clauses")
print(f"  Training percentage: {len(X_train)/(len(X_train)+len(X_test))*100:.1f}%" if (len(X_train)+len(X_test)) > 0 else "")
print(f"  Test percentage: {len(X_test)/(len(X_train)+len(X_test))*100:.1f}%" if (len(X_train)+len(X_test)) > 0 else "")

from sklearn.utils.class_weight import compute_class_weight

class_weights_dict = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights_dict))

print(f"\nClass weights calculated (for imbalanced handling):")
for label, weight in list(class_weights.items())[:5]:
    cat_name = reverse_label_mapping[label]
    print(f"  {label:2d} ({cat_name[:35]:35s}): {weight:.3f}")
print(f"  ... (showing first 5, {len(class_weights)} total)")

print(f"\nLabels prepared for multi-class classification:")
print(f"  y_train: {y_train.shape} (integer labels, 0 to {num_classes-1})")
print(f"  y_test: {y_test.shape} (integer labels)")
print(f"  Loss function: Sparse Categorical Crossentropy")
print(f"  Output activation: Softmax")


TRAIN/TEST DATA PREPARATION

Strategy:
  - Training data: CUADv1.json (all data, validation split will be used during training)
  - Test data: test.json (separate holdout set for final unbiased evaluation)

Training data from CUADv1.json:
  Total clauses: 11047
  X_train: 11047 clauses
  y_train: 11047 labels

Extracting test data from test.json...
Processing 102 contracts...

[OK] Extracted 2643 clauses from 102 contracts
   Unique categories: 40
   Unique contracts: 102
  Raw test clauses extracted: 2643
  X_test: 2172 clauses (after filtering and grouping)
  y_test: 2172 labels

Test set category distribution:
   0 (Agreement Date                     ):   87 samples ( 4.01%)
   1 (Anti-Assignment                    ):  128 samples ( 5.89%)
   2 (Audit Rights                       ):  102 samples ( 4.70%)
   3 (Cap On Liability                   ):  106 samples ( 4.88%)
   4 (Expiration Date                    ):   73 samples ( 3.36%)
   5 (Governing Law                      ):   90 

### 2.5 Prepare Data for Feedforward Model (TF-IDF)


In [6]:
print("PREPARING TF-IDF FEATURES FOR FEEDFORWARD NEURAL NETWORK (MLP)")
print("\nModel Architecture Plan:")
print("  - Input: TF-IDF vectors (dense arrays)")
print("  - Hidden layers: 1-2 layers with ReLU activation")
print("  - Output: Softmax activation for multi-class classification")
print("  - Loss: Sparse Categorical Crossentropy")
print("  - Optimizer: Adam")

max_features = 5000
min_df = 2
max_df = 0.95

tfidf_vectorizer = TfidfVectorizer(
    max_features=max_features,
    min_df=min_df,
    max_df=max_df,
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    strip_accents='unicode'
)

print(f"\nFitting TF-IDF vectorizer on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nConverting sparse matrices to dense arrays...")
X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()

print(f"\n[OK] TF-IDF transformation completed:")
print(f"  Training features shape: {X_train_tfidf_dense.shape}")
print(f"  Test features shape: {X_test_tfidf_dense.shape}")
print(f"  Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"  Actual features: {X_train_tfidf_dense.shape[1]}")

sparsity_train = (1.0 - X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1])) * 100
print(f"  Sparsity: {sparsity_train:.2f}% (converted to dense)")

print(f"\n[OK] Data ready for Feedforward Neural Network (MLP) with TF-IDF!")
print(f"   Input shape: {X_train_tfidf_dense.shape[1]} features")
print(f"   Output shape: {num_classes} classes (softmax)")


PREPARING TF-IDF FEATURES FOR FEEDFORWARD NEURAL NETWORK (MLP)

Model Architecture Plan:
  - Input: TF-IDF vectors (dense arrays)
  - Hidden layers: 1-2 layers with ReLU activation
  - Output: Softmax activation for multi-class classification
  - Loss: Sparse Categorical Crossentropy
  - Optimizer: Adam

Fitting TF-IDF vectorizer on training data...

Converting sparse matrices to dense arrays...

[OK] TF-IDF transformation completed:
  Training features shape: (11047, 5000)
  Test features shape: (2172, 5000)
  Vocabulary size: 5000
  Actual features: 5000
  Sparsity: 99.56% (converted to dense)

[OK] Data ready for Feedforward Neural Network (MLP) with TF-IDF!
   Input shape: 5000 features
   Output shape: 10 classes (softmax)


use tfidf with traditional models
to get sequential info use static embedding(try)


### 2.6 Prepare Data for LSTM Model (Tokenization & Padding)


In [7]:
print("PREPARING SEQUENCE DATA FOR LSTM-BASED TEXT CLASSIFIER")
print("\nModel Architecture Plan:")
print("  - Input: Tokenized sequences (word IDs)")
print("  - Embedding: Random embeddings (or pretrained)")
print("  - LSTM/BiLSTM: Capture word order and context")
print("  - Output: Softmax activation for multi-class classification")
print("  - Loss: Sparse Categorical Crossentropy")
print("  - Optimizer: Adam")
print("\nWhy LSTM: Captures word order important for legal phrases like")
print("  'subject to', 'shall remain', 'governing law shall be...'")

word_counts = [len(text.split()) for text in X_train]
p95_length = int(np.percentile(word_counts, 95))
p99_length = int(np.percentile(word_counts, 99))
max_sequence_length = min(p95_length + 20, 300)

print(f"\nSequence length analysis:")
print(f"  95th percentile: {p95_length} words")
print(f"  99th percentile: {p99_length} words")
print(f"  Selected max_sequence_length: {max_sequence_length}")

vocab_size = 10000
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token='<OOV>',
    filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n',
    lower=True,
    split=' '
)

print(f"\nFitting tokenizer on training data...")
tokenizer.fit_on_texts(X_train)

actual_vocab_size = len(tokenizer.word_index) + 1
print(f"  Actual vocabulary size: {actual_vocab_size:,} words")
print(f"  Using top {min(vocab_size, actual_vocab_size):,} words")

print(f"\nConverting texts to sequences...")
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

print(f"Padding sequences to length {max_sequence_length}...")
X_train_padded = pad_sequences(
    X_train_seq,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post'
)

X_test_padded = pad_sequences(
    X_test_seq,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post'
)

print(f"\n[OK] Sequence preparation completed:")
print(f"  Training sequences shape: {X_train_padded.shape}")
print(f"  Test sequences shape: {X_test_padded.shape}")
print(f"  Max sequence length: {max_sequence_length}")
print(f"  Vocabulary size: {actual_vocab_size:,}")

train_truncated = sum([len(seq) > max_sequence_length for seq in X_train_seq])
test_truncated = sum([len(seq) > max_sequence_length for seq in X_test_seq])

print(f"\nTruncation statistics:")
print(f"  Train sequences truncated: {train_truncated} ({train_truncated/len(X_train_seq)*100:.2f}%)")
print(f"  Test sequences truncated: {test_truncated} ({test_truncated/len(X_test_seq)*100:.2f}%)")

print(f"\n[OK] Data ready for LSTM-Based Text Classifier!")
print(f"   Input shape: ({max_sequence_length},) - sequences of word IDs")
print(f"   Embedding input dim: {actual_vocab_size:,}")
print(f"   Output shape: {num_classes} classes (softmax)")


PREPARING SEQUENCE DATA FOR LSTM-BASED TEXT CLASSIFIER

Model Architecture Plan:
  - Input: Tokenized sequences (word IDs)
  - Embedding: Random embeddings (or pretrained)
  - LSTM/BiLSTM: Capture word order and context
  - Output: Softmax activation for multi-class classification
  - Loss: Sparse Categorical Crossentropy
  - Optimizer: Adam

Why LSTM: Captures word order important for legal phrases like
  'subject to', 'shall remain', 'governing law shall be...'

Sequence length analysis:
  95th percentile: 118 words
  99th percentile: 207 words
  Selected max_sequence_length: 138

Fitting tokenizer on training data...
  Actual vocabulary size: 9,833 words
  Using top 9,833 words

Converting texts to sequences...
Padding sequences to length 138...

[OK] Sequence preparation completed:
  Training sequences shape: (11047, 138)
  Test sequences shape: (2172, 138)
  Max sequence length: 138
  Vocabulary size: 9,833

Truncation statistics:
  Train sequences truncated: 397 (3.59%)
  Test se

### 2.7 Preprocessing Summary


In [8]:
print("PREPROCESSING SUMMARY - READY FOR MODEL CREATION")

print(f"""
[OK] Data Preprocessing Complete for Multi-Class Classification!

 Dataset Overview:
   - Original clauses: {len(clauses_df):,}
   - After cleaning: {len(df_final):,}
   - Selected categories: {num_classes} (top categories)
   - Train samples: {len(X_train):,}
   - Test samples: {len(X_test):,}



 Model 1: Feedforward Neural Network (MLP) with TF-IDF

   - X_train_tfidf_dense: {X_train_tfidf_dense.shape}
   - X_test_tfidf_dense: {X_test_tfidf_dense.shape}
   - Features: {X_train_tfidf_dense.shape[1]:,} TF-IDF features
   - Architecture: Input({X_train_tfidf_dense.shape[1]}) -> Hidden(ReLU) -> Output({num_classes}, Softmax)



 Model 2: LSTM-Based Text Classifier

   - X_train_padded: {X_train_padded.shape}
   - X_test_padded: {X_test_padded.shape}
   - Vocabulary size: {actual_vocab_size:,}
   - Max sequence length: {max_sequence_length}
   - Architecture: Input({max_sequence_length}) -> Embedding({actual_vocab_size:,}) -> LSTM -> Output({num_classes}, Softmax)



 Labels (Both Models):
   - y_train: {y_train.shape} (integer labels: 0 to {num_classes-1})
   - y_test: {y_test.shape} (integer labels)
   - Loss: Sparse Categorical Crossentropy
   - Output: Softmax ({num_classes} classes)
   - Class weights: Computed for imbalanced handling

 Ready to create models!
""")


PREPROCESSING SUMMARY - READY FOR MODEL CREATION

[OK] Data Preprocessing Complete for Multi-Class Classification!

 Dataset Overview:
   - Original clauses: 13,823
   - After cleaning: 11,047
   - Selected categories: 10 (top categories)
   - Train samples: 11,047
   - Test samples: 2,172



 Model 1: Feedforward Neural Network (MLP) with TF-IDF

   - X_train_tfidf_dense: (11047, 5000)
   - X_test_tfidf_dense: (2172, 5000)
   - Features: 5,000 TF-IDF features
   - Architecture: Input(5000) -> Hidden(ReLU) -> Output(10, Softmax)



 Model 2: LSTM-Based Text Classifier

   - X_train_padded: (11047, 138)
   - X_test_padded: (2172, 138)
   - Vocabulary size: 9,833
   - Max sequence length: 138
   - Architecture: Input(138) -> Embedding(9,833) -> LSTM -> Output(10, Softmax)



 Labels (Both Models):
   - y_train: (11047,) (integer labels: 0 to 9)
   - y_test: (2172,) (integer labels)
   - Loss: Sparse Categorical Crossentropy
   - Output: Softmax (10 classes)
   - Class weights: Computed f

## 3. Save prepared data

Write clean dataset and model-ready arrays to `data/processed/` so the main classification notebook can load them.


In [9]:
# Output directory (under project root)
processed_dir = project_root / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
print(f"Saving to {processed_dir.absolute()}")

# 1) Clean clause-level dataset
clauses_clean_path = processed_dir / "clauses_clean.csv"
df_final.to_csv(clauses_clean_path, index=False)
print(f"  - {clauses_clean_path.name}")

# 2) NumPy arrays
np.save(processed_dir / "X_train_tfidf_dense.npy", X_train_tfidf_dense)
np.save(processed_dir / "X_test_tfidf_dense.npy", X_test_tfidf_dense)
np.save(processed_dir / "X_train_padded.npy", X_train_padded)
np.save(processed_dir / "X_test_padded.npy", X_test_padded)
np.save(processed_dir / "y_train.npy", y_train)
np.save(processed_dir / "y_test.npy", y_test)
print("  - X_train_tfidf_dense.npy, X_test_tfidf_dense.npy")
print("  - X_train_padded.npy, X_test_padded.npy")
print("  - y_train.npy, y_test.npy")

# 3) Sklearn objects (joblib)
import joblib
joblib.dump(label_encoder, processed_dir / "label_encoder.joblib")
joblib.dump(tfidf_vectorizer, processed_dir / "tfidf_vectorizer.joblib")
print("  - label_encoder.joblib, tfidf_vectorizer.joblib")

# 4) Keras Tokenizer (pickle)
with open(processed_dir / "tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
print("  - tokenizer.pkl")

# 5) Config (num_classes, max_sequence_length, vocab_size, reverse_label_mapping, class_weights)
config = {
    "num_classes": num_classes,
    "max_sequence_length": int(max_sequence_length),
    "vocab_size": int(actual_vocab_size),
    "reverse_label_mapping": {str(k): v for k, v in reverse_label_mapping.items()},
    "class_weights": {str(k): float(v) for k, v in class_weights.items()},
}
with open(processed_dir / "config.json", "w") as f:
    json.dump(config, f, indent=2)
print("  - config.json")

print("\n[OK] Prepared data saved. Run the main classification notebook and load from data/processed/.")


Saving to /Users/khajamoinuddinmohammed/Documents/MSDS/FALL 2025/BUAN 5312 ADVANCED ML/final project/data/processed


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.