### S3 Tie-Up

In [2]:
# Cell 1 — S3 config
import sagemaker
from sagemaker.s3 import S3Downloader, S3Uploader

# Location of the project data in S3: the objects used by this notebook live
# under s3://<bucket>/<prefix>/.
bucket, prefix = "complaint-classifier-jp2025", "data"
s3_uri = f"s3://{bucket}/{prefix}/"

# Sanity check: the value displayed by this cell is the list of object URIs
# found under the prefix, so the expected files can be confirmed by eye.
S3Downloader.list(s3_uri)


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


['s3://complaint-classifier-jp2025/data',
 's3://complaint-classifier-jp2025/data/X_train_full.npy',
 's3://complaint-classifier-jp2025/data/complaints_processed.csv',
 's3://complaint-classifier-jp2025/data/test/complaints_test.csv/complaints_test.csv',
 's3://complaint-classifier-jp2025/data/train/complaints_train.csv/complaints_train.csv']

In [3]:
# Shuffle the data, split it into *_train.csv and *_test.csv files, and store them back to S3
import pandas as pd
from sklearn.model_selection import train_test_split
import os 

# Source object in S3 and the local copy that this cell reads.
s3_file_uri = f"s3://{bucket}/{prefix}/complaints_processed.csv"
local_dir = "downloads"
local_file = "downloads/complaints_processed.csv"

# Download only when the local copy is missing. This replaces the former
# "uncomment once" download lines: a fresh environment can run the cell as-is,
# and re-runs do not fetch the file again.
if not os.path.exists(local_file):
    os.makedirs(local_dir, exist_ok=True)
    S3Downloader.download(s3_file_uri, local_dir)

# Load, then shuffle all rows with a fixed seed so the row order is reproducible.
df = pd.read_csv(local_file)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.columns.tolist())


['Unnamed: 0', 'product', 'narrative']


In [3]:

# Split: hold out 20% of the rows for testing. The split is stratified on the
# 'product' column and uses a fixed seed so it is repeatable.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['product']
)

# Save locally
train_df.to_csv("complaints_train.csv", index=False)
test_df.to_csv("complaints_test.csv", index=False)

# Uploading is opt-in because it only needs to happen once: set the flag to
# True to push the freshly written splits to S3.
UPLOAD_SPLITS_TO_S3 = False
s3_prefix = f"s3://{bucket}/{prefix}"

if UPLOAD_SPLITS_TO_S3:
    # S3Uploader.upload appends the local file name to the destination it is
    # given, so these objects end up at .../train/complaints_train.csv/complaints_train.csv
    # and .../test/complaints_test.csv/complaints_test.csv.
    S3Uploader.upload("complaints_train.csv", f"{s3_prefix}/train/complaints_train.csv")
    S3Uploader.upload("complaints_test.csv", f"{s3_prefix}/test/complaints_test.csv")
    print("Train and test CSVs uploaded to S3.")
else:
    # Report what actually happened instead of claiming an upload.
    print("Train and test CSVs saved locally; S3 upload skipped.")

Train and test CSVs uploaded to S3.


In [3]:
!pip install torch --quiet
!pip install transformers --quiet



KeyboardInterrupt



In [2]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm

# 🔹 Config
bucket = "complaint-classifier-jp2025"
s3_output = f"s3://{bucket}/models/X_train.npy"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🔹 Load DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.to(device)
model.eval()  # evaluation mode: the model is only used for inference here


# Download complaints_train.csv from S3
# These two names are imported here so the cell also runs right after a kernel
# restart, without first re-running the earlier cells that import them.
import os
from sagemaker.s3 import S3Downloader

# Built from `bucket` so the bucket name is defined in one place in this cell.
s3_csv_uri = f"s3://{bucket}/data/train/complaints_train.csv"
os.makedirs("downloads", exist_ok=True)
S3Downloader.download(s3_csv_uri, "downloads")




In [4]:

# 🔹 Read the training split and keep every non-missing narrative as a plain
# Python string.
df = pd.read_csv("downloads/complaints_train.csv")
texts = (
    df["narrative"]
    .dropna()
    .astype(str)
    .tolist()
)


In [5]:
# Rebind s3_uri to the data prefix (no trailing slash) and show it next to the
# number of narratives currently held in `texts`.
s3_uri = 's3://complaint-classifier-jp2025/data'
n_texts = len(texts)
print(n_texts, s3_uri)

129927 s3://complaint-classifier-jp2025/data


In [14]:
# 🔹 Embed in batches
# Imported here so the cell does not depend on an earlier cell having been run
# in the same kernel session.
from sagemaker.s3 import S3Uploader

batch_size = 16
embeddings = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    # Pad each batch to its longest item and truncate to at most 128 tokens.
    encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # Feature extraction only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Keep the hidden state at sequence position 0 for every item in the
        # batch, moved to the CPU as a NumPy array.
        cls_batch = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_batch)

# 🔹 Save + Upload
X_train = np.concatenate(embeddings, axis=0)
np.save("X_train_full.npy", X_train)
# upload() returns the URI of the uploaded object; report that rather than the
# destination prefix so the message names the real object location.
uploaded_uri = S3Uploader.upload("X_train_full.npy", s3_uri)
print("✅ Uploaded full embedding to S3:", uploaded_uri)

100%|██████████| 8121/8121 [07:20<00:00, 18.43it/s]


✅ Uploaded full embedding to S3: s3://complaint-classifier-jp2025/data/X_train_full.npy


In [15]:
# Check the shape of the saved embedding matrix. Memory-mapping the file avoids
# reading the full array into memory just to inspect its shape.
np.load("X_train_full.npy", mmap_mode="r").shape


(129927, 768)

In [16]:
#S3Uploader.upload("X_train_full.npy", "s3://complaint-classifier-jp2025/data/X_train_full.npy")

's3://complaint-classifier-jp2025/data/X_train_full.npy/X_train_full.npy'

In [1]:
# LABEL ENCODING + Model Training
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib


# Reload the training split and keep only the rows that have a narrative.
df = pd.read_csv("downloads/complaints_train.csv")
narratives = df["narrative"].dropna()
texts = narratives.astype(str).tolist()

# Take the labels from exactly those rows so texts and labels stay aligned.
valid_idx = narratives.index
labels = df.loc[valid_idx, "product"].astype(str).tolist()

# Encode the product names as integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Persist the fitted encoder next to the notebook.
joblib.dump(le, 'label_encoder.joblib')

# Displayed as the cell result: True when there is one encoded label per text.
len(y_encoded) == len(texts)



  from pandas.core.computation.check import NUMEXPR_INSTALLED


True

In [5]:
# Load the saved feature matrix and check that it has one row per encoded label.
X = np.load('X_train_full.npy')
n_feature_rows = X.shape[0]
assert n_feature_rows == len(y_encoded), "Mismatch in features and labels"

In [6]:
# Fit a logistic-regression classifier on the feature matrix
# (lbfgs solver, at most 1000 iterations, fixed seed).
clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42).fit(X, y_encoded)

# Persist the fitted model to disk.
joblib.dump(clf, 'logreg_model.joblib')

# Optional quick check. The predictions are made on X, the same data the model
# was fitted on, so this report describes performance on the training data.
y_pred = clf.predict(X)
train_report = classification_report(y_encoded, y_pred, target_names=le.classes_)
print(train_report)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                     precision    recall  f1-score   support

        credit_card       0.72      0.67      0.69     12452
   credit_reporting       0.87      0.92      0.89     72937
    debt_collection       0.74      0.62      0.67     18518
mortgages_and_loans       0.79      0.77      0.78     15192
     retail_banking       0.79      0.79      0.79     10828

           accuracy                           0.82    129927
          macro avg       0.78      0.75      0.77    129927
       weighted avg       0.82      0.82      0.82    129927

