In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
from nltk.corpus import stopwords
print(stopwords.words('english')[::])  # Should print stopwords list


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
import pandas as pd
# Load dataset (replace with actual file)
df = pd.read_csv(r"CAREGIVERS\CAREGIVERS.csv")

In [4]:
df.head()

Unnamed: 0,ROW_ID,CGID,LABEL,DESCRIPTION
0,2228,16174,RO,Read Only
1,2229,16175,RO,Read Only
2,2230,16176,Res,Resident/Fellow/PA/NP
3,2231,16177,RO,Read Only
4,2232,16178,RT,Respiratory


In [5]:
import re
# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

In [6]:
# Apply preprocessing to "DESCRIPTION"
df["clean_text"] = df["DESCRIPTION"].astype(str).apply(preprocess_text)

In [7]:
# Encode Labels (Convert outcomes to numerical values)
label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["LABEL"])


In [8]:
# Splitting Data
X = df["clean_text"].values  # Features (processed text)
y = df["encoded_label"].values  # Target labels (encoded outcomes)


In [9]:
# Tokenization & Padding
max_words = 5000  # Max unique words
max_len = 200  # Max words per caregiver note

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')

In [10]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [11]:
# Define LSTM Model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer for multi-class classification
])



In [12]:
# Compile Model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 148ms/step - accuracy: 0.1478 - loss: 4.0704 - val_accuracy: 0.2199 - val_loss: 2.7462
Epoch 2/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 108ms/step - accuracy: 0.2023 - loss: 2.7467 - val_accuracy: 0.2199 - val_loss: 2.7481
Epoch 3/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 107ms/step - accuracy: 0.2086 - loss: 2.7079 - val_accuracy: 0.2199 - val_loss: 2.7541
Epoch 4/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 108ms/step - accuracy: 0.2112 - loss: 2.7371 - val_accuracy: 0.2199 - val_loss: 2.7608
Epoch 5/5
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 104ms/step - accuracy: 0.2112 - loss: 2.6856 - val_accuracy: 0.2199 - val_loss: 2.7692


<keras.src.callbacks.history.History at 0x1b9f11e4bf0>

In [14]:
# Evaluate Model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step


In [17]:
from sklearn.metrics import classification_report
import numpy as np

# Ensure only present labels are used
unique_test_labels = np.unique(y_test)
target_names = [str(label_encoder.inverse_transform([i])[0]) for i in unique_test_labels]  # Ensure string labels

# Generate classification report with zero_division fix
print("Classification Report:\n", classification_report(y_test, y_pred_classes, labels=unique_test_labels, target_names=target_names, zero_division=0))


Classification Report:
               precision    recall  f1-score   support

          AA       0.00      0.00      0.00         1
       Admin       0.00      0.00      0.00         8
          CM       0.00      0.00      0.00        13
       CO-Op       0.00      0.00      0.00         1
         CRT       0.00      0.00      0.00         2
       Co-Wk       0.00      0.00      0.00         1
      Co-Wkr       0.00      0.00      0.00         2
      CoOPSt       0.00      0.00      0.00         1
        CoOp       0.00      0.00      0.00         1
      CoOpSt       0.00      0.00      0.00         3
      CoWker       0.00      0.00      0.00        10
       CoWkr       0.00      0.00      0.00         4
      Coop S       0.00      0.00      0.00         1
        Coor       0.00      0.00      0.00         1
      CsMngm       0.00      0.00      0.00         1
          DI       0.00      0.00      0.00         7
          Dr       0.00      0.00      0.00         1
   

In [21]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [22]:
# 📌 Step 1: Load Dataset
df = pd.read_csv(r"CAREGIVERS\CAREGIVERS.csv")  # Change to your actual file name
df.head(3)

Unnamed: 0,ROW_ID,CGID,LABEL,DESCRIPTION
0,2228,16174,RO,Read Only
1,2229,16175,RO,Read Only
2,2230,16176,Res,Resident/Fellow/PA/NP


In [23]:
# 📌 Step 2: Remove labels that appear only once
label_counts = df["LABEL"].value_counts()
df = df[df["LABEL"].isin(label_counts[label_counts > 1].index)]  # Keep labels with more than 1 occurrence


In [24]:
# 📌 Step 3: Handle Missing Values in Description
df["DESCRIPTION"] = df["DESCRIPTION"].fillna("")

In [25]:
# 📌 Step 4: Text Preprocessing (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(df["DESCRIPTION"])

In [26]:
# 📌 Step 5: Encode Labels
label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["LABEL"])
y = df["encoded_label"]


In [27]:
# 📌 Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [28]:
# 📌 Step 7: Handle Class Imbalance (Using SMOTE)
smote = SMOTE(sampling_strategy="auto", k_neighbors=1, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [31]:
param_grid = {
    "n_estimators": [50, 100],  # Reduce the number of trees
    "max_depth": [10, None],  # Limit depth
    "min_samples_split": [5, 10]
}


In [32]:
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="accuracy", n_jobs=1)  # Use 1 job instead of -1


In [33]:
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(class_weight="balanced", random_state=42)
random_search = RandomizedSearchCV(rf, param_grid, cv=3, scoring="accuracy", n_iter=5, n_jobs=1, random_state=42)
random_search.fit(X_train_resampled, y_train_resampled)

best_rf = random_search.best_estimator_


In [34]:
rf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
y_pred = rf.predict(X_test)


In [36]:
if hasattr(grid_search, "best_params_"):
    print("Best Model Parameters:", grid_search.best_params_)
else:
    print("Grid Search did not run successfully.")


Grid Search did not run successfully.


In [37]:
print("Best Model Parameters:", random_search.best_params_)


Best Model Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 10}


In [40]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Ensure only present labels are used
unique_test_labels = np.unique(y_test)  # Get only labels in y_test
target_names = label_encoder.inverse_transform(unique_test_labels)  # Convert to class names

# Print Accuracy and Classification Report
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=unique_test_labels, target_names=target_names, zero_division=0))



Accuracy: 0.11705006765899864

Classification Report:
               precision    recall  f1-score   support

       Admin       0.00      0.00      0.00         4
          CM       0.87      1.00      0.93        13
         CRT       0.00      0.00      0.00         1
      Co-Wkr       0.00      0.00      0.00         1
      CoOPSt       0.00      0.00      0.00         1
      CoOpSt       0.00      0.00      0.00         1
      CoOpst       0.00      0.00      0.00         1
      CoWker       0.00      0.00      0.00         7
       CoWkr       0.00      0.00      0.00         3
      CoWork       0.00      0.00      0.00         1
          DI       0.00      0.00      0.00         7
      DietIn       0.00      0.00      0.00         1
       HMSIV       0.00      0.00      0.00         1
         IMD       1.00      1.00      1.00         2
       ISOPS       0.40      1.00      0.57         2
      ISSupp       0.00      0.00      0.00         2
       LICSW       0.00  