#**CONTINUOUS ASSESSMENT**

##CHAINED MULTI-OUTPUT CLASSIFICATION

In [22]:
#!pip install scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [14]:
df = pd.read_csv('/content/PubMed Multi Label Text Classification Dataset Processed.csv')

In [15]:
df['text'] = df['Title'].fillna('') + ' ' + df['abstractText'].fillna('')

In [16]:
X_text = df['text']
y_type2 = df['A']
y_type3 = df['B']
y_type4 = df['C']

In [17]:
vectorizer = TfidfVectorizer(max_features=300)
X = vectorizer.fit_transform(X_text)

In [18]:
X_train, X_test, y2_train, y2_test, y3_train, y3_test, y4_train, y4_test = train_test_split(
    X, y_type2, y_type3, y_type4, test_size=0.2, random_state=42
)

In [23]:
model1 = LogisticRegression(max_iter=200)
model1.fit(X_train, y2_train)
y2_pred = model1.predict(X_test)

In [26]:
X_train_2 = pd.DataFrame(X_train.toarray())
X_train_2['type2'] = y2_train.values

X_test_2 = pd.DataFrame(X_test.toarray())
X_test_2['type2'] = y2_pred

X_train_2.columns = X_train_2.columns.astype(str)
X_test_2.columns = X_test_2.columns.astype(str)

In [27]:
model2 = LogisticRegression(max_iter=200)
model2.fit(X_train_2, y3_train)
y3_pred = model2.predict(X_test_2)

In [28]:
X_train_3 = X_train_2.copy()
X_train_3['type3'] = y3_train.values

X_test_3 = X_test_2.copy()
X_test_3['type3'] = y3_pred

X_train_3.columns = X_train_3.columns.astype(str)
X_test_3.columns = X_test_3.columns.astype(str)

In [29]:
model3 = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100)
model3.fit(X_train_3, y4_train)
y4_pred = model3.predict(X_test_3)



In [30]:
print("\n📌 Classification Report - Type 2:\n")
print(classification_report(y2_test, y2_pred))


📌 Classification Report - Type 2:

              precision    recall  f1-score   support

           0       0.74      0.82      0.78      5394
           1       0.76      0.66      0.71      4606

    accuracy                           0.75     10000
   macro avg       0.75      0.74      0.74     10000
weighted avg       0.75      0.75      0.75     10000



In [31]:
print("\n📌 Classification Report - Type 3:\n")
print(classification_report(y3_test, y3_pred))


📌 Classification Report - Type 3:

              precision    recall  f1-score   support

           0       0.64      0.28      0.39       724
           1       0.95      0.99      0.97      9276

    accuracy                           0.94     10000
   macro avg       0.79      0.63      0.68     10000
weighted avg       0.92      0.94      0.92     10000



In [32]:
print("\n📌 Classification Report - Type 4:\n")
print(classification_report(y4_test, y4_pred))


📌 Classification Report - Type 4:

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      4716
           1       0.81      0.80      0.81      5284

    accuracy                           0.79     10000
   macro avg       0.79      0.79      0.79     10000
weighted avg       0.79      0.79      0.79     10000



##Hierarchical Structure

In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from collections import Counter

In [35]:
df = pd.read_csv("/content/PubMed Multi Label Text Classification Dataset Processed.csv")

In [36]:
df['combined_text'] = df['Title'].fillna('') + " " + df['abstractText'].fillna('')
df['meshMajor'] = df['meshMajor'].fillna('').apply(lambda x: x.split(','))

In [37]:
all_labels = [label for sublist in df['meshMajor'].tolist() for label in sublist]
top_labels = [label for label, _ in Counter(all_labels).most_common(20)]
df['meshMajor'] = df['meshMajor'].apply(lambda labels: [l for l in labels if l in top_labels])
df = df[df['meshMajor'].map(len) > 0]

In [38]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['meshMajor'])

In [39]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['combined_text']).toarray()

In [40]:
X_train, X_test, y_train_all, y_test_all = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
y_train_lvl1 = y_train_all[:, 0]
y_test_lvl1 = y_test_all[:, 0]

In [42]:
model1 = RandomForestClassifier(n_estimators=100)
model1.fit(X_train, y_train_lvl1)
y1_pred = model1.predict(X_test)

In [45]:
X_train_lvl2 = X_train[y_train_lvl1 == 1]
y_train_lvl2 = y_train_all[y_train_lvl1 == 1, 1]
X_test_lvl2 = X_test[y1_pred == 1]

if len(X_train_lvl2) > 0 and len(X_test_lvl2) > 0:
    model2 = GradientBoostingClassifier()
    model2.fit(X_train_lvl2, y_train_lvl2)
    y2_pred = model2.predict(X_test_lvl2)

    # LEVEL 3 - further subset where Level 2 also predicted positive
    X_train_lvl3 = X_train_lvl2[y_train_lvl2 == 1]
    y_train_lvl3 = y_train_all[(y_train_lvl1 == 1) & (y_train_lvl2 == 1), 2]
    X_test_lvl3 = X_test_lvl2[y2_pred == 1]

    if len(X_train_lvl3) > 0 and len(X_test_lvl3) > 0:
        model3 = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100)
        model3.fit(X_train_lvl3, y_train_lvl3)
        y3_pred = model3.predict(X_test_lvl3)

        print("✅ Level 3 Evaluation Report:")
        print(classification_report(
            y_test_all[(y1_pred == 1) & (y2_pred == 1), 2],
            y3_pred
        ))
    else:
        print("⚠️ Not enough data for Level 3")
else:
    print("⚠️ Not enough data for Level 2")

⚠️ Not enough data for Level 2


In [48]:
from sklearn.metrics import accuracy_score

# ----------- LEVEL 1 REPORT -----------
print("\n✅ Level 1 Report (Top Label: {})".format(mlb.classes_[0]))
print(classification_report(y_test_lvl1, y1_pred))
print("Accuracy (Level 1):", accuracy_score(y_test_lvl1, y1_pred))



✅ Level 1 Report (Top Label:  'Adult')
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      7846
           1       0.00      0.00      0.00      1027

    accuracy                           0.88      8873
   macro avg       0.44      0.50      0.47      8873
weighted avg       0.78      0.88      0.83      8873

Accuracy (Level 1): 0.884255606897329


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
