In [None]:
pip install transformers datasets

In [None]:
pip install faker

In [None]:
pip install cleanlab

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, LSTM, Conv2D, Conv1D, MaxPooling1D, Dense, Dropout, GlobalMaxPooling1D, Input, Bidirectional, concatenate, Flatten, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model

import cleanlab

# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# detect and init the TPU
# The resolver is created with no arguments, then the runtime is connected to
# the TPU cluster and the TPU system is initialised, in that order.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
# `tpu_strategy.scope()` is used further down when the classifier is built.
# NOTE(review): recent TensorFlow releases expose this class as
# `tf.distribute.TPUStrategy`; confirm which spelling the installed version prefers.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Preparing data

Creating labeling noise for evaluation phase

In [None]:
from faker import Faker
import pandas as pd
import random

fake = Faker()

# Synthetic out-of-sample rows: 500 Faker-generated texts whose true class is 5,
# each tagged with a (necessarily wrong) label drawn from classes 1-4.
WRONG_LABELS = list(range(1, 5))
data = [(fake.text(), random.choice(WRONG_LABELS), 5) for _ in range(500)]

# Column order matches the tuple layout: text, noisy label, true label.
df = pd.DataFrame(data, columns=['text', 'changed Index', 'Class Index'])
df

In [None]:
N_REDRAWN = 120  # how many non-class-5 rows get their label re-drawn

data = pd.read_csv("/content/drive/MyDrive/ECE720/downsample_agnew_with_oos.csv")
# "changed Index" starts as a copy of the true label and is then corrupted below.
data["changed Index"] = data["Class Index"]

# Choose N_REDRAWN distinct rows among those whose true class is not 5.
candidate_index = data.index[data["Class Index"] != 5]
random_rows = np.random.choice(candidate_index, size=N_REDRAWN, replace=False)

# Overwrite the label of the chosen rows with a uniform draw from 1..4.
# The draw does not exclude the row's current label, so some of these rows can
# end up unchanged; later cells count only rows where the two columns differ.
data.loc[random_rows, "changed Index"] = np.random.randint(1, 5, size=N_REDRAWN)


In [None]:
# Stack the label-noised dataset on top of the synthetic rows, shuffle every
# row, renumber the index, and persist the result for the next section.
final_data = (
    pd.concat([data, df], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
final_data.to_csv("/content/drive/MyDrive/ECE720/digi_text/AGnew/final_data.csv", index=False)
final_data

Loading the dataset and concatenating it with the noisy data

In [None]:
data = pd.read_csv("/content/drive/MyDrive/ECE720/downsample_agnew_with_oos.csv")
data["changed Index"] = data["Class Index"]

# Class-5 rows of the clean dataset.
temp = data.loc[data["changed Index"].eq(5)]
# Noisy dataset written by the previous section.
temp2 = pd.read_csv("/content/drive/MyDrive/ECE720/digi_text/AGnew/final_data.csv")

# NOTE(review): final_data.csv was built from this same source CSV plus the
# synthetic rows, so the class-5 rows in `temp` look like they are already in
# `temp2` and end up duplicated here. Confirm the duplication is intended
# (identical texts can land in both the train and test side of a fold).
final_data = (
    pd.concat([temp, temp2], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
final_data

# Training Classifier for Out-of-Sample Prediction

In [None]:
from sklearn.model_selection import KFold

# Split the rows of `final_data` into k consecutive folds (KFold is created
# without shuffling) and keep every (train positions, test positions) pair.
k = 5
kf = KFold(n_splits=k)

fold_indices = [(train_index, test_index) for train_index, test_index in kf.split(final_data)]

In [None]:
# NOTE(review): vocab_size and embed_size are not referenced anywhere else in
# the visible notebook; they look like leftovers from an earlier model.
vocab_size = 20000
embed_size = 32
# Hugging Face checkpoint name shared by the tokenizer and the model.
distil_bert = 'distilbert-base-uncased'

# Longest text measured in whitespace-separated words. This value is used as
# the padded token length below.
# NOTE(review): the tokenizer emits sub-word tokens plus special tokens, so a
# text can need more than `maxlen` tokens; confirm truncation is acceptable.
maxlen = final_data['text'].map(lambda x: len(x.split())).max()

# NOTE(review): the padding/length keyword arguments passed here are
# presumably only stored as tokenizer init kwargs; the per-call arguments in
# tokenize() are what shape the encoded output. Verify against the installed
# transformers version.
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True, add_special_tokens=True,
                                                max_length=maxlen, pad_to_max_length=True)

def tokenize(sentences, tokenizer):
    """Encode every sentence to fixed-length DistilBERT inputs.

    Each sentence is encoded separately, padded to the module-level `maxlen`
    and truncated to it when longer, so all rows have the same length and can
    be stacked into rectangular arrays.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: a Hugging Face tokenizer exposing `encode_plus`.

    Returns:
        A tuple `(input_ids, attention_masks, token_type_ids)` of int32 NumPy
        arrays, one row per sentence.
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        # `padding='max_length'` + `truncation=True` replace the deprecated
        # `pad_to_max_length=True` and make the truncation explicit instead of
        # relying on the library's implicit fallback.
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return (
        np.asarray(input_ids, dtype='int32'),
        np.asarray(input_masks, dtype='int32'),
        np.asarray(input_segments, dtype='int32'),
    )

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Out-of-fold class probabilities, one slot per row of `final_data`.
# `None` marks a row that has not been predicted yet. (The name keeps its
# original spelling because later cells read it.)
predctions = [None] * len(final_data)
for i, (train_index, test_index) in enumerate(fold_indices):
  train_data = final_data.iloc[train_index]
  test_data = final_data.iloc[test_index]

  # Labels are shifted so that classes start at 0.
  X_train = train_data['text']
  y_train = train_data['changed Index'].apply(lambda x: x-1).values

  x_test = test_data['text']
  y_test = test_data['changed Index'].apply(lambda x: x-1).values

  num_labels = 5

  # tokenize() returns (input_ids, attention_masks, token_type_ids); only the
  # first two are fed to the model below.
  X_train = tokenize(X_train, tokenizer)
  x_test = tokenize(x_test, tokenizer)

  # A fresh model is built for every fold inside the TPU strategy scope.
  with tpu_strategy.scope():
    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

    input_ids_in = tf.keras.layers.Input(shape=(maxlen,), name='input_token', dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(maxlen,), name='masked_token', dtype='int32')

    embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
    X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(embedding_layer)
    X = tf.keras.layers.GlobalMaxPool1D()(X)
    X = tf.keras.layers.Dense(64, activation='relu')(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    # Softmax (not sigmoid): the task is single-label multi-class, the loss is
    # sparse categorical cross-entropy, and the confident-learning step treats
    # each output row as a probability distribution over the classes.
    X = tf.keras.layers.Dense(num_labels, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

    # Freeze the first three layers (presumably the two inputs and the
    # DistilBERT encoder) so only the layers on top are trained.
    for layer in model.layers[:3]:
      layer.trainable = False

  # summary() prints by itself and returns None, so it is not wrapped in print().
  model.summary()

  # NOTE(review): the checkpoint is selected on the held-out fold, and the
  # predictions below come from the model as it stands after the last epoch
  # (the saved weights are never reloaded in this cell).
  callbacks = [
    ModelCheckpoint(
        filepath="/content/drive/MyDrive/ECE720/digi_text/AGnew/weights_oos_fold"+str(i)+".h5",
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        verbose=1)]

  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(X_train[0:2], y_train, batch_size=1024, validation_data=(x_test[0:2], y_test), epochs=10, callbacks=callbacks)

  pred_probs = model.predict(x_test[0:2])
  for row_position, row_probs in zip(test_index, pred_probs):
    # A slot that is already filled would mean the folds overlap.
    if predctions[row_position] is not None:
      print("ga")
    predctions[row_position] = row_probs


In [None]:
# Hard prediction per row = index of the largest out-of-fold probability.
preds = [np.argmax(row_probs) for row_probs in predctions]
# Given (possibly noisy) labels, shifted to start at 0 like the predictions.
labels = final_data["changed Index"].to_numpy() - 1

In [None]:
# Confusion matrix of given (noisy) labels against out-of-fold predictions.
cm = confusion_matrix(labels, preds)
# plot_confusion_matrix creates its own figure and axes, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
fig, ax = plot_confusion_matrix(cm, figsize=(16,12), hide_ticks=True, cmap=plt.cm.Blues)
# hide_ticks=True removes the default ticks; put the 0-based class ids back.
plt.xticks(range(5), [0,1,2,3,4], fontsize=12)
plt.yticks(range(5), [0,1,2,3,4], fontsize=12)
plt.show()

# Confident Learning Step

In [None]:

# STEP 1 - Compute confident joint

# Given labels (0-based) and the out-of-fold probabilities stacked into one
# 2-D array with one row per example.
labels = final_data["changed Index"].to_numpy() - 1
pred_probs = np.asarray(predctions)

# Number of classes = number of distinct given labels.
K = np.unique(labels).size

# Per-class threshold: mean predicted probability of class k over the
# examples whose given label is k, i.e. P(label^=k|label=k).
thresholds = np.asarray([pred_probs[labels == k, k].mean() for k in range(K)])
# Small tolerance so probabilities sitting exactly on the threshold count.
cutoffs = thresholds - 1e-6

# confident_joint[s][y] counts examples with given label s that are
# confidently predicted as class y.
confident_joint = np.zeros((K, K), dtype=int)
for s_label, row in zip(labels, pred_probs):
    confident_classes = np.flatnonzero(row >= cutoffs)
    if confident_classes.size == 0:
        # Not confidently any class: the example is left out of the count.
        continue
    if confident_classes.size == 1:
        y_label = confident_classes[0]
    else:
        # Several confident classes: fall back to the most probable one.
        y_label = np.argmax(row)
    confident_joint[s_label, y_label] += 1

# Normalize confident joint with cleanlab's own calibration routine.
confident_joint = cleanlab.count.calibrate_confident_joint(confident_joint, labels)

cleanlab.internal.util.print_joint_matrix(confident_joint)



 Joint Label Noise Distribution Matrix P(given_label, true_label) of shape (5, 5)
 p(s,y)	y=0	y=1	y=2	y=3	y=4
	---	---	---	---	---
s=0 |	5275	121	183	135	0
s=1 |	60	5641	12	17	1
s=2 |	144	26	5081	473	4
s=3 |	116	20	264	5330	1
s=4 |	11	13	22	14	11142
	Trace(matrix) = 32469



In [None]:
# STEP 2 - Find label errors
#
# Uses `confident_joint`, `labels`, `pred_probs` and `K` from the previous
# cell. Produces `label_errors_bool` (mask over all examples) and
# `label_errors_idx` (positions of the flagged examples, ordered by margin).

# We arbitrarily choose at least 5 examples left in every class.
# Regardless of whether some of them might be label errors.
MIN_NUM_PER_CLASS = 5
# Leave at least MIN_NUM_PER_CLASS examples per class.
# NOTE prune_count_matrix is transposed (relative to confident_joint)
# NOTE(review): `_keep_at_least_n_per_class` is underscore-prefixed, i.e. not
# part of cleanlab's public API; confirm it exists in the installed version.
prune_count_matrix = cleanlab.filter._keep_at_least_n_per_class(
    prune_count_matrix=confident_joint.T,
    n=MIN_NUM_PER_CLASS,
)

# Number of examples per given label.
s_counts = np.bincount(labels)
noise_masks_per_class = []
# For each row in the transposed confident joint
for k in range(K):
    # Accumulates, across all noisy labels j, the examples to prune for class k.
    noise_mask = np.zeros(len(pred_probs), dtype=bool)
    pred_probs_k = pred_probs[:, k]
    if s_counts[k] > MIN_NUM_PER_CLASS:  # Don't prune if not MIN_NUM_PER_CLASS
        for j in range(K):  # noisy label index (k is the true label index)
            if k != j:  # Only prune for noise rates, not diagonal entries
                num2prune = prune_count_matrix[k][j]
                if num2prune > 0:
                    # num2prune'th largest p(classk) - p(class j)
                    # for x with noisy label j
                    margin = pred_probs_k - pred_probs[:, j]
                    s_filter = labels == j
                    # Negating twice turns np.partition's "k-th smallest"
                    # into the num2prune-th largest margin.
                    threshold = -np.partition(-margin[s_filter], num2prune - 1)[
                        num2prune - 1
                    ]
                    # Flag examples labelled j whose margin reaches the threshold.
                    noise_mask = noise_mask | (s_filter & (margin >= threshold))
        noise_masks_per_class.append(noise_mask)
    else:
        # Class too small: contribute an all-False mask.
        noise_masks_per_class.append(np.zeros(len(labels), dtype=bool))

# Boolean label error mask
# An example is flagged if any per-class mask flagged it.
label_errors_bool = np.stack(noise_masks_per_class).any(axis=0)

# Remove label errors if given label == model prediction
for i, pred_label in enumerate(pred_probs.argmax(axis=1)):
    # np.all lets this work for multi_label and single label
    if label_errors_bool[i] and np.all(pred_label == labels[i]):
        label_errors_bool[i] = False

# Convert boolean mask to an ordered list of indices for label errors
label_errors_idx = np.arange(len(labels))[label_errors_bool]
# self confidence is the holdout probability that an example
# belongs to its given class label
self_confidence = np.array([np.mean(pred_probs[i][labels[i]]) for i in label_errors_idx])
# Margin between the given label's probability and the top probability;
# this rebinds the name `margin` used inside the loop above.
margin = self_confidence - pred_probs[label_errors_bool].max(axis=1)
# Ascending margin: the most negative margins come first.
label_errors_idx = label_errors_idx[np.argsort(margin)]

print("Indices of label errors found by confident learning:")
print("Note label errors are sorted by likelihood of being an error")
print("but here we just sort them by index for comparison with above.")
print(np.array(sorted(label_errors_idx)))


Indices of label errors found by confident learning:
Note label errors are sorted by likelihood of being an error
but here we just sort them by index for comparison with above.
[   14    35    53 ... 33966 33980 34103]


In [None]:
# Rows flagged as label errors, with the model's predicted class (1-based,
# like the label columns) added for inspection.
# .copy() detaches the selection from `final_data`, so adding a column does
# not trigger pandas' SettingWithCopyWarning or depend on view/copy semantics.
test = final_data.iloc[label_errors_idx].copy()
test['pred'] = np.array(preds)[label_errors_idx] + 1
test

In [None]:
# Ground truth for the evaluation: rows whose noisy label differs from the
# true label.
is_mislabeled = final_data["changed Index"].ne(final_data["Class Index"])
different_rows = final_data.loc[is_mislabeled]
actual_label_errors = different_rows.index.tolist()
NUM_ERRORS = len(actual_label_errors)

In [None]:
# Compare the examples flagged by confident learning with the known errors.
# Sets make each membership test O(1) instead of scanning an array or list.
flagged_set = set(np.asarray(label_errors_idx).tolist())
actual_set = set(actual_label_errors)
num_hits = len(flagged_set & actual_set)

# Share of the true label errors that were flagged (recall).
score = num_hits / NUM_ERRORS if NUM_ERRORS else float("nan")
print("% actual errors that confident learning found: {:.0%}".format(score))

# Share of the flagged examples that are true label errors (precision).
# This value was previously computed but never reported.
score = num_hits / len(label_errors_idx) if len(label_errors_idx) else float("nan")
print("% confident learning errors that are actual errors: {:.0%}".format(score))

# OOS data generation

In [None]:
import random

def generate_mixed_sentence(df, label_col="label"):
    """Build a nonsense sentence from the words of three differently-labelled texts.

    Rows are sampled one at a time until three texts with three distinct
    labels have been collected. Their words are pooled and a random subset is
    joined, in random order, into a new sentence.

    Args:
        df: DataFrame with a 'text' column and the column named by `label_col`.
        label_col: name of the label column.

    Returns:
        The generated sentence as a single space-separated string.

    Raises:
        ValueError: if `df` has fewer than three distinct labels (the sampling
            loop could never finish).
    """
    if df[label_col].nunique() < 3:
        raise ValueError(
            "generate_mixed_sentence needs at least 3 distinct labels in column %r" % label_col
        )

    # Randomly select 3 texts with 3 different labels
    texts = []
    labels = []
    while len(texts) < 3:
        row = df.sample(n=1)
        label = row[label_col].values[0]
        if label not in labels:
            texts.append(row['text'].values[0])
            labels.append(label)

    # Pool the words of the selected texts
    words = []
    for text in texts:
        words += text.split()

    # Sentence length is drawn from [lower, upper]. For pools of fewer than 30
    # words a third of the pool is below the lower bound, which used to make
    # random.randint raise; the upper bound is therefore clamped to the lower one.
    lower = min(len(words), 10)
    upper = max(lower, len(words) // 3)

    # random.sample already returns the chosen words in random order, so no
    # separate shuffle of the pool is needed.
    new_sentence = ' '.join(random.sample(words, random.randint(lower, upper)))

    return new_sentence


In [None]:
data = pd.read_csv("/content/drive/MyDrive/ECE720/train.csv")

# Title and description are joined with a single space into one text field.
data['text'] = data['Title'] + " " + data['Description']
X_train = data['Title'] + " " + data['Description']
y_train = (data['Class Index'] - 1).values  # Classes need to begin from 0
data

In [None]:
from transformers import pipeline
# Masked-language-model pipeline used by the generation cells below: they pass
# a text containing one "[MASK]" token and read the 'sequence' field of one of
# the returned candidates.
fill = pipeline('fill-mask', tokenizer='bert-base-uncased', model="bert-base-uncased")

METHOD1

In [None]:
import random

# METHOD 1: mix the words of three differently-labelled texts, then replace
# roughly 80% of the words one by one with a masked-LM suggestion.
OOS_CSV_PATH = "/content/drive/MyDrive/ECE720/oos_new1.csv"

# Sample source texts from a snapshot: the loop overwrites 'Class Index' with
# "oos" row by row, which would otherwise turn "oos" into a label that the
# "three different labels" check accepts.
source_pool = data[['text', 'Class Index']].copy()

for i in range(data.shape[0]):
  text = generate_mixed_sentence(source_pool, label_col="Class Index")
  text = text.split()
  for j in range(len(text)):
    # The filled sequence is re-split each time, so the word count can shrink.
    if j >= len(text):
      break
    prob = random.uniform(0, 1)
    if prob < 0.8:
      text[j] = "[MASK]"
      text = " ".join(text)
      # Pick one of the first five candidates at random.
      text = fill(text)[random.randint(0,4)]['sequence']
      text = text.split()

  text = " ".join(text)
  print(text)
  data.loc[i,'oos'] = text
  data.loc[i,'Class Index'] = "oos"

  # Periodic checkpoint so a long run can be inspected or resumed.
  if i%100==0:
    print(i,"******************************************************\n")
    data.to_csv(OOS_CSV_PATH, index=False)

# Final save: without it, the rows generated after the last multiple of 100
# would never reach the CSV.
data.to_csv(OOS_CSV_PATH, index=False)

data

METHOD2

In [None]:
import random

# METHOD 2: start from one random word of the original text and let the
# masked LM append one word at a time.
# NOTE(review): this writes to the same file as METHOD 1 and overwrites it.
OOS_CSV_PATH = "/content/drive/MyDrive/ECE720/oos_new1.csv"

for i in range(data.shape[0]):
  words = data.iloc[i]['text'].split()
  if not words:
    # Nothing to seed the generation with; random.randint(0, -1) would raise.
    continue
  # Target number of appended words: between two thirds and all of the
  # original word count.
  len_sentence = random.randint(int(len(words)/1.5),len(words))
  text = words[random.randint(0,len(words)-1)]
  for _ in range(len_sentence):
    text = text + " [MASK]"
    # Pick one of the first five candidates at random.
    text = fill(text)[random.randint(0,4)]['sequence']

  print(text)
  data.loc[i,'oos'] = text
  data.loc[i,'Class Index'] = "oos"

  # Periodic checkpoint so a long run can be inspected or resumed.
  if i%100==0:
    print(i,"******************************************************\n")
    data.to_csv(OOS_CSV_PATH, index=False)

# Final save: without it, the rows generated after the last multiple of 100
# would never reach the CSV.
data.to_csv(OOS_CSV_PATH, index=False)

data

In [None]:
# Keep only the rows that were actually relabelled "oos" and store them apart.
# NOTE(review): the generation cells above write oos_new1.csv, while this
# reads oos_new.csv; confirm this is the intended input file.
df = pd.read_csv("/content/drive/MyDrive/ECE720/oos_new.csv")
df1 = df.loc[df['Class Index'].eq('oos')]
df1.to_csv("/content/drive/MyDrive/ECE720/oos_dgnew1.csv", index=False)

In [None]:
# Reshape the generated rows to the training schema: the generated sentence
# becomes 'text', the source columns are dropped, and every row gets class 5.
oos = (
    pd.read_csv("/content/drive/MyDrive/ECE720/oos_dgnew1.csv")
    .assign(text=lambda frame: frame['oos'])
    .drop(columns=['oos', 'Description', 'Title'])
    .assign(**{'Class Index': 5})
)

In [None]:
data = pd.read_csv("/content/drive/MyDrive/ECE720/train.csv")

# Title and description are joined with a single space into one text field.
X_train = data['Title'] + " " + data['Description']
y_train = (data['Class Index'] - 1).values  # Classes need to begin from 0
# Keep only the label and the combined text.
data = (
    data
    .assign(text=data['Title'] + " " + data['Description'])
    .drop(columns=['Description', 'Title'])
)

In [None]:
# Original classes followed by the generated class-5 rows, with a fresh 0..n-1 index.
final_data = pd.concat([data, oos], ignore_index=True)

In [None]:
import pandas as pd

# Work on the combined frame built in the previous cell (no file is read here).
df = final_data

# Downsample to a balanced set: every class keeps its first
# `min_class_count` rows, where that count is the size of the smallest class.
min_class_count = df.groupby('Class Index').size().min()
result = (
    df.groupby('Class Index')
    .head(min_class_count)
    .reset_index(drop=True)
)


In [None]:
# Persist the balanced dataset; the data-preparation cells at the top of the
# notebook read this same path.
result.to_csv("/content/drive/MyDrive/ECE720/downsample_agnew_with_oos.csv", index=False)