In [None]:
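# One-time dataset download. Uncomment and run this cell only if the
# `nlp_getting_started/` folder is not already next to this notebook.
#
# import zipfile
#
# NOTE: the next line looks like a stray IDE auto-import; the download does not need it.
# from fontTools.misc.cython import returns
# !curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
#
# zip_ref = zipfile.ZipFile('nlp_getting_started.zip')
# zip_ref.extractall()
# zip_ref.close()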

In [None]:
import pandas as pd

# Load the train and test CSVs from the nlp_getting_started/ folder
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv("nlp_getting_started/train.csv")
test_df = pd.read_csv("nlp_getting_started/test.csv")
train_df.head()  # preview the first rows

In [None]:
# frac=1 samples 100% of the rows, i.e. a full shuffle of the training frame.
train_df_shuffled = train_df.sample(frac=1, random_state=42)  # fixed random_state for a reproducible order
train_df_shuffled.head()

In [None]:
# Count how many examples each target value has (class balance check).
train_df.target.value_counts()

In [None]:
import random

# Print five consecutive (text, target) examples starting at a random row.
# The upper bound leaves room for a full window of five rows.
random_index = random.randint(0, len(train_df) - 5)
sample_rows = train_df[["text", "target"]][random_index:random_index + 5]
for text, target in sample_rows.itertuples(index=False):
    label_note = "(Real disaster)" if target > 0 else "(Not Real disaster)"
    print(f"target: {target}", label_note)
    print(f"Text:\n{text}\n")
    print("--------------\n")

In [None]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled examples for validation; random_state=42 fixes the split.
# Text and labels are passed as NumPy arrays rather than pandas Series.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, random_state=42)

In [None]:
# Sanity check: sentences and labels should have matching lengths within each split.
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

In [None]:
# Peek at the first five training sentences.
train_sentences[:5]

In [None]:
#converting text to number
import tensorflow as tf
from keras.api.layers import TextVectorization

# Demonstration of the main TextVectorization options with their values spelled out.
# `pad_to_max_tokens` is intentionally not passed: per the Keras documentation it only
# applies to the "multi_hot", "count" and "tf_idf" output modes, not to "int".
# This instance is never adapted; it is replaced by the vectorizer created a few cells below.
text_vectorizer = TextVectorization(
    max_tokens=500000,  # upper bound on the vocabulary size
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",  # map each token to an integer id
    output_sequence_length=None,
)


In [None]:
# Number of whitespace-separated tokens in the first training sentence.
len(train_sentences[0].split())

In [None]:
# Average number of whitespace-separated tokens per training sentence, rounded to the
# nearest integer. The division must happen inside round(): rounding the integer sum
# first is a no-op and leaves an unrounded float.
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

In [None]:
max_vocab_length = 10000  # cap on the number of tokens kept in the vocabulary
max_length = 15  # output sequences are padded/truncated to this many token ids
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)
# Build the vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

In [None]:
# Vectorize a hand-written sentence to see the integer token ids it maps to.
sample_sentences = "There's a flood in my street !!"
text_vectorizer([sample_sentences])

In [None]:
# Choose a random sentence from the training set and tokenize it.
random_sentence = random.choice(train_sentences)
# The trailing backslash continues the f-string onto the next line, so that line's
# leading spaces are part of the printed text.
print(f"Original sentence: \n {random_sentence}\
      \n\nVectorize version:")
text_vectorizer([random_sentence])

In [None]:
# Inspect the vocabulary learned by adapt(). The prints below treat the list as
# ordered from most to least frequent token.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of word in vocabulary: {len(words_in_vocab)}")
print(f"5 most common word: {top_5_words}")
print(f"5 least common word: {bottom_5_words}")

In [None]:
#Creating an Embedding using an Embedding layer
from keras.api.layers import Embedding
# Embedding layer that maps each token id (0 .. max_vocab_length - 1) to a trainable
# 128-dimensional vector, initialised uniformly at random.
# `input_length` is not passed: Keras 3 deprecates the argument and asks for it to be
# removed.
embeddings = Embedding(input_dim=max_vocab_length,
                       output_dim=128,
                       embeddings_initializer="uniform",
                       name="embeddings_1")
embeddings

In [None]:
# Pick another random training sentence and run it through vectorizer + embedding.
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
\n\nVectorize version:")

# Token ids -> one embedding vector per token position.
sample_embeded=embeddings(text_vectorizer([random_sentence]))
sample_embeded

In [None]:
# Embedding vector of the first token, its shape, and the sentence it came from.
sample_embeded[0][0],sample_embeded[0][0].shape, random_sentence

In [None]:
from sklearn.pipeline import Pipeline
# Model 0: TF-IDF + Multinomial Naive Bayes baseline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

model0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # convert words into numbers using tfidf
    ("clf", MultinomialNB()) # model the text
])

# Fit both pipeline steps on the training split.
model0.fit(train_sentences, train_labels)

In [None]:
# Score the baseline pipeline on the validation split and print it as a percentage.
baseline_score=model0.score(val_sentences, val_labels)
print(f"Baseline score: {baseline_score*100:.2f}%")

In [None]:
# Predicted labels for the validation sentences; show the first 20.
baseline_preds=model0.predict(val_sentences)
baseline_preds[:20]

In [None]:
# Ground-truth labels for the same 20 validation examples whose predictions are shown
# in the previous cell (the predictions were made on val_sentences, so they must be
# compared against val_labels, not train_labels).
val_labels[:20]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_result(y_true, y_pred):
    """Collect accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        "accuracy" is ``accuracy_score`` multiplied by 100; the other three are
        the weighted-average values exactly as returned by
        ``precision_recall_fscore_support`` (not multiplied by 100).
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth returned value (support) is not needed here.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Metric dict for the baseline; later cells compare every model against it.
baseline_result=calculate_result(y_true=val_labels, y_pred=baseline_preds)
baseline_result

In [None]:
from FRAME.TENSORFLOW.helper_functions import create_tensorboard_callback

# Passed as `dir_name` to create_tensorboard_callback in the fit() cells below.
SAVE_DIR= "model_logs"

In [None]:
from keras.api.layers import Input,Dense,GlobalAveragePooling1D
from keras.api.models import Model
# Model 1: raw string -> token ids -> embeddings -> average over tokens -> sigmoid.
inputs=Input(shape=(1,), dtype=tf.string)  # one raw string per example
x=text_vectorizer(inputs)  # string -> integer token ids
x=embeddings(x)  # token ids -> embedding vectors (layer "embeddings_1")
x=GlobalAveragePooling1D()(x)  # average the token vectors into one vector per example
outputs=Dense(1, activation="sigmoid")(x)  # single sigmoid unit for the binary target
model1=Model(inputs=inputs, outputs=outputs, name="model1_dense")

In [None]:
from keras.api.optimizers import Adam
# Binary cross-entropy matches the single sigmoid output; Adam uses its default settings.
model1.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model1.summary()

In [None]:
# Train for 5 epochs, validating on the held-out split after each epoch.
# create_tensorboard_callback comes from the project's helper_functions module.
model1_history=model1.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                          callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="simple-dense-model")],verbose=2)

In [None]:
# Loss and accuracy (the compiled metric) on the validation split.
model1.evaluate(val_sentences, val_labels)

In [None]:
# Weights of the `embeddings` layer, which model1 uses and therefore trained.
embeddings.weights

In [None]:
# Fetch the embedding matrix from model1 by layer name and show its shape.
embed_weights=model1.get_layer("embeddings_1").get_weights()[0]
print(embed_weights.shape)

In [None]:
# Sigmoid outputs (probabilities) for the validation sentences; show the first 10.
model_1_pred_probs=model1.predict(val_sentences)
model_1_pred_probs[:10]

In [None]:
# Round probabilities to 0/1 labels and remove the size-1 dimensions.
model1_preds=tf.squeeze(tf.round(model_1_pred_probs))
model1_preds[:10]

In [None]:
# Same four metrics as the baseline, computed for model1's validation predictions.
model1_results=calculate_result(y_true=val_labels, y_pred=model1_preds)
model1_results

In [None]:
import numpy as np
# Element-wise check of which metrics model1 beats the baseline on. Both dicts come
# from calculate_result, so their values are in the same key order.
np.array(list(model1_results.values()))>np.array(list(baseline_result.values()))

In [None]:
# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
    """Print a per-metric comparison between a baseline and a new model.

    Args:
        baseline_results: dict mapping metric name to the baseline's value.
        new_model_results: dict with (at least) the same keys for the new model.

    One line is printed per key of ``baseline_results``: both values and the
    difference (new minus baseline), each formatted to two decimals.
    Nothing is returned.
    """
    for metric in baseline_results:
        baseline_value = baseline_results[metric]
        new_value = new_model_results[metric]
        delta = new_value - baseline_value
        print(f"Baseline {metric}: {baseline_value:.2f}, New {metric}: {new_value:.2f}, Difference: {delta:.2f}")

# Print model1's metrics next to the baseline's.
compare_baseline_to_new_results(baseline_results=baseline_result,
                                new_model_results=model1_results)

In [None]:
# Re-read the vocabulary (same call as earlier); the export loop below iterates over it.
words_in_vocab=text_vectorizer.get_vocabulary()
len(words_in_vocab),words_in_vocab[:10]

In [None]:
# Show model1's layers again before exporting its embedding weights.
model1.summary()

In [None]:
# Embedding matrix of model1; the export loop below indexes it by vocabulary position.
embed_weights=model1.get_layer("embeddings_1").get_weights()[0]
embed_weights.shape

In [None]:
import io
# Export the learned embeddings as two TSV files: vectors.tsv holds one tab-separated
# vector per line, metadata.tsv holds the matching word on the same line number.
# The `with` block guarantees both files are flushed and closed even if the loop raises.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(words_in_vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = embed_weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")

In [None]:
from keras.api.layers import LSTM
import tensorflow as tf

# Model 2: string -> token ids -> embeddings -> LSTM -> sigmoid.
tf.random.set_seed(42)
# A fresh embedding layer so this model does not reuse weights trained by another model.
# `input_length` is not passed: Keras 3 deprecates the argument.
model_2_embedding = Embedding(input_dim=max_vocab_length, output_dim=128, embeddings_initializer="uniform",
                              name="embeddings_2")

# Named `inputs` (not `input`) so the built-in input() function is not shadowed.
inputs=Input(shape=(1,), dtype="string")
x=text_vectorizer(inputs)
x=model_2_embedding(x)
print(x.shape)  # shape after the embedding layer

x=LSTM(64)(x)  # 64 units, default settings
print(x.shape)  # shape after the LSTM

outputs=Dense(1, activation="sigmoid")(x)
model2=Model(inputs, outputs, name="model2_LSTM")

In [None]:
# Same loss, optimizer and metric as model1 so results are comparable.
model2.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [None]:
# Layer-by-layer overview of model2.
model2.summary()

In [None]:
# Train for 5 epochs with validation after each epoch; the callback is created under the "LSTM" experiment name.
model2_history = model2.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                            callbacks=[create_tensorboard_callback(SAVE_DIR, "LSTM")], verbose=2)

In [None]:
# Predicted probabilities on the validation split: shape plus the first 10 values.
model_2_pred_probs=model2.predict(val_sentences)
model_2_pred_probs.shape,model_2_pred_probs[:10]

In [None]:
# Round probabilities to 0/1 labels and remove the size-1 dimensions.
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

In [None]:
# Metrics for model2's validation predictions.
model_2_results = calculate_result(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

In [None]:
# Print model2's metrics next to the baseline's.
compare_baseline_to_new_results(baseline_result, model_2_results)

In [None]:
from keras.api.layers import GRU
#Model GRU
# Model 3: string -> token ids -> embeddings -> GRU -> sigmoid.
tf.random.set_seed(42)
# Fresh embedding layer for this model. `input_length` is not passed: Keras 3
# deprecates the argument.
model_3_embedding = Embedding(input_dim=max_vocab_length, output_dim=128, embeddings_initializer="uniform",
                              name="embeddings_3")
# Named `inputs` (not `input`) so the built-in input() function is not shadowed.
inputs=Input(shape=(1,), dtype="string")
x=text_vectorizer(inputs)
x=model_3_embedding(x)
x=GRU(64)(x)  # 64 units, default settings
outputs=Dense(1, activation="sigmoid")(x)
model3=Model(inputs, outputs, name="model3_GRU")

In [None]:
# Same loss, optimizer and metric as the previous models.
model3.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])

In [None]:
# Layer-by-layer overview of model3.
model3.summary()

In [None]:
# Train for 5 epochs with validation after each epoch (no TensorBoard callback here).
model3_history = model3.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                            verbose=2)

In [None]:
# Predicted probabilities on the validation split: shape plus the first 10 values.
model3_pred_probs=model3.predict(val_sentences)
model3_pred_probs.shape,model3_pred_probs[:10]

In [None]:
# Round probabilities to 0/1 labels and remove the size-1 dimensions.
model3_preds = tf.squeeze(tf.round(model3_pred_probs))
model3_preds[:10]

In [None]:
# Metrics for model3's validation predictions.
model3_results = calculate_result(y_true=val_labels, y_pred=model3_preds)
model3_results

In [None]:
# Print model3's metrics next to the baseline's.
compare_baseline_to_new_results(baseline_result, model3_results)

In [None]:
#bidirectional RNN model

tf.random.set_seed(42)
from keras.api.layers import Bidirectional

# Model 4: string -> token ids -> embeddings -> bidirectional LSTM -> sigmoid.
# Fresh embedding layer for this model. `input_length` is not passed: Keras 3
# deprecates the argument.
model4_embedding = Embedding(input_dim=max_vocab_length, output_dim=128, embeddings_initializer="uniform",
                             name="embeddings_4")
# Named `inputs` (not `input`) so the built-in input() function is not shadowed.
inputs=Input(shape=(1,), dtype="string")
x=text_vectorizer(inputs)
x=model4_embedding(x)
x=Bidirectional(LSTM(64))(x)  # wraps a 64-unit LSTM so the sequence is read in both directions
outputs=Dense(1, activation="sigmoid")(x)
model4=Model(inputs, outputs, name="model4_bidirectional")

In [None]:
# Same loss, optimizer and metric as the previous models, then show the architecture.
model4.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model4.summary()

In [None]:
# Train for 5 epochs with validation after each epoch (no TensorBoard callback here).
model4_history = model4.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                            verbose=2)

In [None]:
# Predicted probabilities on the validation split: shape plus the first 10 values.
model4_pred_probs=model4.predict(val_sentences)
model4_pred_probs.shape,model4_pred_probs[:10]

In [None]:
# Round probabilities to 0/1 labels and remove the size-1 dimensions.
model4_preds = tf.squeeze(tf.round(model4_pred_probs))
model4_preds[:10]

In [None]:
# Metrics for model4's validation predictions.
model4_results = calculate_result(y_true=val_labels, y_pred=model4_preds)
model4_results

In [None]:
# Print model4's metrics next to the baseline's.
compare_baseline_to_new_results(baseline_result, model4_results)

### Using CNN for TEXT

In [None]:
from keras.api.layers import Conv1D, GlobalMaxPooling1D

# Walk one sentence through embedding -> Conv1D -> global max pooling and compare shapes.
# Note: `embeddings` is the layer used (and trained) by model1.
embedding_test = embeddings(text_vectorizer(["this is a test sentence"]))
conv_1d = Conv1D(filters=32, kernel_size=5, activation="relu")  # 32 filters, each spanning 5 tokens
conv_1d_output = conv_1d(embedding_test)
max_pool = GlobalMaxPooling1D()  # keeps the maximum over the sequence axis for each filter
max_pool_output = max_pool(conv_1d_output)
embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

In [None]:
# Inspect the actual tensor values at each of the three stages.
embedding_test[:1], conv_1d_output[:1], max_pool_output[:1]

In [None]:
# Model 5: string -> token ids -> embeddings -> Conv1D -> global average pooling -> sigmoid.
tf.random.set_seed(42)
# Fresh embedding layer for this model. `input_length` is not passed: Keras 3
# deprecates the argument.
model5_embedding = Embedding(input_dim=max_vocab_length, output_dim=128, embeddings_initializer="uniform",
                             name="embeddings_5")
# Named `inputs` (not `input`) so the built-in input() function is not shadowed.
inputs=Input(shape=(1,), dtype="string")
x=text_vectorizer(inputs)
x=model5_embedding(x)
x=Conv1D(filters=32, kernel_size=5, activation="relu")(x)
# NOTE(review): the Conv1D demo cell above pools with GlobalMaxPooling1D, while this
# model uses GlobalAveragePooling1D; confirm average pooling is intended here.
x=GlobalAveragePooling1D()(x)
outputs=Dense(1, activation="sigmoid")(x)
model5=Model(inputs, outputs, name="model5")
model5.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model5.summary()

In [None]:
# Train for 5 epochs with validation after each epoch (no TensorBoard callback here).
model5_history = model5.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                            verbose=2)

In [None]:
# Predicted probabilities on the validation split; show the first 10.
model5_pred_probs=model5.predict(val_sentences)
model5_pred_probs[:10]

In [None]:
# Convert model5 prediction probabilities to 0/1 labels and remove the size-1 dimensions.
model_5_preds = tf.squeeze(tf.round(model5_pred_probs))
model_5_preds[:10]

In [None]:
# Metrics for model5's validation predictions.
model_5_results = calculate_result(y_true=val_labels, y_pred=model_5_preds)
model_5_results

In [None]:
# Print model5's metrics next to the baseline's.
compare_baseline_to_new_results(baseline_result, model_5_results)

In [None]:
# Gather every model's metric dict into one table: the dict keys become columns,
# then .T flips the frame so each row is a model and each column a metric.
all_model_results = pd.DataFrame(
    {
        "baseline": baseline_result,
        "model1": model1_results,
        "model2": model_2_results,
        "model3": model3_results,
        "model4": model4_results,
        "model5": model_5_results,
    }
).T
all_model_results

In [2]:
import kagglehub

# Download latest version
# NOTE(review): none of the cells visible above use this dataset or `path`; confirm
# this cell belongs in this notebook.
path = kagglehub.dataset_download("uciml/german-credit")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/german-credit?dataset_version_number=1...


100%|██████████| 10.9k/10.9k [00:00<00:00, 3.82MB/s]

Extracting files...
Path to dataset files: C:\Users\anmol\.cache\kagglehub\datasets\uciml\german-credit\versions\1



