<a href="https://www.kaggle.com/code/kelixirr/natural-language-processing-with-disaster-tweets?scriptVersionId=145189618" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.target.value_counts()

In [None]:
len(train_df) + len(test_df)

In [None]:
# shuffle the training data 
train_data = train_df.sample(frac =1, random_state = 42)
train_data.head()

In [None]:
# visualize a random window of five consecutive rows from the shuffled data
import random

# the bound is derived from the frame that is actually sliced below
random_index = random.randint(0, len(train_data) - 5)

for row in train_data[["text", "target"]][random_index:random_index + 5].itertuples():
    # itertuples yields (index, text, target); the index is not needed here
    _, text, target = row
    print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
    print(f"Text: \n {text} \n")
    print("...\n")
    

In [None]:
# split the data into training and validation sets
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_data["text"].to_numpy(),
                                                                           train_data["target"].to_numpy(),
                                                                           test_size =0.1,
                                                                           random_state = 42)

In [None]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

In [None]:
train_sentences[:10]

In [None]:
train_labels[:10]

## Text Vectorization

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
# average number of words or tokens in training data
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

In [None]:
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    output_mode = "int",
                                    output_sequence_length = 15)

In [None]:
# fit the text vectorizer to data
text_vectorizer.adapt(train_sentences)

In [None]:
# checking for the text vectorizer 
random_sentence = random.choice(train_sentences)
print(f"original {random_sentence}")
print(f"Vectorized: {text_vectorizer([random_sentence])}")

In [None]:
# unique words in our dictionary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5 = words_in_vocab[:5]
bottom_5 = words_in_vocab[-5:]

print("Vocab", len(words_in_vocab))
print("Top 5 words:", top_5)
print("Bottom 5 words:", bottom_5)

## Creating Embeddings

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = 10000,
                            output_dim = 128,
                            embeddings_initializer = "uniform",
                            input_length = 15,
                            name = "embedding_1")
embedding

In [None]:
# example from sample data: raw text -> token ids -> embedding vectors

random_sentence = random.choice(train_sentences)
# vectorize once and reuse the result for both the printout and the embedding
vectorized_sentence = text_vectorizer([random_sentence])
print(f"original {random_sentence}")
print(f"Vectorized: {vectorized_sentence}")
print(f"Embedded: {embedding(vectorized_sentence)}")

## Creating Our Models

### 0. Naive Bayes - Baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# create the tokenization and modelling pipeline
model_0 = Pipeline([
    
    ("tfdf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels)

In [None]:
baseline_score = model_0.score(val_sentences, val_labels)
baseline_score

In [None]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

#### Creating Evaluation functions for our future use

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        dict with the keys ``"accurracy"`` (accuracy scaled to a 0-100
        percentage), ``"precision"``, ``"recall"`` and ``"f1"`` (the
        weighted-average values from ``precision_recall_fscore_support``).
        The ``"accurracy"`` spelling is kept because other cells index the
        results by that exact key.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100

    # the fourth element (support) is not reported
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    return {
        "accurracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

### Model 1 - Simple Dense

In [None]:
# importing helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

In [None]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [None]:
# directory name passed to the TensorBoard callback for saving logs
SAVE_DIR = "model_logs"

In [None]:
# building the model 
inputs = layers.Input(shape=(1,), dtype = "string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name = "model_1_choice")

In [None]:
# compile the model
model_1.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [None]:
model_1.summary()

In [None]:
# fit the model 
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR, 
                                                                     experiment_name="simple_dense_model")])

In [None]:
model_1.evaluate(val_sentences, val_labels)

In [None]:
embedding.weights

In [None]:
embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
print(embed_weights.shape)

In [None]:
# make predictions
model_1_preds = tf.squeeze(tf.round(model_1.predict(val_sentences)))
model_1_preds[:10]

In [None]:
# model_1 metrics 
model_1_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_preds)

model_1_results

In [None]:
# comparing results of baseline with new models 

def compare_with_baseline(baseline_results, new_model_results):
    """Print each baseline metric next to the new model's value and their difference.

    Args:
        baseline_results: mapping of metric name -> baseline value.
        new_model_results: mapping with the same metric names for the new model.

    Every key of ``baseline_results`` is looked up in ``new_model_results``;
    the difference is reported as new minus baseline, to two decimals.
    """
    for metric in baseline_results:
        base_value = baseline_results[metric]
        new_value = new_model_results[metric]
        delta = new_value - base_value
        print(f"Baseline {metric}: {base_value:.2f}, New {metric}: {new_value:.2f}, Difference: {delta:.2f}")


In [None]:
compare_with_baseline(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

### Model 2 - LSTM

In [None]:
tf.random.set_seed(42)

model_2_embedding = layers.Embedding(input_dim = 10000,
                                     output_dim = 128,
                                     embeddings_initializer = "uniform",
                                     input_length = 15,
                                     name = "embedding_2")

# LSTM Model
inputs = layers.Input(shape = (1,), dtype = "string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
outputs = layers.Dense(1, activation = "sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs , name = "model_2_LSTM")

In [None]:
# compile the model 

model_2.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

model_2.summary()

In [None]:
# fit the model 
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR, 
                                                                     "LSTM")])

In [None]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs.shape, model_2_pred_probs[:10]


In [None]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

In [None]:
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

In [None]:
compare_with_baseline(baseline_results, model_2_results)

### Model 3 - GRU

In [None]:
model_3_embedding = layers.Embedding(input_dim = 10000,
                                     output_dim = 128,
                                     embeddings_initializer = "uniform",
                                     input_length = 15,
                                     name = "embedding_3")

# model 
inputs = layers.Input(shape = (1,), dtype = 'string')
x = text_vectorizer(inputs)
x = model_3_embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)

model_3 = tf.keras.Model(inputs, outputs, name = "model_3_GRU")

In [None]:
model_3.compile( loss = "binary_crossentropy",
                 optimizer = tf.keras.optimizers.Adam(),
                 metrics = ["accuracy"])

In [None]:
model_3.summary()

In [None]:
model_3_history = model_3.fit(train_sentences,
                              train_labels, 
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "GRU")])

In [None]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs.shape, model_3_pred_probs[:10]

In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

In [None]:
model_3_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_3_preds)
model_3_results

In [None]:
compare_with_baseline(baseline_results, model_3_results)

### Model 4 - Bidirectional RNN

In [None]:
model_4_embedding = layers.Embedding(input_dim = 10000,
                                     output_dim = 128,
                                     embeddings_initializer="uniform",
                                     input_length = 15,
                                     name = "embedding_4")

# model

inputs = layers.Input(shape=(1,), dtype = "string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)

model_4 = tf.keras.Model(inputs, outputs, name = "model_4_Bidrectional")
model_4.summary()

In [None]:
model_4.compile( loss = "binary_crossentropy",
                 optimizer = tf.keras.optimizers.Adam(),
                 metrics = ["accuracy"])

In [None]:
model_4_history = model_4.fit(train_sentences,
                              train_labels, 
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "bidirectional_RNN")])

In [None]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

In [None]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

In [None]:
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results

In [None]:
compare_with_baseline(baseline_results, model_4_results)

### Model 5 - Conv1D

In [None]:
model_5_embedding = layers.Embedding(input_dim = 10000,
                                     output_dim = 128,
                                     embeddings_initializer = "uniform",
                                     input_length = 15,
                                     name = "embedding_5")


# model 
inputs = layers.Input(shape=(1,), dtype = "string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters = 32, kernel_size = 5, activation = "relu")(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)

model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")
model_5.summary()


In [None]:
model_5.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [None]:
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR, 
                                                                     "Conv1D")])

In [None]:
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]

In [None]:
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

In [None]:
model_5_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_5_preds)
model_5_results

In [None]:
compare_with_baseline(baseline_results, model_5_results)

### Model 6: TensorFlow Hub Pretrained Sentence Encoder

In [None]:
import tensorflow_hub as hub

In [None]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="USE")

In [None]:
# Frozen sentence-encoder layer -> small dense head -> single probability output
model_6 = tf.keras.Sequential([

    sentence_encoder_layer,
    layers.Dense(64, activation = "relu"),
    # sigmoid (not relu) on the output: binary_crossentropy expects a
    # probability in [0, 1], and the later tf.round() on the predictions
    # only acts as a 0.5 threshold when the output is bounded that way
    layers.Dense(1, activation = "sigmoid"),
], name = 'model_6')

model_6.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

model_6.summary()

In [None]:
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR, 
                                                                     "tf_hub_sentence_encoder")])

In [None]:
model_6_pred_probs = model_6.predict(val_sentences)
model_6_pred_probs[:10]

In [None]:
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))
model_6_preds[:10]

In [None]:
model_6_results = calculate_results(val_labels, model_6_preds)
model_6_results

In [None]:
compare_with_baseline(baseline_results, model_6_results)

### Model 7: TensorFlow Hub Pretrained Sentence Encoder on 10% of the training data

In [None]:
# carve a 10% subset (train_10) out of the training split
import numpy as np
train_90, train_10, train_labels_90, train_labels_10 = train_test_split(np.array(train_sentences),
                                                                        train_labels,
                                                                        test_size=0.1,
                                                                        random_state=42)
   
                                           

In [None]:
len(train_sentences), len(train_10)

In [None]:
pd.Series(train_labels_10).value_counts()

In [None]:
# model 7: same architecture as model_6, rebuilt with clone_model and compiled
# on its own so it can be fit on the 10% subset (train_10) in a later cell
# NOTE(review): clone_model is expected to create new layers rather than reuse
# model_6's trained Dense weights — confirm against the Keras docs
model_7 = tf.keras.models.clone_model(model_6)

# a cloned model carries no compile state, so loss/optimizer/metrics are set again
model_7.compile(loss = "binary_crossentropy",
                optimizer = "Adam",
                metrics = ["accuracy"])

model_7.summary()

In [None]:
model_7_history = model_7.fit(x=train_10,
                              y=train_labels_10,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "10_percent_tf_hub_sentence_encoder")])

In [None]:
model_7_pred_probs = model_7.predict(val_sentences)
model_7_pred_probs[:10]

In [None]:
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_preds[:10]

In [None]:
model_7_results = calculate_results(val_labels, model_7_preds)
model_7_results

In [None]:
compare_with_baseline(baseline_results, model_7_results)

## Comparing Results Of All Models

In [None]:
all_model_results = pd.DataFrame({
    "Baseline": baseline_results,
    'Simple Dense': model_1_results,
    "LSTM": model_2_results,
    "GRU": model_3_results,
    "Bidirectional": model_4_results,
    "Conv1D": model_5_results,
    "TF Sentence Encoder": model_6_results,
    "TF 10 Percent Data": model_7_results,
})

all_model_results = all_model_results.transpose()
all_model_results

In [None]:
all_model_results["accurracy"] = all_model_results["accurracy"]/100

In [None]:
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));

In [None]:
all_model_results.sort_values("f1", ascending=False)["f1"].plot(kind="bar", figsize=(10, 7));

## Model Ensembling 

In [None]:
# Probability of the positive class from the baseline. predict_proba returns
# one column per class in classes_ order, so column 1 is P(target == 1).
# (Taking the row-wise max instead would give the winning class's confidence,
# which is always >= 0.5 and would push the average towards class 1.)
baseline_pred_probs = model_0.predict_proba(val_sentences)[:, 1]

# sum the positive-class probabilities of the three models
combined_pred_probs = baseline_pred_probs + tf.squeeze(model_2_pred_probs, axis=1) + tf.squeeze(model_6_pred_probs)

# average the three probabilities and threshold at 0.5 via rounding
combined_preds = tf.round(combined_pred_probs/3)
combined_preds[:20]

In [None]:
ensemble_results = calculate_results(val_labels, combined_preds)
ensemble_results

In [None]:
ensemble_results["accurracy"] = ensemble_results["accurracy"] / 100
ensemble_results["accurracy"] 

In [None]:
all_model_results.loc["Ensemble Results"] = ensemble_results

In [None]:
all_model_results

In [None]:
val_df = pd.DataFrame({"text": val_sentences,
                       "target": val_labels,
                       "pred": model_7_preds,
                       "pred_prob": tf.squeeze(model_7_pred_probs)})
val_df.head()

In [None]:
# wrong predictions, sorted by predicted probability (highest first)
most_wrong = val_df[val_df["target"] != val_df["pred"]].sort_values("pred_prob", ascending=False)
most_wrong[:10]

In [None]:
# most wrong false positives (top of the sorted list)
for row in most_wrong[:10].itertuples():
    _, text, target, pred, prob = row
    print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}")
    print(f"Text:\n{text}\n")
    print("----\n")

In [None]:
# false negatives
for row in most_wrong[-10:].itertuples():
    _, text, target, pred, prob = row
    print(f"Target: {target}, Pred: {int(pred)}, Prob: {prob}")
    print(f"Text:\n{text}\n")
    print("----\n")

## Making Prediction On Test Data

In [None]:
test_df.head()

In [None]:
# making prediction on test data
test_sentences = test_df["text"].to_list()
test_samples = random.sample(test_sentences, 10)

for test_sample in test_samples:
    pred_prob = tf.squeeze(model_6.predict([test_sample]))
    pred = tf.round(pred_prob)
    print(f"Pred: {int(pred)}, Prob: {pred_prob}")
    print(f"Text:\n{test_sample}\n")
    print("----\n")
  

In [None]:
test_df.head()

In [None]:
pred_prob = tf.squeeze(model_6.predict(test_df["text"].to_numpy()))
pred = tf.round(pred_prob)
pred_prob[:10]

In [None]:
pred[:10]

In [None]:
test_data = test_df[:]

In [None]:
# build the submission frame: one row per test id with the predicted label
submission = test_df[['id']].reset_index(drop=True)
# convert the tensor to a NumPy array explicitly so the column is a plain
# int64 column, independent of how pandas coerces tensor objects
submission['target'] = tf.cast(pred, tf.int64).numpy()
submission

In [None]:
submission.to_csv('submission.csv', index=False)