In [1]:
import polars as pl 
import pandas as pd 
import numpy as np 
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import emoji 
import re 
import string

from pathlib import Path

In [2]:
pio.templates.default = "plotly_white"

# <b>1 <span style='color:#c93e22'>|</span> Introduction</b> 

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>1.1 |</span></b> Project description </b></p>
</div>

This is a binary text classification problem. The goal is to identify tweets that are related to a distaster. The positive `1` tweets are messages about disasters and the negatives `0` are other type of messages.

The goal of this project is to create a RNN architecture to detect positive tweets.

Find this project on [github](https://github.com/huwilerb/Disaster-lstm)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>1.2 |</span></b> Data description </b></p>
</div>

Here are some general information about the data and the datasets. First we define the paths and folders to the data and files, and load the labels and ids into polars dataframes.

In [3]:
input_path = Path('/kaggle/input/nlp-getting-started/')
train_file = input_path.joinpath('train.csv')
test_file = input_path.joinpath('test.csv')

train_df = pl.read_csv(train_file)
test_df = pl.read_csv(test_file)

In [4]:
train_df.describe()

statistic,id,keyword,location,text,target
str,f64,str,str,str,f64
"""count""",7613.0,"""7552""","""5080""","""7613""",7613.0
"""null_count""",0.0,"""61""","""2533""","""0""",0.0
"""mean""",5441.934848,,,,0.42966
"""std""",3137.11609,,,,0.49506
"""min""",1.0,"""ablaze""",""" ""","""! Residents Re…",0.0
"""25%""",2734.0,,,,0.0
"""50%""",5408.0,,,,0.0
"""75%""",8146.0,,,,1.0
"""max""",10873.0,"""wrecked""","""åø\_(?)_/åø""","""åÈMGN-AFRICAå¨…",1.0


In [5]:
print(f'The training dataset contains {train_df.shape[0]} rows and {train_df.shape[1]} columns.')
print(f'The test dataset contains {test_df.shape[0]} rows and {test_df.shape[1]} columns.')

The training dataset contains 7613 rows and 5 columns.
The test dataset contains 3263 rows and 4 columns.


# <b>2 <span style='color:#c93e22'>|</span> EDA and data cleaning</b> 


In [6]:
import subprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

[nltk_data] Downloading package wordnet to /kaggle/working/...
Archive:  /kaggle/working/corpora/wordnet.zip
   creating: /kaggle/working/corpora/wordnet/
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/corpora/wordnet/noun.exc  
  inflating: /kaggle/working/corpora/wordnet/verb.exc  
  inflating: /kaggle/working/corpora/wordnet/README  
  inflating: /kaggle/working/corpora/wordnet/index.sense  
  inflating: /kaggle/working/corpora/wordnet/data.


<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>2.1 |</span></b> EDA </b></p>
</div>

### Basic info
Let's take a look the the null values count first.

In [7]:
plot_df = train_df.select(pl.all().is_null().sum()).transpose(include_header=True).rename({'column_0': 'null counts'})

px.bar(plot_df, x='column', y='null counts', title='Null counts')

There are 61 nulls for the keywords and 2533 for the location

We can also check the balance between positive and negative classes: 

In [8]:
plot_df = train_df.group_by(pl.col('target')).agg(pl.len().alias('Counts'))
plot_df = plot_df.with_columns(pl.col('target').cast(pl.Utf8))
px.bar(plot_df, x='target', y='Counts', title="Target counts")

### Word frequency

In [9]:
text_df = (
    train_df
    .select(pl.col('text'))
    .with_columns(
        pl.col('text').str.split(' ').alias('Word')
    ).explode('Word')
    .group_by(pl.col('Word'))
    .agg(pl.len().alias('Counts'))
    .with_columns(
        (pl.col('Counts')/pl.len()).alias("Frequency")
    )
    .sort('Counts', descending=True)
)
text_df.head(10)

Word,Counts,Frequency
str,u32,f64
"""the""",2573,0.080364
"""a""",1840,0.057469
"""to""",1804,0.056345
"""in""",1757,0.054877
"""of""",1721,0.053753
"""and""",1301,0.040635
"""I""",1186,0.037043
"""for""",820,0.025611
"""is""",814,0.025424
"""on""",773,0.024143


In [10]:
text_df.tail(10)

Word,Counts,Frequency
str,u32,f64
"""http://t.co/Gu…",1,3.1e-05
"""Nixon""",1,3.1e-05
"""http://t.co/GJ…",1,3.1e-05
"""Lyf""",1,3.1e-05
"""$40Mln""",1,3.1e-05
"""LiveLeak""",1,3.1e-05
"""http://t.co/AF…",1,3.1e-05
"""FLECHADAS""",1,3.1e-05
"""Dix""",1,3.1e-05
"""http://t.co/4k…",1,3.1e-05


In [11]:
counts = text_df.filter(pl.col('Counts') > 1)['Counts'].to_list() 
px.histogram(counts, nbins=500, title="Word counts histogram")

We see here that the most of the words are present only a few times or only once

In [12]:
counts_df = (
    text_df
    .group_by(pl.col('Counts'))
    .agg(pl.len())
    .sort('len', descending=True)
)
counts_df

Counts,len
u32,u32
1,23425
2,3300
3,1442
4,828
5,507
…,…
82,1
78,1
77,1
76,1


### Urls in text
Tweets often contains urls, let's check if we have some urls in the texts:

In [13]:
urls = (
    train_df
    .filter(pl.col('text').str.contains(r'https?\S+'))
    .select('text')
    
)['text'].to_list()

for t in urls[:30]:
    print(t)
    
print(f'\n There are {len(urls)} messages containing http or https urls')

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw
#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi
On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N
INEC Office in Abia Set Ablaze - http://t.co/3ImaomknnA
Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J
Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw
I wanted to set Chicago ablaze with my preaching... But not my hotel! http://t.co/o9qknbfOFX
I gained 3 followers in the last week. You? Know your stats and grow with http://t.co/TIyUliF5c6
How the West was burned: Thousands of wildfires ablaze in California alone http://t.co/vl5TBR3wbr
Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw
D

We will need to clean this 

### hastags
Let's check the hashtags in the tweets

In [14]:
n = 30
plot_df = (
    train_df
    .with_columns(
        pl.col('text').str.extract_all(r'(#\w+)')
    )
    .filter(pl.col('text').list.len() > 0)
    .explode('text')
    .with_columns(
        pl.col('text').str.to_lowercase(), 
        pl.col('target').cast(pl.Utf8)
    )
    .group_by(['text', 'target'])
    .agg(pl.len().alias('count'))
    .group_by('text')
    .agg(pl.col('count'), pl.col('target'))
    .with_columns(pl.col('count').list.sum().alias('tot'))
    .explode(['count', 'target'])
    .sort('tot')
).tail(n)

fig = go.Figure()
for target in map(str, range(2)):
    x = plot_df.filter(pl.col('target') == target)['text'].to_list()[::-1]
    y = plot_df.filter(pl.col('target') == target)['count'].to_list()[::-1]
    fig.add_trace(go.Bar(
        x=x, 
        y=y, 
        name=target
    ))

title = f'Top {n} hastags per target'
fig.update_layout(barmode='stack', 
                  title=title, 
                  xaxis_title='hashtag',
                  yaxis_title='Count')
fig.show()


### Tweets location 

In [15]:
n = 30
plot_df = (
    train_df
    .filter(~pl.col('location').is_null())
    .group_by(pl.col('location'))
    .agg(pl.len().alias('Count'))
    .sort('Count', descending=True)
    .head(n)
)

px.bar(plot_df, x='location', y='Count', title=f'Top {n} locations', barmode='stack')

In [16]:
n = 30
plot_df = (
    train_df
    .filter(~pl.col('location').is_null())
    .with_columns(pl.col('target').cast(pl.Utf8))
    .group_by(['location', 'target'])
    .agg(pl.len().alias('Count'), )
    .sort('Count', descending=True)
    .head(n)
)
plot_df
px.bar(plot_df, x='location', y='Count', title=f'Top {n} locations', barmode='stack', color='target')





Some of the locations has only negative messages, some other only positives and a part of them have both positive and negative locations. Let's take a deeper look into the ration pos/neg for each locations

In [17]:
plot_df = (
    train_df
    .filter( ~pl.col('location').is_null())
    .group_by('location')
    .agg(
        [
            pl.col('target').filter(pl.col('target') == 1).len().alias('pos'),
            pl.col('target').len().alias('tot'),
            
        ]
    )
    .with_columns(
        (pl.col('pos') / pl.col('tot')).alias('Positive proportion')
    )
    .sort(['Positive proportion', 'tot'], descending=True)
)
px.histogram(plot_df, x='Positive proportion', title='Proportion of positive rates by location', nbins=100)

The Distribution of the proportion of positives rates by location are almost all fully negatives or fully positives. This means that for a given location there is a high probability that all the tweets have the same target is high. 

For the model, we won't use any other feature then `text`

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>2.2 |</span></b> Data Cleaning </b></p>
</div>



First I just create the functions I need and then I will pipe them to pre process the text

In [18]:
def backup_col(df):
    """
    backup the original text in a new column
    """
    df = (
        df
        .with_columns(pl.col('text').alias('text_bkp'))
        )
    return df

def remove_urls(df):
    """
    remove url from text
    """
    df = (
        df
        .with_columns(
            pl.col('text').str.replace_all(r'https?://\S+|www\.\S+', '')
        )
    )
    return df

def remove_html(df):
    """
    remove html tags
    """
    df = (
        df
        .with_columns(
            pl.col('text').str.replace_all(r'<[^>]*>', '')
        )
        .with_columns(
            pl.col('text').str.replace_all('&amp;', '')
        )
    )
    return df

def remove_mentions(df):
    """
    remove mentions such at @elonmusk
    """
    df = (
        df
        .with_columns(
            pl.col('text').str.replace_all(r'@\S+', '')
        )
    )
    return df 

def remove_emojis(df):
    """
    remove emojis
    """
    def remove_emojis(text):
        pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"  
                                u"\U0001F300-\U0001F5FF"  
                                u"\U0001F680-\U0001F6FF"  
                                u"\U0001F1E0-\U0001F1FF"  
                                u"\U00002500-\U00002BEF"  
                                u"\U00002702-\U000027B0"
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                u"\U0001f926-\U0001f937"
                                u"\U00010000-\U0010ffff"
                                u"\u2640-\u2642"
                                u"\u2600-\u2B55"
                                u"\u200d"
                                u"\u23cf"
                                u"\u23e9"
                                u"\u231a"
                                u"\ufe0f"  
                                u"\u3030"
                                "]+", flags=re.UNICODE)
        
    
        cleaned = pattern.sub(r'', text)
        return cleaned
        
    df = (
        df
        .with_columns(
            pl.col('text').map_elements(lambda x: remove_emojis(x), return_dtype=pl.Utf8)
        )
    )
        
    return df

def remove_punctuation(df):
    """
    Remove all punctuation 
    """
    sc = list(map(str, string.punctuation))
    df = (
        df
        .with_columns(
            pl.col('text').str.replace_many(sc, '')
        )
    )
    return df

def lower_text(df):
    """
    lower the text
    """
    df = (
        df
        .with_columns(
            pl.col('text').str.to_lowercase()
        )
    )
    return df



testing the cleaning

In [19]:
rows = [
    {"text": ".,;:\"'?!Hello, world! This is a sentence with punctuation marks.,;:\"'?!"},
    {"text": "<http://www.example.com> You can visit this website for more information: <http://www.example.com>"},
    {"text": "<https://secure.example.com> Or you can use this secure link instead: <https://secure.example.com>"},
    {"text": "<html><body><h1>This is an HTML document</h1></body></html>"},
    {"text": "Have a great day! 😔😔"}, 
    {"text": "You are crazy @elonmusk"},
    {"text": "Size do NOT maTTer"}
]

# Create a Polars DataFrame from the list of rows
df = pl.DataFrame(rows)

(
    df 
    .pipe(backup_col, )
    .pipe(remove_urls, )
    .pipe(remove_html, )
    .pipe(remove_emojis, )
    .pipe(remove_mentions, )
    .pipe(lower_text, )
    .pipe(remove_punctuation, )
).to_pandas()

Unnamed: 0,text,text_bkp
0,hello world this is a sentence with punctuatio...,".,;:""'?!Hello, world! This is a sentence with ..."
1,you can visit this website for more information,<http://www.example.com> You can visit this we...
2,or you can use this secure link instead,<https://secure.example.com> Or you can use th...
3,this is an html document,<html><body><h1>This is an HTML document</h1><...
4,have a great day,Have a great day! 😔😔
5,you are crazy,You are crazy @elonmusk
6,size do not matter,Size do NOT maTTer


In [20]:
train_df = (
    train_df
    .pipe(backup_col, )
    .pipe(remove_urls, )
    .pipe(remove_html, )
    .pipe(remove_emojis, )
    .pipe(remove_mentions, )
    .pipe(lower_text, )
    .pipe(remove_punctuation, )
)

test_df = (
    test_df
    .pipe(remove_urls, )
    .pipe(remove_html, )
    .pipe(remove_emojis, )
    .pipe(remove_mentions, )
    .pipe(lower_text, )
    .pipe(remove_punctuation, )
)

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>2.3 |</span></b> Data Preprocessing </b></p>
</div>

In [21]:
stops = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def process_text(text, tokenizer=tokenizer, lemmatizer=lemmatizer, min_token_len=1):
    tokens = tokenizer.tokenize(text)
    tokens = filter(lambda x: x not in stops, tokens)
    tokens = filter(lambda x: not(x.isdigit() or x.replace('.', '').isnumeric() or x.replace(',', '').isnumeric()), tokens)
    tokens = map(lemmatizer.lemmatize, tokens)
    tokens = map(lambda x: x.strip(), tokens)
    tokens = filter(lambda x: len(x) > min_token_len, tokens)
    return " ".join(tokens)

train_df = (
    train_df
    .with_columns(
        pl.col('text').map_elements(process_text, return_dtype=pl.Utf8)
    )
)
test_df = (
    test_df
    .with_columns(
        pl.col('text').map_elements(process_text, return_dtype=pl.Utf8)
    )
)

train_df.select(['text', 'text_bkp']).head().to_pandas()

Unnamed: 0,text,text_bkp
0,deed reason earthquake may allah forgive,Our Deeds are the Reason of this #earthquake M...
1,forest fire near la ronge sask canada,Forest fire near La Ronge Sask. Canada
2,resident asked shelter place notified officer ...,All residents asked to 'shelter in place' are ...
3,people receive wildfire evacuation order calif...,"13,000 people receive #wildfires evacuation or..."
4,got sent photo ruby alaska smoke wildfire pour...,Just got sent this photo from Ruby #Alaska as ...


# <b>3 <span style='color:#c93e22'>|</span> Model</b> 

In [22]:
from plotly.subplots import make_subplots

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional, Embedding
from tensorflow.keras.layers import (BatchNormalization, 
                          Dense, 
                          Activation, 
                          Dropout, 
                          Conv1D, 
                          GlobalMaxPool1D, 
                          MaxPooling1D, 
                          Flatten, 
                          SpatialDropout1D)
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

2024-04-13 21:03:15.833463: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 21:03:15.833585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 21:03:15.979502: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [23]:
stop=set(stopwords.words('english'))

In [24]:
def plot_convergence(history, title="Training stats"):
    """
    Plot the training stats 
    """
    hist = history.history
    
    metrics = list(hist.keys())
    train_metrics = list(filter(lambda x: x[:4] != 'val_', metrics))
    epochs = list(range(1, len(hist[metrics[0]]) + 1))
    
    fig = make_subplots(
        rows=1, 
        cols=len(train_metrics), 
        subplot_titles=train_metrics,     
    )    
    
    markers = ['triangle-up-open', 'circle-open']
    colors = ['red', 'blue']
    names = ['training', 'validation']
    
    for idx, metric in enumerate(train_metrics):
        show_legend = False
        if not idx: 
            show_legend = True
        for idy, data in enumerate([metric, 'val_' + metric]):
            y = hist.get(data)
            fig.add_trace(
                go.Scatter(
                    x=epochs, 
                    y=y, 
                    mode='lines+markers', 
                    marker=dict(
                        symbol=markers[idy], 
                        color=colors[idy], 
                        size=8
                    ), 
                    name=names[idy], 
                    showlegend=show_legend
                ), 
                row=1, 
                col=idx+1
            )
    fig.update_layout(
        template='plotly_white',
        title=title
    )

    return fig
    

In [55]:
def plot_results(y_pred, y_true, title='Prediction stats'): 
    """
    Plot the results
    """
    y_pred = (y_pred.flatten() > 0.5).astype(int)
    cm = confusion_matrix(y_pred=y_pred, y_true=y_true, )
    ac = accuracy_score(y_pred=y_pred, y_true=y_true)
    f1 = f1_score(y_pred=y_pred, y_true=y_true)
    recall = recall_score(y_pred=y_pred, y_true=y_true)
    precision = precision_score(y_pred=y_pred, y_true=y_true)
    
    tot_true = np.sum(y_true)
    tot_false = len(y_true) - tot_true
    cm_scale = cm/np.array([tot_false, tot_true])[:, np.newaxis]   
    
    fig = make_subplots(
        rows=1, 
        cols=2, 
        column_widths=[0.75, 0.25]
    )
    fig.add_trace(
        go.Heatmap(
            z=cm_scale,
            x=["0", "1"],
            y=["0", "1"],
            showscale=False, 
            transpose=False,
            text=cm,
            texttemplate="%{text}<br>%{z:.2f}",
            textfont={"size": 30}, 
            colorscale='Bluered',
            zmin=0,
            zmax=1, 
            reversescale=True,
            xgap=8, 
            ygap=8, 
        ), 
        row=1, 
        col=2, 
        
    )
    
    fig.add_trace(
        go.Bar(
            x=['Accuracy', "F1 score", "Recall", "Precision"], 
            y=[ac, f1, recall, precision], 
            marker=dict(
                color=[ac, f1, recall, precision], 
                colorscale='Bluered', 
                cmin=0, 
                cmax=1, 
                reversescale=True
            ),
            text=[ac, f1, recall, precision], 
            texttemplate="%{text:.4f}",  
            textfont={"size": 30}
        ), 
        row=1, 
        col=1
    )
    
    fig.update_layout(
        yaxis1_range=[0, 1],
        xaxis2_title="Predicted", 
        yaxis2_title="Expected", 
        title=title, 
        template='plotly_white'
        
    )
    
    return fig 

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>3.1 |</span></b> Glove embedding </b></p>
</div>



The first step is to transform our text into vectors using tokenization. I choosed to use [GloVe](https://www.kaggle.com/datasets/rtatman/glove-global-vectors-for-word-representation) for this task. 

In [56]:
def get_embedding_dict(file):
    print("Load embedding file: ")
    em_dict = {}

    with glove_file.open('r') as fp: 
        for line in tqdm(fp, total=400000):
            data = line.split() 
            word = data[0]
            vect = np.asarray(data[1:], 'float32')
            em_dict[word] = vect
            
    return em_dict

def get_embedding_matrix(em_dict, word_index, size):
    print("Creating embedding_matrix: ")
    words_len = len(word_index) + 1
    m = np.zeros((words_len, size))
    not_found = []
    
    for word, i in tqdm(word_index.items()):
        em_vect=em_dict.get(word, None)
        if em_vect is not None: 
            m[i] = em_vect
        else: 
            not_found.append(word)
    return m, not_found


In [27]:
maxlen = 100
padding = 'post'
truncating = 'post'
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(train_df['text'].to_list())

word_index = tokenizer.word_index

train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df['text'].to_list()),
                          maxlen=maxlen,
                          padding=padding, 
                          truncating=truncating
                         )

test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text'].to_list()),
                         maxlen=maxlen, 
                         padding=padding, 
                         truncating=truncating
                        )

In [28]:
glove_path = Path('/kaggle/input/glove-global-vectors-for-word-representation')
glove_filename = 'glove.6B.100d.txt'
glove_file = glove_path.joinpath(glove_filename)
assert glove_file.exists()

em_dict = get_embedding_dict(glove_file)
em_matrix, not_found = get_embedding_matrix(em_dict, word_index, size=maxlen)

print(em_matrix.shape)
print(f"There are {len(not_found)} words missing in glove for a total of {len(word_index.keys())} words")

Load embedding file: 


100%|██████████| 400000/400000 [00:14<00:00, 27451.70it/s]


Creating embedding_matrix: 


100%|██████████| 13697/13697 [00:00<00:00, 333965.32it/s]

(13698, 100)
There are 3035 words missing in glove for a total of 13697 words





<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>3.2 |</span></b> Datasets creation </b></p>
</div>

In [29]:
def generate_datasets(df, padded, train_frac): 
    assert df.shape[0] == len(padded)
    idx = np.round(len(padded)*train_frac).astype(int)
    y = df['target'].to_numpy()
    
    X_train = padded[:idx]
    X_val = padded[idx:]
    
    y_train = y[:idx]
    y_val = y[idx:]
    
    assert len(X_train) + len(X_val) == len(padded)
    
    return np.asarray(X_train), np.asarray(X_val), np.asarray(y_train), np.asarray(y_val)

X_train, X_val, y_train, y_val = generate_datasets(train_df, train_seq, 0.8)

In [30]:
X_train, X_val, y_train, y_val = generate_datasets(train_df, train_seq, 0.8)

print(f'The training dataset size is {len(X_train)}')
print(f'The validation dataset size is {len(X_val)}')


The training dataset size is 6090
The validation dataset size is 1523


<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>3.3 |</span></b> Model </b></p>
</div>


### Baseline model 

In [31]:
metrics = ['accuracy', tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.Precision(name='precision')]
optimizer = Adam(learning_rate=0.001)

In [32]:
model_1 = Sequential([
    Embedding(input_dim=em_matrix.shape[0], 
              output_dim=maxlen, 
              embeddings_initializer=Constant(em_matrix),
              trainable=False
             ),
    Flatten(),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_1.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=metrics)


In [33]:
history_1 = model_1.fit(
    X_train,
    y_train, 
    epochs=6, 
    validation_data=(X_val, y_val), 
    batch_size=128
)

Epoch 1/6
[1m40/48[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.6695 - loss: 0.6131 - precision: 0.6444 - recall: 0.5474

I0000 00:00:1713042226.890254      93 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.6826 - loss: 0.5998 - precision: 0.6620 - recall: 0.5561 - val_accuracy: 0.7932 - val_loss: 0.4555 - val_precision: 0.8675 - val_recall: 0.6559
Epoch 2/6
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8165 - loss: 0.4103 - precision: 0.8396 - recall: 0.6973 - val_accuracy: 0.8050 - val_loss: 0.4393 - val_precision: 0.8270 - val_recall: 0.7348
Epoch 3/6
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8527 - loss: 0.3459 - precision: 0.8581 - recall: 0.7789 - val_accuracy: 0.7991 - val_loss: 0.4498 - val_precision: 0.8409 - val_recall: 0.7010
Epoch 4/6
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8919 - loss: 0.2720 - precision: 0.9197 - recall: 0.8122 - val_accuracy: 0.7984 - val_loss: 0.4520 - val_precision: 0.8064 - val_recall: 0.7461
Epoch 5/6
[1m48/48[0m [32m━━━━━━━━━━━━━━━━

In [None]:
y_pred_1 = model_1.predict(X_val)

### Simple LSTM model

In [36]:
metrics_2 = ['accuracy', tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.Precision(name='precision')]
optimizer_2 = Adam(learning_rate=0.0001)
callbacks_2 = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
]

In [37]:
model_2 = Sequential([
    Embedding(input_dim=em_matrix.shape[0], 
              output_dim=maxlen, 
              embeddings_initializer=Constant(em_matrix),
              trainable=False
             ),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3, return_sequences=False)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_2.compile(loss='binary_crossentropy',
              optimizer=optimizer_2,
              metrics=metrics_2,
            )


In [38]:
history_2 = model_2.fit(
    X_train,
    y_train, 
    epochs=30, 
    validation_data=(X_val, y_val), 
    callbacks=callbacks_2
)

Epoch 1/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 263ms/step - accuracy: 0.6041 - loss: 0.6643 - precision: 0.7021 - recall: 0.0584 - val_accuracy: 0.7814 - val_loss: 0.5603 - val_precision: 0.8481 - val_recall: 0.6460
Epoch 2/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 263ms/step - accuracy: 0.7360 - loss: 0.5622 - precision: 0.7449 - recall: 0.5722 - val_accuracy: 0.8089 - val_loss: 0.4667 - val_precision: 0.8530 - val_recall: 0.7123
Epoch 3/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 262ms/step - accuracy: 0.7832 - loss: 0.4892 - precision: 0.7759 - recall: 0.6738 - val_accuracy: 0.8089 - val_loss: 0.4620 - val_precision: 0.8989 - val_recall: 0.6643
Epoch 4/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 263ms/step - accuracy: 0.7924 - loss: 0.4644 - precision: 0.7860 - recall: 0.6911 - val_accuracy: 0.8168 - val_loss: 0.4377 - val_precision: 0.8682 - val_recall: 0.7151
Epoch 5/30


In [None]:
y_pred_2 = model_2.predict(X_val)

### Double LSTM model 

In [41]:
metrics_3 = ['accuracy', tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.Precision(name='precision')]
optimizer_3 = Adam(learning_rate=0.0001)
callbacks_3 = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=1, restore_best_weights=True)
]

In [42]:
model_3 = Sequential([
    Embedding(input_dim=em_matrix.shape[0], 
              output_dim=maxlen, 
              embeddings_initializer=Constant(em_matrix),
              trainable=False
             ),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)),
    Bidirectional(LSTM(32, dropout=0.3, recurrent_dropout=0.3, return_sequences=False)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_3.compile(loss='binary_crossentropy',
              optimizer=optimizer_3,
              metrics=metrics_3,
            )


In [43]:
history_3 = model_3.fit(
    X_train,
    y_train, 
    epochs=30, 
    validation_data=(X_val, y_val), 
    callbacks=callbacks_3
)

Epoch 1/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 550ms/step - accuracy: 0.5213 - loss: 0.6952 - precision: 0.4729 - recall: 0.7811 - val_accuracy: 0.7577 - val_loss: 0.6095 - val_precision: 0.8899 - val_recall: 0.5472
Epoch 2/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 543ms/step - accuracy: 0.7353 - loss: 0.5947 - precision: 0.7521 - recall: 0.5532 - val_accuracy: 0.7997 - val_loss: 0.4671 - val_precision: 0.8544 - val_recall: 0.6869
Epoch 3/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 549ms/step - accuracy: 0.7746 - loss: 0.5034 - precision: 0.7749 - recall: 0.6524 - val_accuracy: 0.8063 - val_loss: 0.4467 - val_precision: 0.8606 - val_recall: 0.6968
Epoch 4/30
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 545ms/step - accuracy: 0.8004 - loss: 0.4647 - precision: 0.8131 - recall: 0.6860 - val_accuracy: 0.8070 - val_loss: 0.4443 - val_precision: 0.8893 - val_recall: 0.6685
Epoch 5/

In [51]:
y_pred_3 = model_3.predict(X_val)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 101ms/step


## <b>4 <span style='color:#c93e22'>|</span> Discussions</b> 

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>4.1 |</span></b> Results comparison </b></p>
</div>


First let's take a look to the training plots: 

In [52]:
plot_convergence(history_1, "Model 1, training statistics").show()
plot_convergence(history_2, "Model 2 training statistics").show()
plot_convergence(history_3, "Model 3 training statistics").show()

The first model, which is not A RNN, did not train well, the validation resuls were always the same. Comparing the second and third model, the results are very similar. for some reason that I can't explain, the precision starts very high, especially for the second model. On model 2 and 3, both validation and training losses are decreasing with training epoch, and the  accuracies are increasing. We can see from these plots that we have some underfitting in the beggining and some overfitting at the end. the early stopping callback prevents to much of overfitting, restoring the best model.

Now let's take a look on the predictions of these models:

In [57]:
plot_results(y_pred=y_pred_1, y_true=y_val, title="Model 1 results").show()
plot_results(y_pred=y_pred_2, y_true=y_val, title="Model 2 results").show()
plot_results(y_pred=y_pred_3, y_true=y_val, title="Model 3 results").show()

The two RNN model architecture tested have similar results. The model 2 had a TP rate $\approx 75\%$ and a TN rate $\approx 86 \%$ where the model 3 had a TP rate $\approx 73\%$ and a TN rate $\approx 89\%$. Both accuracies are around $0.81$ which is relativelly good.

There is still plenty of room to play to try new architectures, tune hyper parameters, ... This notebook is just an introduction which took me already a lot of time and whith decent results. Not every architecture that I tested are present in this notebook because of computing time, but I think that the requirement for this introduction course are met. 

<div style="color:white;display:fill;border-radius:8px;
            background-color:#03112A;font-size:150%;
            letter-spacing:1.0px">
    <p style="padding: 8px;color:white;"><b><b><span style='color:#c93e22'>4.2 |</span></b> Model selection </b></p>
</div>


AS the evaluation metric for this competition is the [F1](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) score, I will choose the model number 2, because it has the highest F1 score 

In [47]:
selected_model = model_2

## <b>5 <span style='color:#c93e22'>|</span> Submission</b> 

In [48]:
y_pred = selected_model.predict(test_seq)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step


In [49]:
y_pred = (y_pred > 0.5).ravel().astype(int)
submission = pl.DataFrame({'id': test_df['id'], 'target': y_pred})
submission.write_csv('submission.csv')

In [50]:
submission

id,target
i64,i64
0,1
2,1
3,1
9,1
11,1
…,…
10861,1
10865,1
10868,1
10874,1
