In [None]:
import pandas as pd
import numpy as np
import nltk

In [None]:
# Fetch the NLTK data used by this notebook; the stopword list is read by my_tokenizer below.
# NOTE(review): only TweetTokenizer is used for tokenization here, so `punkt_tab` may be
# unnecessary - confirm before removing.
nltk.download('punkt_tab')
nltk.download("stopwords", quiet=True)

from nltk import tokenize
from nltk.corpus import stopwords
from string import punctuation

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
import spacy

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load both competition splits; the tweet `id` column becomes the index of each frame.
train_path = '/kaggle/input/nlp-getting-started/train.csv'
test_path = '/kaggle/input/nlp-getting-started/test.csv'
train, test = (pd.read_csv(csv_path, index_col='id') for csv_path in (train_path, test_path))

# **Preprocessing**

In [None]:
train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [None]:
# Detach the label column: `pop` returns it and removes it from `train` in place.
y = train.pop('target')

Let's check NaN values in all of the columns

In [None]:
train.isna().mean()

keyword     0.008013
location    0.332720
text        0.000000
dtype: float64

In [None]:
train['location'].nunique()

3341

In [None]:
train['keyword'].nunique()

221

It seems that the most appropriate way to handle these NaN values is to fill them with a blank string (a single space), because this data is textual and there are many possible values in the dataset.

In [None]:
# Missing keyword/location entries become a single-space placeholder string.
train = train.fillna(' ')

In [None]:
test.isna().mean()

keyword     0.007968
location    0.338645
text        0.000000
dtype: float64

In [None]:
# Same single-space placeholder as the training frame, so both splits are treated alike.
test = test.fillna(' ')

It's time to tokenize the texts. Since these texts are tweets, I think we should use TweetTokenizer from nltk.

Let's also remove stopwords and punctuation, which carry little meaning.

In [None]:
# Resources shared by every my_tokenizer call; filled lazily on first use.
_TOKENIZER_RESOURCES = {}


def my_tokenizer(text: str) -> list[str]:
    """Split a tweet into tokens and drop stopwords and punctuation.

    Args:
        text: Tweet text (raw, lemmatized or stemmed).

    Returns:
        The ``TweetTokenizer`` tokens in their original order and casing,
        without English stopwords (compared case-insensitively), without
        tokens that occur as a substring of ``string.punctuation``, and
        without the literal ``'...'``.
    """
    if not _TOKENIZER_RESOURCES:
        # Previously a new TweetTokenizer was built for every tweet and the
        # stopword list was rebuilt for every single token. Both are constant,
        # so build them once; a frozenset also makes the lookup O(1).
        _TOKENIZER_RESOURCES['tokenizer'] = tokenize.TweetTokenizer()
        _TOKENIZER_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))

    tweet_tokenizer = _TOKENIZER_RESOURCES['tokenizer']
    english_stopwords = _TOKENIZER_RESOURCES['stopwords']
    return [
        word
        for word in tweet_tokenizer.tokenize(text)
        if word.lower() not in english_stopwords
        and word not in punctuation
        and word != '...'
    ]


I've decided to try both lemmatization and stemming to choose the best of them.

In [None]:
# spaCy pipeline used only for its `lemma_` attribute below; the parser and NER
# components are disabled when loading.
lemmatizer = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Snowball stemmer for English, applied word by word in the stemming cell.
stemmer = nltk.stem.snowball.EnglishStemmer()

In [None]:
# Copy of `train` whose `text` column holds the space-joined spaCy lemmas of each tweet.
lemmatized_train = train.assign(
    text=train['text'].map(
        lambda tweet: " ".join(token.lemma_ for token in lemmatizer(tweet))
    )
)

In [None]:
# Copy of `test` whose `text` column holds the space-joined spaCy lemmas of each tweet.
lemmatized_test = test.assign(
    text=test['text'].map(
        lambda tweet: " ".join(token.lemma_ for token in lemmatizer(tweet))
    )
)

In [None]:
def stem_tweet(tweet: str) -> str:
    """Stem every whitespace-separated word of `tweet` and re-join them with single spaces."""
    return ' '.join(map(stemmer.stem, tweet.split()))


# Copies of both splits with the `text` column replaced by its stemmed form.
stemmed_train = train.assign(text=train['text'].map(stem_tweet))
stemmed_test = test.assign(text=test['text'].map(stem_tweet))

I think that keyword and location are features in their own right, so we need to treat them separately from the text.

In [None]:
# One dense one-hot encoder per categorical column, fitted on train only.
# handle_unknown='ignore' means categories unseen during fit do not raise on test.
encoded_columns = {}
for column in ('location', 'keyword'):
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_column = train[column].to_numpy().reshape(-1, 1)
    test_column = test[column].to_numpy().reshape(-1, 1)
    encoded_columns[column] = (encoder.fit_transform(train_column), encoder.transform(test_column))

loc_train, loc_test = encoded_columns['location']
key_train, key_test = encoded_columns['keyword']

It's time to apply BoW/TF-IDF vectorizers

In [None]:
def vectorize(vectorizer, train, test) -> tuple:
    """Fit a text vectorizer on the training documents and encode both splits.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``
            (e.g. ``CountVectorizer`` or ``TfidfVectorizer``). It is fitted
            in place.
        train: Iterable of training documents; the vocabulary is learned
            from these only.
        test: Iterable of documents encoded with the training vocabulary.

    Returns:
        ``(train_matrix, test_matrix)`` - whatever the vectorizer's
        ``fit_transform`` / ``transform`` return, in that order.
    """
    # fit_transform replaces the separate fit + transform passes over the
    # training data. The inputs are only read, so no defensive copies are made.
    train_matrix = vectorizer.fit_transform(train)
    test_matrix = vectorizer.transform(test)
    return train_matrix, test_matrix

In [None]:
# Train/test text pairs for the two normalisation variants.
lemm_texts = (lemmatized_train['text'], lemmatized_test['text'])
stem_texts = (stemmed_train['text'], stemmed_test['text'])

# Each call gets a fresh vectorizer so vocabularies are never shared between variants.
bow_lemm_train, bow_lemm_test = vectorize(CountVectorizer(tokenizer=my_tokenizer), *lemm_texts)
bow_stem_train, bow_stem_test = vectorize(CountVectorizer(tokenizer=my_tokenizer), *stem_texts)
tfidf_lemm_train, tfidf_lemm_test = vectorize(TfidfVectorizer(tokenizer=my_tokenizer), *lemm_texts)
tfidf_stem_train, tfidf_stem_test = vectorize(TfidfVectorizer(tokenizer=my_tokenizer), *stem_texts)



Let's join the BoW and TF-IDF representations with the one-hot encoded locations and keywords into single sparse matrices

In [None]:
def _stack_features(location_onehot, keyword_onehot, text_matrix):
    """Concatenate location, keyword and text features column-wise into one CSR matrix.

    Args:
        location_onehot: One-hot encoded locations, one row per tweet.
        keyword_onehot: One-hot encoded keywords, one row per tweet.
        text_matrix: BoW / TF-IDF matrix with the same number of rows.

    Returns:
        A CSR matrix whose columns are ordered location, keyword, text -
        the same layout the previous DataFrame joins produced.
    """
    # Stacking directly in sparse form avoids building a DataFrame with one
    # column per vocabulary term and densifying it before the CSR conversion.
    blocks = [
        scipy.sparse.csr_matrix(block)
        for block in (location_onehot, keyword_onehot, text_matrix)
    ]
    return scipy.sparse.hstack(blocks, format='csr')


# NOTE: each name is rebound from text-only features to the stacked matrix, so
# this cell must run exactly once after the vectorisation cell.
bow_lemm_train = _stack_features(loc_train, key_train, bow_lemm_train)
tfidf_lemm_train = _stack_features(loc_train, key_train, tfidf_lemm_train)
bow_stem_train = _stack_features(loc_train, key_train, bow_stem_train)
tfidf_stem_train = _stack_features(loc_train, key_train, tfidf_stem_train)
bow_lemm_test = _stack_features(loc_test, key_test, bow_lemm_test)
tfidf_lemm_test = _stack_features(loc_test, key_test, tfidf_lemm_test)
bow_stem_test = _stack_features(loc_test, key_test, bow_stem_test)
tfidf_stem_test = _stack_features(loc_test, key_test, tfidf_stem_test)

# **Training ML models**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
import datetime

In [None]:
def grid_training(train, y, random_state=1):
    """Grid-search six classifier families and return the best one on a hold-out split.

    The data is split 70/30 (stratified). Each family is tuned with 5-fold
    cross-validation on the 70% part, optimising F1, and the tuned estimator
    is then scored once on the 30% hold-out.

    Args:
        train: Feature matrix (dense or sparse), one row per sample.
        y: Binary target aligned with the rows of ``train``.
        random_state: Seed for the hold-out split and for the estimators that
            accept one. A fixed seed means different feature sets are compared
            on the same split; pass ``None`` for unseeded behaviour.

    Returns:
        ``(best_estimator, holdout_f1, best_params)`` for the family with the
        highest hold-out F1.
    """
    # Each estimator is paired with its own search space.
    candidates = [
        (
            LogisticRegression(random_state=random_state),
            {
                'penalty': ['l1', 'l2'],
                'C': np.logspace(-3, 0, 10),
                'class_weight': [None, 'balanced'],
                'solver': ['saga']
            },
        ),
        (
            SVC(),
            {
                'C': np.logspace(-3, 0, 10),
                'class_weight': [None, 'balanced'],
                'kernel': ['linear', 'rbf']
            },
        ),
        (
            KNeighborsClassifier(),
            {
                'n_neighbors': np.arange(1, 26)
            },
        ),
        (
            DecisionTreeClassifier(random_state=random_state),
            {
                'class_weight': [None, 'balanced'],
                'min_samples_split': np.arange(2, 103, 10),
                'min_samples_leaf': np.arange(1, 102, 10)
            },
        ),
        (
            RandomForestClassifier(random_state=random_state),
            {
                'n_estimators': [50, 100],
                'class_weight': [None, 'balanced'],
                'min_samples_split': np.arange(2, 53, 10),
                'min_samples_leaf': np.arange(1, 52, 10)
            },
        ),
        (
            GradientBoostingClassifier(random_state=random_state),
            {
                'n_estimators': [100],
                'min_samples_split': np.arange(2, 53, 10),
                'min_samples_leaf': np.arange(1, 52, 10)
            },
        ),
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.3, stratify=y, random_state=random_state
    )
    grid_models = []

    for model, params in candidates:
        grid = GridSearchCV(model, params, scoring='f1', cv=5, n_jobs=-1)
        grid.fit(X_train, y_train)
        # With the default refit=True, best_estimator_ is already trained on
        # all of X_train, so it is scored directly instead of being fitted again.
        # f1_score expects (y_true, y_pred) in that order.
        holdout_f1 = f1_score(y_test, grid.predict(X_test))
        grid_models.append((grid.best_estimator_, holdout_f1, grid.best_params_))
        print(f'{model} trained {datetime.datetime.now()}')

    return max(grid_models, key=lambda result: result[1])

In [None]:
grid_training(bow_lemm_train, y)



LogisticRegression() trained 2025-03-19 10:09:55.693728
SVC() trained 2025-03-19 10:13:35.104555
KNeighborsClassifier() trained 2025-03-19 10:13:45.935171
DecisionTreeClassifier() trained 2025-03-19 10:16:34.506596
RandomForestClassifier() trained 2025-03-19 10:26:20.042457
GradientBoostingClassifier() trained 2025-03-19 10:50:40.940162


(LogisticRegression(C=0.21544346900318823, class_weight='balanced',
                    solver='saga'),
 0.7603565810173046,
 {'C': 0.21544346900318823,
  'class_weight': 'balanced',
  'penalty': 'l2',
  'solver': 'saga'})

In [None]:
grid_training(bow_stem_train, y)



LogisticRegression() trained 2025-03-19 11:17:41.405483
SVC() trained 2025-03-19 11:21:17.695963
KNeighborsClassifier() trained 2025-03-19 11:21:27.445845
DecisionTreeClassifier() trained 2025-03-19 11:24:15.806724
RandomForestClassifier() trained 2025-03-19 11:33:59.406270
GradientBoostingClassifier() trained 2025-03-19 11:50:29.056969


(LogisticRegression(C=0.1, class_weight='balanced', solver='saga'),
 0.7552966101694917,
 {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'})

In [None]:
grid_training(tfidf_lemm_train, y)

LogisticRegression() trained 2025-03-19 11:50:36.050765
SVC() trained 2025-03-19 11:54:15.946875
KNeighborsClassifier() trained 2025-03-19 11:55:21.199317
DecisionTreeClassifier() trained 2025-03-19 11:58:33.306595
RandomForestClassifier() trained 2025-03-19 12:08:09.122116
GradientBoostingClassifier() trained 2025-03-19 12:24:37.790521


(SVC(class_weight='balanced', kernel='linear'),
 0.7489583333333332,
 {'C': 1.0, 'class_weight': 'balanced', 'kernel': 'linear'})

In [None]:
grid_training(tfidf_stem_train, y)

LogisticRegression() trained 2025-03-19 12:24:44.950499
SVC() trained 2025-03-19 12:28:25.593721
KNeighborsClassifier() trained 2025-03-19 12:29:30.899361
DecisionTreeClassifier() trained 2025-03-19 12:32:42.953745
RandomForestClassifier() trained 2025-03-19 12:42:36.832598
GradientBoostingClassifier() trained 2025-03-19 12:59:43.092958


(LogisticRegression(class_weight='balanced', solver='saga'),
 0.7387480600103465,
 {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'})

Seems that using bag of words with lemmatization is the best approach. The best model that we got was LogisticRegression(). Let's train the best model on the whole train dataset and submit predictions for the test dataset.

In [None]:
# Refit the winning configuration on the full training matrix and write the submission file.
best_model = LogisticRegression(
    C=0.21544346900318823,
    class_weight='balanced',
    penalty='l2',
    solver='saga',
    random_state=1,
)
best_model.fit(bow_lemm_train, y)
logreg_predictions = best_model.predict(bow_lemm_test)
logreg_submission = pd.DataFrame(logreg_predictions, index=test.index, columns=['target'])
logreg_submission.to_csv('/kaggle/working/output.csv')



**F1-score on submitted predictions = Score: 0.79773**

# **Neural Network from scratch**

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Embedding
from keras.layers import Dropout, GRU, SimpleRNN
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

I've chosen a CNN because the following articles state that "A CNN usually performs just as well as an RNN on text-classification tasks and trains much faster.": https://www.atmosera.com/blog/text-classification-with-neural-networks/  and  https://www.geeksforgeeks.org/text-classification-using-cnn/.

The idea is taken from https://www.atmosera.com/blog/text-classification-with-neural-networks/.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, stratify=y, random_state=1)

In [None]:
# Join the three fields with explicit spaces. Plain `+` glued the last word of
# the tweet to the location and keyword, producing tokens the vocabulary had
# never seen.
train_texts = X_train['text'] + ' ' + X_train['location'] + ' ' + X_train['keyword']
holdout_texts = X_test['text'] + ' ' + X_test['location'] + ' ' + X_test['keyword']

# Fit the vocabulary on the same combined strings that are encoded, so location
# and keyword words are not silently dropped as out-of-vocabulary.
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)
train_emb = pad_sequences(sequences, maxlen=500)
test_sequences = tokenizer.texts_to_sequences(holdout_texts)
test_emb = pad_sequences(test_sequences, maxlen=500)

In [None]:
def build_cnn(embedding_dim, pool_size, dropout_rate):
    """Build and compile the two-convolution 1D CNN used in the hyperparameter search.

    Args:
        embedding_dim: Size of the learned token embeddings.
        pool_size: Window of the max-pooling layer between the convolutions.
        dropout_rate: Dropout applied before the sigmoid output unit.

    Returns:
        A compiled ``Sequential`` binary classifier.
    """
    network = Sequential()
    # `input_length` is dropped: the sequence length comes from the padded input.
    network.add(Embedding(25000, embedding_dim))
    network.add(Conv1D(32, 7, activation='relu'))
    network.add(MaxPooling1D(pool_size))
    network.add(Conv1D(32, 7, activation='relu'))
    network.add(GlobalMaxPooling1D())
    network.add(Dropout(dropout_rate))
    network.add(Dense(1, activation='sigmoid'))
    # The string 'f1_score' builds F1Score with threshold=None, which takes the
    # argmax over a single output column and so counts every sample as positive
    # (hence the constant val_f1_score in the logs). Threshold the sigmoid at 0.5.
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[keras.metrics.F1Score(threshold=0.5)],
    )
    return network


# Each entry: ((emb, maxpooling, dropout, batch), fitted model, hold-out F1).
networks_cnn = []

for emb in [16, 32, 64, 128, 256]:
    for maxpooling in [2, 3]:
        for dropout in [0.1, 0.2, 0.3, 0.4]:
            for batch in [20, 50, 100]:
                model = build_cnn(emb, maxpooling, dropout)
                model.fit(train_emb, y_train, validation_split=0.2, epochs=10, batch_size=batch)
                holdout_labels = np.round(model.predict(test_emb)).reshape(y_test.shape)
                # f1_score expects (y_true, y_pred) in that order.
                holdout_f1 = f1_score(y_test, holdout_labels)
                networks_cnn.append(((emb, maxpooling, dropout, batch), model, holdout_f1))

In [None]:
max(networks_cnn, key=lambda x: x[2])

((128, 2, 0.3, 100),
 <Sequential name=sequential_80, built=True>,
 0.731457800511509)

In [None]:
# Single recurrent baseline (GRU followed by SimpleRNN) for comparison with the CNN search.
model = Sequential()
model.add(Embedding(25000, output_dim=64))
model.add(GRU(256, return_sequences=True))
model.add(SimpleRNN(128))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
# The string 'f1_score' builds F1Score with threshold=None, which takes the argmax
# over a single output column and so counts every sample as positive (a constant
# F1 in the logs). Threshold the sigmoid output at 0.5 instead.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[keras.metrics.F1Score(threshold=0.5)],
)
model.fit(train_emb, y_train, validation_split=0.2, epochs=10, batch_size=100)

Epoch 1/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 500ms/step - f1_score: 0.5946 - loss: 0.6766 - val_f1_score: 0.5890 - val_loss: 0.5616
Epoch 2/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 491ms/step - f1_score: 0.6055 - loss: 0.4594 - val_f1_score: 0.5890 - val_loss: 0.5021
Epoch 3/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 488ms/step - f1_score: 0.6042 - loss: 0.2991 - val_f1_score: 0.5890 - val_loss: 0.5668
Epoch 4/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 490ms/step - f1_score: 0.6106 - loss: 0.1028 - val_f1_score: 0.5890 - val_loss: 0.7299
Epoch 5/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 491ms/step - f1_score: 0.6103 - loss: 0.0602 - val_f1_score: 0.5890 - val_loss: 0.8772
Epoch 6/10
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 489ms/step - f1_score: 0.5923 - loss: 0.0379 - val_f1_score: 0.5890 - val_loss: 0.9439
Epoch 7/10
[1m43/43[

<keras.src.callbacks.history.History at 0x797f6b79ead0>

In [None]:
# Hold-out F1 of the RNN baseline next to the weakest CNN from the search.
rnn_holdout_labels = np.round(model.predict(test_emb)).reshape(y_test.shape)
rnn_holdout_f1 = f1_score(rnn_holdout_labels, y_test)
worst_cnn_f1 = min(networks_cnn, key=lambda entry: entry[2])[2]
print(f'random rnn score: {rnn_holdout_f1} | min cnn score: {worst_cnn_f1}')

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 166ms/step
random rnn score: 0.6594161419576416 | min cnn score: 0.6839145106861643


As we can see, the RNN takes much longer to train, and the score of an RNN with arbitrarily chosen hyperparameters is worse than the minimum score we achieved with the CNNs, so I decided to use the best CNN for the submission predictions (according to these results).

In [None]:
# Join the three fields with explicit spaces. Plain `+` glued the last word of
# the tweet to the location and keyword, producing tokens the vocabulary had
# never seen.
full_train_texts = train['text'] + ' ' + train['location'] + ' ' + train['keyword']
submission_texts = test['text'] + ' ' + test['location'] + ' ' + test['keyword']

# Fit the vocabulary on the same combined strings that are encoded, so location
# and keyword words are not silently dropped as out-of-vocabulary.
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(full_train_texts)
sequences = tokenizer.texts_to_sequences(full_train_texts)
train_emb = pad_sequences(sequences, maxlen=500)
test_sequences = tokenizer.texts_to_sequences(submission_texts)
test_emb = pad_sequences(test_sequences, maxlen=500)

In [None]:
# Rebuild the best configuration from the search: (emb=128, maxpooling=2, dropout=0.3, batch=100).
best_model = Sequential()
# `input_length` is dropped: the sequence length comes from the padded input.
best_model.add(Embedding(25000, 128))
best_model.add(Conv1D(32, 7, activation='relu'))
best_model.add(MaxPooling1D(2))
best_model.add(Conv1D(32, 7, activation='relu'))
best_model.add(GlobalMaxPooling1D())
best_model.add(Dropout(0.3))
best_model.add(Dense(1, activation='sigmoid'))
# The string 'f1_score' builds F1Score with threshold=None, which takes the argmax
# over a single output column and so counts every sample as positive (a constant
# F1 in the logs). Threshold the sigmoid output at 0.5 instead.
best_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[keras.metrics.F1Score(threshold=0.5)],
)
# NOTE(review): validation_split=0.2 still withholds part of `train` from the
# weight updates of this final model - confirm that is intended for the submission.
best_model.fit(train_emb, y, validation_split=0.2, epochs=10, batch_size=100)

Epoch 1/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 86ms/step - f1_score: 0.5921 - loss: 0.6738 - val_f1_score: 0.6353 - val_loss: 0.5636
Epoch 2/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - f1_score: 0.5856 - loss: 0.4239 - val_f1_score: 0.6353 - val_loss: 0.4596
Epoch 3/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - f1_score: 0.5849 - loss: 0.2090 - val_f1_score: 0.6353 - val_loss: 0.5168
Epoch 4/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - f1_score: 0.5905 - loss: 0.1391 - val_f1_score: 0.6353 - val_loss: 0.6224
Epoch 5/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - f1_score: 0.5969 - loss: 0.0683 - val_f1_score: 0.6353 - val_loss: 0.6702
Epoch 6/10
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - f1_score: 0.5930 - loss: 0.0450 - val_f1_score: 0.6353 - val_loss: 0.7575
Epoch 7/10
[1m61/61[0m [32m━━━━

<keras.src.callbacks.history.History at 0x797f96845630>

In [None]:
# Round the sigmoid outputs to 0/1 labels and write the CNN submission file.
nn_probabilities = best_model.predict(test_emb)
nn_labels = np.round(nn_probabilities).astype('int').reshape(test.shape[0])
nn_submission = pd.DataFrame(nn_labels, index=test.index, columns=['target'])
nn_submission.to_csv('/kaggle/working/nn_output.csv')

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


**F1-score on submitted predictions = Score: 0.73460**

# **Fine-tuning pre-trained model**

In [None]:
!pip install transformers evaluate accelerate

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
import evaluate
from transformers import create_optimizer
import tensorflow as tf
import datasets
from datasets import Dataset, DatasetDict
from transformers.keras_callbacks import KerasMetricCallback
from transformers import pipeline

I've decided to choose DistilBERT. The idea is taken from https://huggingface.co/docs/transformers/en/tasks/sequence_classification

DistilBERT is a transformers model, smaller and faster than BERT, which was pretrained on the same corpus in a self-supervised fashion, using the BERT base model as a teacher. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts using the BERT base model.

In [None]:
X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.3, stratify=y, random_state=1)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
# Build the Hugging Face datasets: one `text` column (tweet + location + keyword)
# plus the `label` column taken from the target.
# NOTE(review): the three fields are concatenated without separators, so adjacent
# words are glued together; if this is changed, change the test-set cell identically.
train_frame = (X_train['text'] + X_train['location'] + X_train['keyword']).to_frame('text').join(y_train.rename('label'))
val_frame = (X_val['text'] + X_val['location'] + X_val['keyword']).to_frame('text').join(y_val.rename('label'))

train_dataset = Dataset.from_pandas(train_frame)
val_dataset = Dataset.from_pandas(val_frame)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = val_dataset

In [None]:
def preprocess_function(df):
    """Tokenize the `text` field of a batch with the global `tokenizer`, truncating long inputs.

    Despite its name, `df` is whatever `Dataset.map` passes in; only its
    ``'text'`` entry is read.
    """
    texts = df['text']
    return tokenizer(texts, truncation=True)

In [None]:
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5329 [00:00<?, ? examples/s]

Map:   0%|          | 0/2284 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
f_score = evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Compute F1 from a ``(scores, labels)`` pair.

    The predicted class of each row is the index of its largest score along
    axis 1; the result is whatever ``f_score.compute`` returns.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    return f_score.compute(predictions=predicted_classes, references=references)

In [None]:
# Training budget. The learning-rate schedule is sized for exactly `num_epochs`
# epochs, so the training call should run for the same number of epochs.
batch_size, num_epochs = 16, 5
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Wrap the tokenized training split for Keras training: shuffled, in batches of 16
# (the same value as `batch_size` above), collated by `data_collator`.
tf_train_set = model.prepare_tf_dataset(
    tokenized_data['train'],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
# Wrap the tokenized validation split the same way, but unshuffled so row order is kept.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_data['validation'],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
# No loss is passed here - presumably the model falls back to its built-in
# classification loss; confirm against the Transformers documentation.
model.compile(optimizer=optimizer)

In [None]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
callbacks = [metric_callback]

In [None]:
# Train for `num_epochs`, the epoch count the learning-rate schedule was built for.
# The previous hard-coded epochs=10 ran twice as long as the schedule covers, so
# the extra epochs trained after the decay had already finished.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x797cc7957f40>

In [None]:
# Test-set counterpart of the training datasets: a single `text` column built
# from tweet + location + keyword.
# NOTE(review): the fields are concatenated without separators, matching how the
# training/validation text is built; change both together or not at all.
test_frame = (test['text'] + test['location'] + test['keyword']).to_frame('text')
test_dataset = Dataset.from_pandas(test_frame)

In [None]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [None]:
# Wrap the tokenized test split for prediction; shuffle=False keeps the rows in
# the order of `test`, which the submission cell relies on when it reuses test.index.
tf_test_set = model.prepare_tf_dataset(
    tokenized_test,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
logits = model.predict(tf_test_set).logits



In [None]:
# Turn the logits into per-class probabilities, then take the most likely class per row.
# (Softmax preserves the ordering of the logits, so the argmax is the same either way.)
predictions = tf.nn.softmax(logits, axis=-1)
predicted_labels = tf.argmax(predictions, axis=1)

In [None]:
# Write the DistilBERT predictions, indexed by the test tweet ids, as the submission file.
distilbert_submission = pd.DataFrame(predicted_labels, index=test.index, columns=['target'])
distilbert_submission.to_csv('/kaggle/working/distilbert_output.csv')

**F1-score on submitted predictions = Score: 0.82439**

# **Conclusion**

**Not surprisingly, the fine-tuned DistilBERT gave the highest score. As for the neural network trained from scratch, perhaps we should have tried a more complex architecture or performed a more detailed EDA to make its predictions outperform the classical ML models. In terms of quality versus resources, Logistic Regression is the best choice, because it generates predictions much faster than the fine-tuned neural network and is not much worse. For me, the difficult part was working out the optimal format for encoding the features.**