### Import Packages

In [1]:
import pandas as pd
import numpy as np

datapath = 'data/'

### Load Data

In [2]:
data = pd.read_csv(datapath + 'disaster_train.csv', sep=',',)

### Transfrom data

we extract the text of the tweets as X vector and the transform them into a `tf-idf` (term-frequency times inverse document-frequency) matrix. 

__TODO__
- At a later stage add location and keyword data to the feature matrix
    - Problem: How do we addjust the weight of those, should they way as much as the occurence of one word or more?
    

In [3]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = data['text']
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# the function TfidfVectorizer transforms our data in a sparse feature matrix. Every feature is the td-idf of a n-gram. The sizes of the 
# n-grams are set by our argument ngram_range.
# with the argument stopwords we can remove words that typically occur often but do not give any information (i.e. and, a, ...)
tf_computer = TfidfVectorizer(use_idf=False, ngram_range=(1,1), stop_words='english')

train_features = tf_computer.fit_transform(X_train)
test_features = tf_computer.transform(X_test)

print("vocabulary size:", len(tf_computer.vocabulary_))


vocabulary size: 19097


In [4]:
108206/19097

5.666125569461172

# First naive try

In [5]:
# select a linear model based on the assumption that specific words (linearly) imply a real disaster or not

from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# TODO: vary hpyerparameter alpha

clf = RidgeClassifier()
scores = cross_val_score(clf, train_features, y_train, cv=3, scoring="f1")
scores

array([0.73647059, 0.7452381 , 0.73079179])

In [6]:
clf.fit(train_features, y_train)
y_train_pred = clf.predict(train_features)
y_test_pred = clf.predict(test_features)


print('Train F1 score:', np.round(f1_score(y_train, y_train_pred), 3))
print('Test F1 score:', np.round(f1_score(y_test, y_test_pred), 3))

Train F1 score: 0.945
Test F1 score: 0.744


This looks very much like overfitting

# Second (a bit less) naive try
Lets try a neural network.

Get some ideas [here](https://medium.com/@datamonsters/artificial-neural-networks-for-natural-language-processing-part-1-64ca9ebfa3b2)

### Load packages

In [7]:
import tensorflow #the backend used by Keras (there are different beckend)
from tensorflow.keras.models import Sequential #import the type of mpdel: sequential (e.g., MLP)
from tensorflow.keras.layers import Dropout, Input, Dense #simple linear layer
from tensorflow.keras.utils import to_categorical # transformation for classification labels
from keras.utils.vis_utils import plot_model
from tensorflow.python.framework.random_seed import set_random_seed
import matplotlib.pyplot as plt

2022-01-06 11:42:21.184179: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-06 11:42:21.184201: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Create early stopping mechanism for NN

In [8]:
from keras.callbacks import EarlyStopping

# simple early stopping
es = EarlyStopping(monitor='val_loss', #quantity to be monitored
                   mode='min', #we look for decreasing patterns stop 
                   patience = 3, #number of epochs with no improvement
                   verbose=1)

np.random.seed(123)
set_random_seed(2)

In [19]:
train_features_den = train_features.todense()
feature_vector_length = train_features_den.shape[1]

In [21]:
model = Sequential() #we first define how the "model" looks like
model.add(Dense(input_dim = feature_vector_length, units=100 , activation='relu')) #input layer
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax')) #output layer

# Configure the model and start training
model.compile(loss='sparse_categorical_crossentropy', #loss metric
    optimizer='adam',  #optimizer
    metrics=['accuracy']) #displayed metric


history = model.fit(train_features_den, y_train, epochs=10, verbose=2, validation_split=0.2)

Epoch 1/10
162/162 - 3s - loss: 0.5418 - accuracy: 0.7320 - val_loss: 0.4773 - val_accuracy: 0.7869 - 3s/epoch - 21ms/step
Epoch 2/10
162/162 - 3s - loss: 0.2105 - accuracy: 0.9196 - val_loss: 0.5895 - val_accuracy: 0.7730 - 3s/epoch - 16ms/step
Epoch 3/10
162/162 - 3s - loss: 0.0737 - accuracy: 0.9741 - val_loss: 0.7752 - val_accuracy: 0.7776 - 3s/epoch - 17ms/step
Epoch 4/10
162/162 - 3s - loss: 0.0375 - accuracy: 0.9876 - val_loss: 0.8302 - val_accuracy: 0.7668 - 3s/epoch - 17ms/step
Epoch 5/10
162/162 - 3s - loss: 0.0250 - accuracy: 0.9913 - val_loss: 0.9025 - val_accuracy: 0.7745 - 3s/epoch - 16ms/step
Epoch 6/10
162/162 - 3s - loss: 0.0192 - accuracy: 0.9925 - val_loss: 0.9345 - val_accuracy: 0.7629 - 3s/epoch - 17ms/step
Epoch 7/10
162/162 - 3s - loss: 0.0124 - accuracy: 0.9952 - val_loss: 0.9911 - val_accuracy: 0.7730 - 3s/epoch - 17ms/step
Epoch 8/10
162/162 - 3s - loss: 0.0112 - accuracy: 0.9952 - val_loss: 1.0185 - val_accuracy: 0.7676 - 3s/epoch - 16ms/step
Epoch 9/10
162/1

In [11]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
# plt.ylim(0.8, 1)
plt.show()


NameError: name 'history' is not defined

In [12]:
model.fit(train_features_den, y_train)
y_train_pred = clf.predict(train_features_den)
y_test_pred = clf.predict(test_features.todense())


print('Train F1 score:', np.round(f1_score(y_train, y_train_pred), 3))
print('Test F1 score:', np.round(f1_score(y_test, y_test_pred), 3))

NameError: name 'model' is not defined

In [13]:
from sklearn.metrics import accuracy_score

print('Train F1 score:', np.round(accuracy_score(y_train, y_train_pred), 3))
print('Test F1 score:', np.round(accuracy_score(y_test, y_test_pred), 3))

Train F1 score: 0.954
Test F1 score: 0.79


The former model does not improve after the first few epochs, afterwards train and validation acc both only slightly fluctuate. The Train accuracy raises quickly over the first epochs up to 99.7% (epoch 9). The validation accuracy on the other hand slightly falls. This indicates that our model not really learns a lot. It becomes able to predict the training set very well, but the accuracy on the validattion set stays at the same level (overfitting).

### Model Variation

In the following section we will try out different architectures for our model. To do so we will vary the activation function and the depth of the model.

In [14]:
#  the function can be used in a loop to define several models

def MLP_definer(n_layers, n_features, n_classes, activation_f):
    
    model = Sequential()
    model.add(Dense(input_dim = n_features, units=500, activation=activation_f))
    for i in range(1, n_layers):
        model.add(Dense(units=500-10*i^2, activation=activation_f))
        
    model.add(Dense(units=n_classes, activation='softmax'))
    return model

In [None]:
out = {}
for fun in ('relu', 'sigmoid'):
    print(f'Activation function: {fun}')
    for i in range(2,7):    
        model = MLP_definer(i, n_features=feature_vector_length, n_classes=2, activation_f=fun)
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics='accuracy'
        )
        history = model.fit(train_features_den, y_train, batch_size=16, epochs=500, validation_split=0.25, verbose=0, callbacks=[es])
        out[i] = [history.history['accuracy'][-1],  history.history['val_accuracy'][-1]]
        print(f'Number of layers: {i} \nTrain Acc: {history.history["accuracy"][-1]} \nTest Acc:  {history.history["val_accuracy"][-1]}')

Activation function: relu
Epoch 00004: early stopping
Number of layers: 2 
Train Acc: 0.984957754611969 
Test Acc:  0.7756489515304565
