In [1]:
"""
Using XLNet for Kaggle's NLP with Disaster Tweets
Reference: [https://www.kaggle.com/alvaroibrain/xlnet-huggingface-transformers/data?select=sample_submission.csv]
*****Using XLNet to classify tweets*****
"""

"\nUsing XLNet for Kaggle's NLP with Disaster Tweets\nReference: [https://www.kaggle.com/alvaroibrain/xlnet-huggingface-transformers/data?select=sample_submission.csv]\n*****Using XLNet to classify tweets*****\n"

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import transformers
import nltk
import re
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
plt.style.use('seaborn')

In [3]:
print(tf.__version__)

2.4.0


In [4]:
print(tf.config.list_physical_devices())

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [5]:
train_df = pd.read_csv("../data/tutorial/train.csv")
test_df = pd.read_csv("../data/tutorial/test.csv")

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parichay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [8]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [9]:
def clean_text(text):
    clean = text
    reg = re.compile('\&amp')
    clean = clean.apply(lambda r: re.sub(reg, string=r, repl='&'))
    reg = re.compile('\\n')
    clean = clean.apply(lambda r: re.sub(reg, string=r, repl=' '))
    reg = re.compile('@[a-zA-Z0-9\_]+')
    clean = clean.apply(lambda r: re.sub(reg, string=r, repl='@'))
    reg = re.compile('https?\S+(?=\s|$)')
    clean = clean.apply(lambda r: re.sub(reg, string=r, repl='www'))
    return clean

In [10]:
train_df["text"]

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [11]:
clean_text(train_df["text"])

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @ @ The out of control wild fires in Californi...
7610       M1.94 [01:04 UTC]?5km S of Volcano Hawaii. www
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [12]:
clean_text(test_df["text"])

0                      Just happened a terrible car crash
1       Heard about #earthquake is different cities, s...
2       there is a forest fire at spot pond, geese are...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm in RI worse than last hurricane. My city...
3260                 Green Line derailment in Chicago www
3261       MEG issues Hazardous Weather Outlook (HWO) www
3262    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object

In [13]:
train_df['clean'] = clean_text(train_df["text"])

In [14]:
test_df['clean'] = clean_text(test_df["text"])

In [15]:
train_df

Unnamed: 0,id,keyword,location,text,target,clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,@ @ The out of control wild fires in Californi...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. www
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...


In [16]:
test_df

Unnamed: 0,id,keyword,location,text,clean
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,Green Line derailment in Chicago www
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,MEG issues Hazardous Weather Outlook (HWO) www


In [17]:
from transformers import TFXLNetModel, XLNetTokenizer

In [18]:
xlnet_model = 'xlnet-large-cased'

In [19]:
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

In [20]:
def create_xlnet(mname):
    word_inputs = tf.keras.Input(shape=(120,), name='word_inputs', dtype='int32')
    xlnet = TFXLNetModel.from_pretrained(mname)
    xlnet_encodings = xlnet(word_inputs)[0]
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)
    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=2e-5), loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    return model


In [21]:
xlnet = create_xlnet(xlnet_model)

Some layers from the model checkpoint at xlnet-large-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


In [22]:
xlnet.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_inputs (InputLayer)     [(None, 120)]             0         
_________________________________________________________________
tfxl_net_model (TFXLNetModel TFXLNetModelOutput(last_h 360268800 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 1, 1024)           0         
_________________________________________________________________
tf.compat.v1.squeeze (TFOpLa (None, 1024)              0         
_________________________________________________________________
dropout_73 (Dropout)         (None, 1024)              0         
_________________________________________________________________
outputs (Dense)              (None, 1)                 1025      
Total params: 360,269,825
Trainable params: 360,269,825
Non-trainable params: 0
_______________________________________________

In [23]:
# Original context: "Clean and split the data"
# Potentially important for score tabulation. Please revisit.
tweets = train_df['clean']
labels = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.15, random_state=196)

In [24]:
X_test
# Attempts:
#from sklearn import model_selection
#scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
#scores = model_selection.cross_val_score(xlnet, xlnet_tokenizer, train_df["target"], cv=3, scoring="f1")

4714    The sunset looked like an erupting volcano ......
4203    @ ...because if it were on fire that'd be a sa...
4867    @ If you take the suit off him I wouldn't be s...
6568    Patient-reported outcomes in long-term survivo...
5849    fresh out da shower lookss ?? (still loving th...
                              ...                        
5548    Rainstorm Destroys 600 Houses in Yobe State: R...
6823    Billionaires have a plan to free half a billio...
7115    POV video captures violent landing at Amsterda...
5420                          Ay I am fully panicking lol
4575    @ As a health care professional that deals all...
Name: clean, Length: 1142, dtype: object

In [25]:
def get_inputs(tweets, tokenizer, max_len=120):
    inps = [tokenizer.encode_plus(t, max_length=max_len, pad_to_max_length=True, add_special_tokens=True) for t in tweets]
    inp_tok = np.array([a['input_ids'] for a in inps])
    ids = np.array([a['attention_mask'] for a in inps])
    segments = np.array([a['token_type_ids'] for a in inps])
    return inp_tok, ids, segments

def warmup(epoch, lr):
    return max(lr +1e-6, 2e-5)

def plot_metrics(pred, true_labels):
    acc = accuracy_score(true_labels, np.array(pred.flatten() >= .5, dtype='int'))
    fpr, tpr, thresholds = roc_curve(true_labels, pred)
    auc = roc_auc_score(true_labels, pred)
    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    ax.set_title(f"AUC: {auc}\nACC: {acc}");
    return fig


In [26]:
# Create the input data (tensors)
inp_tok, ids, segments = get_inputs(X_train, xlnet_tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
inp_tok

array([[   5,    5,    5, ..., 2055,    4,    3],
       [   5,    5,    5, ..., 2055,    4,    3],
       [   5,    5,    5, ..., 2055,    4,    3],
       ...,
       [   5,    5,    5, ..., 2055,    4,    3],
       [   5,    5,    5, ..., 2055,    4,    3],
       [   5,    5,    5, ..., 2055,    4,    3]])

In [28]:
ids

array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

In [29]:
segments

array([[3, 3, 3, ..., 0, 0, 2],
       [3, 3, 3, ..., 0, 0, 2],
       [3, 3, 3, ..., 0, 0, 2],
       ...,
       [3, 3, 3, ..., 0, 0, 2],
       [3, 3, 3, ..., 0, 0, 2],
       [3, 3, 3, ..., 0, 0, 2]])

In [30]:
# Training

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=4, min_delta=0.02, restore_best_weights=True),
    tf.keras.callbacks.LearningRateScheduler(warmup, verbose=0),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=1e-6, patience=2, verbose=0, mode='auto', min_delta=0.001, cooldown=0, min_lr=1e-6)
]

In [31]:
hist = xlnet.fit(x=inp_tok, y=y_train, epochs=1, batch_size=16, validation_split=.98, callbacks=callbacks)



ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/parichay/code/env/nlp2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-31-5057d2b21ca2>", line 1, in <module>
    hist = xlnet.fit(x=inp_tok, y=y_train, epochs=1, batch_size=16, validation_split=.98, callbacks=callbacks)
  File "/home/parichay/code/env/nlp2/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1131, in fit
    val_logs = self.evaluate(
  File "/home/parichay/code/env/nlp2/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1389, in evaluate
    tmp_logs = self.test_function(iterator)
  File "/home/parichay/code/env/nlp2/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 828, in __call__
    result = self._call(*args, **kwds)
  File "/home/parichay/code/env/nlp2/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 

TypeError: object of type 'NoneType' has no len()

In [None]:
# Testing
inp_tok, ids, segments = get_inputs(X_test, xlnet_tokenizer)

In [None]:
preds = xlnet.predict(inp_tok, verbose=True)