In [4]:
import pandas as pd

# read the TSV file into a DataFrame
df = pd.read_csv('task_informative_text_img_agreed_lab_train.tsv', sep='\t')

# display the first few rows of the DataFrame
print(df.head())

             event_name            tweet_id              image_id  \
0  california_wildfires  917791291823591425  917791291823591425_0   
1  california_wildfires  917793137925459968  917793137925459968_0   
2  california_wildfires  917793137925459968  917793137925459968_1   
3  california_wildfires  917793137925459968  917793137925459968_2   
4  california_wildfires  917815040962695168  917815040962695168_0   

                                          tweet_text  \
0  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
1  RT @KAKEnews: California wildfires destroy mor...   
2  RT @KAKEnews: California wildfires destroy mor...   
3  RT @KAKEnews: California wildfires destroy mor...   
4  RT @TheAtlantic: Photos of California's destru...   

                                               image        label  \
0  data_image/california_wildfires/10_10_2017/917...  informative   
1  data_image/california_wildfires/10_10_2017/917...  informative   
2  data_image/california_wildfires/10_10_

In [5]:
# read the TSV file into a DataFrame
ds = pd.read_csv('task_informative_text_img_agreed_lab_test.tsv', sep='\t')

# display the first few rows of the DataFrame
print(ds.head())

        event_name            tweet_id              image_id  \
0  srilanka_floods  878185882431389696  878185882431389696_0   
1  hurricane_maria  910542719864397824  910542719864397824_0   
2  hurricane_maria  913009824195104768  913009824195104768_0   
3  hurricane_maria  916053383383011328  916053383383011328_0   
4  hurricane_maria  922230253359116288  922230253359116288_0   

                                          tweet_text  \
0  Cristofer CLEMENTE MORA now in 2nd at aguille ...   
1  Hurricane Maria batters Puerto Rico as a Cat 4...   
2  8am #Maria update: holding steady as a strong ...   
3  .@lprnyc is hosting a Puerto Rico benefict con...   
4  Vet In Puerto Rico Hurricane Worse Than War - ...   

                                               image            label  \
0  data_image/srilanka_floods/23_6_2017/878185882...  not_informative   
1  data_image/hurricane_maria/20_9_2017/910542719...      informative   
2  data_image/hurricane_maria/27_9_2017/913009824...      i

In [6]:
df = pd.concat([df,ds])
df.count()

event_name          11135
tweet_id            11135
image_id            11135
tweet_text          11135
image               11135
label               11135
label_text          11135
label_image         11135
label_text_image    11135
dtype: int64

In [7]:
df = df.drop(['event_name', 'tweet_id', 'image_id', 'image', 'label', 'label_image', 'label_text_image'], axis=1)

# display the updated DataFrame
print(df.head())

                                          tweet_text   label_text
0  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...  informative
1  RT @KAKEnews: California wildfires destroy mor...  informative
2  RT @KAKEnews: California wildfires destroy mor...  informative
3  RT @KAKEnews: California wildfires destroy mor...  informative
4  RT @TheAtlantic: Photos of California's destru...  informative


In [8]:
import re

# define a lambda function to preprocess a single tweet
preprocess_tweet = lambda tweet: re.sub(r'#\w+\s*|https?:\/\/.*\/\w*|[^\w\s]', '', tweet)

# apply the preprocessing function to the relevant columns of the DataFrame
df['processed_text'] = df['tweet_text'].apply(preprocess_tweet)

# display the updated DataFrame
print(df.head())

                                          tweet_text   label_text  \
0  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...  informative   
1  RT @KAKEnews: California wildfires destroy mor...  informative   
2  RT @KAKEnews: California wildfires destroy mor...  informative   
3  RT @KAKEnews: California wildfires destroy mor...  informative   
4  RT @TheAtlantic: Photos of California's destru...  informative   

                                      processed_text  
0  RT Cal_OES PLS SHARE Weâre capturing wildfire ...  
1  RT KAKEnews California wildfires destroy more ...  
2  RT KAKEnews California wildfires destroy more ...  
3  RT KAKEnews California wildfires destroy more ...  
4  RT TheAtlantic Photos of Californias destructi...  


In [9]:
# convert all text in the DataFrame to lowercase
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

# display the updated DataFrame
print(df.head())

                                          tweet_text   label_text  \
0  rt @cal_oes: pls share: weâ€™re capturing wild...  informative   
1  rt @kakenews: california wildfires destroy mor...  informative   
2  rt @kakenews: california wildfires destroy mor...  informative   
3  rt @kakenews: california wildfires destroy mor...  informative   
4  rt @theatlantic: photos of california's destru...  informative   

                                      processed_text  
0  rt cal_oes pls share weâre capturing wildfire ...  
1  rt kakenews california wildfires destroy more ...  
2  rt kakenews california wildfires destroy more ...  
3  rt kakenews california wildfires destroy more ...  
4  rt theatlantic photos of californias destructi...  


In [10]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label_text'], test_size=0.2, random_state=42)

In [13]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [14]:
# Pad the sequences
max_len = max([len(x) for x in X_train])
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

In [25]:
# Build the CNN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])


In [26]:
# Train the model
model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5


UnimplementedError: Graph execution error:

Detected at node 'mean_squared_error/Cast' defined at (most recent call last):
    File "C:\Users\kaust\anaconda3\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\kaust\anaconda3\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\kaust\anaconda3\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
      self.io_loop.start()
    File "C:\Users\kaust\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\kaust\anaconda3\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Users\kaust\anaconda3\lib\asyncio\base_events.py", line 1906, in _run_once
      handle._run()
    File "C:\Users\kaust\anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "C:\Users\kaust\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_cell
      result = self._run_cell(
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3016, in _run_cell
      result = runner(coro)
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3221, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3400, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\kaust\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3460, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\kaust\AppData\Local\Temp\ipykernel_14992\97561900.py", line 2, in <module>
      model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1051, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
      return self.compiled_loss(
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\losses.py", line 142, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\losses.py", line 268, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\kaust\anaconda3\lib\site-packages\keras\losses.py", line 1469, in mean_squared_error
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'mean_squared_error/Cast'
Cast string to float is not supported
	 [[{{node mean_squared_error/Cast}}]] [Op:__inference_train_function_6227]

In [None]:
# Evaluate the model
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))