In [1]:
import pandas as pd

# read the TSV file into a DataFrame
df = pd.read_csv('task_humanitarian_text_img_agreed_lab_train.tsv', sep='\t')
ds = pd.read_csv('task_humanitarian_text_img_agreed_lab_test.tsv', sep='\t')

# display the first few rows of the DataFrame
print(df.head())

             event_name            tweet_id              image_id  \
0  california_wildfires  917793137925459968  917793137925459968_0   
1  california_wildfires  917793137925459968  917793137925459968_1   
2  california_wildfires  917793137925459968  917793137925459968_2   
3  california_wildfires  917815040962695168  917815040962695168_2   
4  california_wildfires  917828283047260161  917828283047260161_0   

                                          tweet_text  \
0  RT @KAKEnews: California wildfires destroy mor...   
1  RT @KAKEnews: California wildfires destroy mor...   
2  RT @KAKEnews: California wildfires destroy mor...   
3  RT @TheAtlantic: Photos of California's destru...   
4  Why California's #wildfires are worse in the f...   

                                               image  \
0  data_image/california_wildfires/10_10_2017/917...   
1  data_image/california_wildfires/10_10_2017/917...   
2  data_image/california_wildfires/10_10_2017/917...   
3  data_image/california

In [2]:
df_concat = pd.concat([df,ds])
df_concat.count()

event_name          7081
tweet_id            7081
image_id            7081
tweet_text          7081
image               7081
label               7081
label_text          7081
label_image         7081
label_text_image    7081
dtype: int64

In [3]:
df = df.drop(['event_name', 'tweet_id', 'image_id', 'image', 'label', 'label_image', 'label_text_image'], axis=1)

# display the updated DataFrame
print(df.head())

                                          tweet_text  \
0  RT @KAKEnews: California wildfires destroy mor...   
1  RT @KAKEnews: California wildfires destroy mor...   
2  RT @KAKEnews: California wildfires destroy mor...   
3  RT @TheAtlantic: Photos of California's destru...   
4  Why California's #wildfires are worse in the f...   

                          label_text  
0  infrastructure_and_utility_damage  
1  infrastructure_and_utility_damage  
2  infrastructure_and_utility_damage  
3  infrastructure_and_utility_damage  
4         other_relevant_information  


In [4]:
import re

# define a lambda function to preprocess a single tweet
preprocess_tweet = lambda tweet: re.sub(r'#\w+\s*|https?:\/\/.*\/\w*|[^\w\s]', '', tweet)

# apply the preprocessing function to the relevant columns of the DataFrame
df['processed_text'] = df['tweet_text'].apply(preprocess_tweet)

# display the updated DataFrame
print(df.head())


                                          tweet_text  \
0  RT @KAKEnews: California wildfires destroy mor...   
1  RT @KAKEnews: California wildfires destroy mor...   
2  RT @KAKEnews: California wildfires destroy mor...   
3  RT @TheAtlantic: Photos of California's destru...   
4  Why California's #wildfires are worse in the f...   

                          label_text  \
0  infrastructure_and_utility_damage   
1  infrastructure_and_utility_damage   
2  infrastructure_and_utility_damage   
3  infrastructure_and_utility_damage   
4         other_relevant_information   

                                      processed_text  
0  RT KAKEnews California wildfires destroy more ...  
1  RT KAKEnews California wildfires destroy more ...  
2  RT KAKEnews California wildfires destroy more ...  
3  RT TheAtlantic Photos of Californias destructi...  
4             Why Californias are worse in the fall   


In [5]:
# convert all text in the DataFrame to lowercase
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

# display the updated DataFrame
print(df.head())

                                          tweet_text  \
0  rt @kakenews: california wildfires destroy mor...   
1  rt @kakenews: california wildfires destroy mor...   
2  rt @kakenews: california wildfires destroy mor...   
3  rt @theatlantic: photos of california's destru...   
4  why california's #wildfires are worse in the f...   

                          label_text  \
0  infrastructure_and_utility_damage   
1  infrastructure_and_utility_damage   
2  infrastructure_and_utility_damage   
3  infrastructure_and_utility_damage   
4         other_relevant_information   

                                      processed_text  
0  rt kakenews california wildfires destroy more ...  
1  rt kakenews california wildfires destroy more ...  
2  rt kakenews california wildfires destroy more ...  
3  rt theatlantic photos of californias destructi...  
4             why californias are worse in the fall   


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# create a CountVectorizer object to convert the preprocessed text into a numerical feature vector
vectorizer = CountVectorizer()

# convert the preprocessed text into a numerical feature vector
X = vectorizer.fit_transform(df['processed_text'])

# use the 'label_text' column as the labels
y = df['label_text']

In [7]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# create a DecisionTreeClassifier object and fit it to the training data
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_train, y_train)

In [9]:
# evaluate the accuracy of the decision tree on the testing data
accuracy = clf1.score(X_test, y_test)
print('Accuracy:', accuracy)

Accuracy: 0.7357259380097879


In [10]:
from sklearn.metrics import precision_recall_fscore_support

# make predictions on the test data
y_pred = clf1.predict(X_test)

# calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 score: {:.4f}".format(f1))

Precision: 0.7298
Recall: 0.7357
F1 score: 0.7282


In [11]:
tweet = 'buildings demolished completely'
tweet_vec = vectorizer.transform([tweet])
label = clf1.predict(tweet_vec)
print(label)

['infrastructure_and_utility_damage']


In [12]:
from sklearn.naive_bayes import MultinomialNB

clf2 = MultinomialNB()
clf2.fit(X_train, y_train)

In [13]:
# evaluate the accuracy of the naive-bayes on the testing data
accuracy = clf2.score(X_test, y_test)
print('Accuracy:', accuracy)

Accuracy: 0.7561174551386624


In [14]:
from sklearn.metrics import precision_recall_fscore_support

# make predictions on the test data
y_pred = clf2.predict(X_test)

# calculate precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 score: {:.4f}".format(f1))

Precision: 0.7458
Recall: 0.7561
F1 score: 0.7486


In [30]:
tweet = 'Tom and Jerry played basketball.'
tweet_vec = vectorizer.transform([tweet])
label = clf2.predict(tweet_vec)
print(label)

['not_humanitarian']
