In [71]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

!pip install chart_studio
!pip install textstat

import numpy as np 
import pandas as pd 

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords


# Visualisation libraries
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')


# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# File system manangement
import os

# Pytorch
import torch

#Transformers
from transformers import BertTokenizer

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')



In [72]:
#Training data
train = pd.read_csv('categories.csv')
test = pd.read_csv('categories_test.csv')
print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

# First few rows of the training dataset
train.head()

# First few rows of the testing dataset
test.head()

Training data shape:  (30381, 3)
Testing data shape:  (8606, 3)


Unnamed: 0,id,text,target
0,735891446960623616,RT @DonBradshawNTV: How @MarshallAmpsUK came t...,other_relevant_information
1,731202020296818688,Red Cross distributes $30M to Fort McMurray wi...,displaced_people_and_evacuations
2,733665357236342784,Interesting insights on the shifting communica...,other_relevant_information
3,731963038429929472,RT @globeandmail: Oil sands producers helping ...,rescue_volunteering_or_donation_effort
4,728674838034944001,Ottawa to match Red Cross donations for Fort M...,rescue_volunteering_or_donation_effort


Unnamed: 0,id,text,target
0,728674116773904384,RT @FoothillsFCU23: In response the to the #Fo...,rescue_volunteering_or_donation_effort
1,729787427829612544,Redcross is offering charitable donation recei...,rescue_volunteering_or_donation_effort
2,730510385544085505,RT @globeandmail: Red Cross to transfer $50-mi...,rescue_volunteering_or_donation_effort
3,733705874594746368,Live: Emergency operations briefing on north A...,other_relevant_information
4,730606066023665665,"$9bn fire damage to Fort McMurray, ‘the beast’...",infrastructure_and_utility_damage


In [73]:
#Missing values in training set
train.isnull().sum()
#Missing values in test set
test.isnull().sum()

id        0
text      0
target    0
dtype: int64

id        0
text      0
target    0
dtype: int64

In [74]:
print("Examples from each category")
# Define the categories
categories = [
    'rescue_volunteering_or_donation_effort',
    'other_relevant_information',
    'infrastructure_and_utility_damage',
    'sympathy_and_support',
    'injured_or_dead_people',
    'caution_and_advice',
    'displaced_people_and_evacuations',
    'not_humanitarian',
    'requests_or_urgent_needs',
    'missing_or_found_people'
]

# Select one random tweet for each category
for category in categories:
    # Filter the DataFrame by the current category
    category_df = train[train['target'] == category]
    
    # Check if there are tweets in the current category
    if not category_df.empty:
        # Select a random row from the filtered DataFrame
        random_row = category_df.sample()
        
        # Extract the information from the selected row
        tweet_id = random_row['id'].values[0]
        tweet_text = random_row['text'].values[0]
        tweet_category = random_row['target'].values[0]
        
        # Print the information
        print(f'Tweet ID: {tweet_id}')
        print(f'Tweet Text: {tweet_text}')
        print(f'Tweet Category: {tweet_category}')
        print('\n---\n')
    else:
        print(f'No tweets found for category: {category}\n\n')

Examples from each category
Tweet ID: 902638703750807552
Tweet Text: TEXAS STRONG: All proceeds from this shirt will be donated to @RedCross to help with #HurricaneHarvey relief effort
Tweet Category: rescue_volunteering_or_donation_effort

---

Tweet ID: 905291564750131200
Tweet Text: Forclosure Monster, Mnuchin, plans to hold flood relief hostage to that STUPID STUPID wall.
Tweet Category: other_relevant_information

---

Tweet ID: 907448803980201984
Tweet Text: #LatestNews: Irma Knocks Out Power To About 5.8 Million Hurricane Irma knocked out power to about 5.8 million homes and businesses in Flor
Tweet Category: infrastructure_and_utility_damage

---

Tweet ID: 910551418817220608
Tweet Text: RT @_LoveLike_JESUS: . FOLLOWERS Keep in your Prayers The people of Puerto Rico, Hurricane Maria is destroying that island today . htt
Tweet Category: sympathy_and_support

---

Tweet ID: 914062482930577409
Tweet Text: SAN JUAN — More than week after Hurricane Maria smashed through Puerto Rico,

In [75]:
train['target'].value_counts()

target
rescue_volunteering_or_donation_effort    8512
other_relevant_information                5350
infrastructure_and_utility_damage         4233
sympathy_and_support                      3555
injured_or_dead_people                    2963
caution_and_advice                        1600
displaced_people_and_evacuations          1496
not_humanitarian                          1390
requests_or_urgent_needs                  1157
missing_or_found_people                    125
Name: count, dtype: int64

In [76]:
train['target'].value_counts(normalize=True)

target
rescue_volunteering_or_donation_effort    0.280175
other_relevant_information                0.176097
infrastructure_and_utility_damage         0.139331
sympathy_and_support                      0.117014
injured_or_dead_people                    0.097528
caution_and_advice                        0.052664
displaced_people_and_evacuations          0.049241
not_humanitarian                          0.045752
requests_or_urgent_needs                  0.038083
missing_or_found_people                   0.004114
Name: proportion, dtype: float64

In [77]:
train['target'].value_counts(normalize=True).iplot(kind='bar',
                                                      yTitle='Percentage', 
                                                      linecolor='black', 
                                                      opacity=0.7,
                                                      color='red',
                                                      theme='pearl',
                                                      bargap=0.6,
                                                      gridcolor='white',
                                                      title='Distribution  of Sentiment column in the train set')

In [78]:
test['target'].value_counts(normalize=True).iplot(kind='bar',
                                                      yTitle='Percentage', 
                                                      linecolor='black', 
                                                      opacity=0.7,
                                                      color='red',
                                                      theme='pearl',
                                                      bargap=0.6,
                                                      gridcolor='white',
                                                      title='Distribution  of Sentiment column in the test set')

In [81]:
# text preprocessing helper functions

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    # Remove 'rt' if it appears as a standalone word
    tokens = [token for token in tokens if token.lower() != 'rt']
    nopunc = clean_text(' '.join(tokens))
    tokenized_text = tokenizer.tokenize(nopunc)
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    combined_text = ' '.join(tokenized_text)
    return combined_text

In [83]:
# Applying the cleaning function to both test and training datasets
train['text_clean'] = train['text'].apply(str).apply(lambda x: text_preprocessing(x))
test['text_clean'] = test['text'].apply(str).apply(lambda x: text_preprocessing(x))

# Save the cleaned datasets to new CSV files
train.to_csv('cleaned_train.csv', index=False)
test.to_csv('cleaned_test.csv', index=False)

: 

In [31]:
train = pd.read_csv('cleaned_train.csv')
test = pd.read_csv('cleaned_test.csv')
train.head()
test.head()

Unnamed: 0,id,text,target,text_clean
0,735891446960623616,RT @DonBradshawNTV: How @MarshallAmpsUK came t...,other_relevant_information,rt donbradshawntv how marshallampsuk came to t...
1,731202020296818688,Red Cross distributes $30M to Fort McMurray wi...,displaced_people_and_evacuations,red cross distributes to fort mcmurray wildfir...
2,733665357236342784,Interesting insights on the shifting communica...,other_relevant_information,interesting insights on the shifting communica...
3,731963038429929472,RT @globeandmail: Oil sands producers helping ...,rescue_volunteering_or_donation_effort,rt globeandmail oil sands producers helping wo...
4,728674838034944001,Ottawa to match Red Cross donations for Fort M...,rescue_volunteering_or_donation_effort,ottawa to match red cross donations for fort m...


Unnamed: 0,id,text,target,text_clean
0,728674116773904384,RT @FoothillsFCU23: In response the to the #Fo...,rescue_volunteering_or_donation_effort,rt in response the to the fortmacfire we will ...
1,729787427829612544,Redcross is offering charitable donation recei...,rescue_volunteering_or_donation_effort,redcross is offering charitable donation recei...
2,730510385544085505,RT @globeandmail: Red Cross to transfer $50-mi...,rescue_volunteering_or_donation_effort,rt globeandmail red cross to transfer to evacu...
3,733705874594746368,Live: Emergency operations briefing on north A...,other_relevant_information,live emergency operations briefing on north al...
4,730606066023665665,"$9bn fire damage to Fort McMurray, ‘the beast’...",infrastructure_and_utility_damage,fire damage to fort mcmurray the beast will af...


In [33]:
train['text_len'] = train['text_clean'].astype(str).apply(len)
train['text_word_count'] = train['text_clean'].apply(lambda x: len(str(x).split()))
train.head()

Unnamed: 0,id,text,target,text_clean,text_len,text_word_count
0,735891446960623616,RT @DonBradshawNTV: How @MarshallAmpsUK came t...,other_relevant_information,rt donbradshawntv how marshallampsuk came to t...,118,18
1,731202020296818688,Red Cross distributes $30M to Fort McMurray wi...,displaced_people_and_evacuations,red cross distributes to fort mcmurray wildfir...,63,10
2,733665357236342784,Interesting insights on the shifting communica...,other_relevant_information,interesting insights on the shifting communica...,105,14
3,731963038429929472,RT @globeandmail: Oil sands producers helping ...,rescue_volunteering_or_donation_effort,rt globeandmail oil sands producers helping wo...,105,14
4,728674838034944001,Ottawa to match Red Cross donations for Fort M...,rescue_volunteering_or_donation_effort,ottawa to match red cross donations for fort m...,75,12


In [40]:
rescue = train[train['target']=='rescue_volunteering_or_donation_effort']
relevantinfo = train[train['target']=='other_relevant_information']
damage = train[train['target']=='infrastructure_and_utility_damage']
support = train[train['target']=='sympathy_and_support']
injury = train[train['target']=='injured_or_dead_people']
caution = train[train['target']=='caution_and_advice']
displacement = train[train['target']=='displaced_people_and_evacuations']
nothumanitarian = train[train['target']=='not_humanitarian']
urgentneeds = train[train['target']=='requests_or_urgent_needs']
missing = train[train['target']=='missing_or_found_people']


In [57]:
rescue['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='red',
    yTitle='count',
    title='Rescue Text Length Distribution')

rescue['text_word_count'].iplot(
    kind='hist',
    bins=50,
    xTitle='text length',
    linecolor='black',
    color='red',
    yTitle='count',
    title='Rescue Text word count')

In [45]:
relevantinfo['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='orange',
    yTitle='count',
    title='Relevant Info Text Length Distribution')

In [46]:
damage['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='yellow',
    yTitle='count',
    title='Damage Text Length Distribution')

In [47]:
support['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='green',
    yTitle='count',
    title='Support Text Length Distribution')

In [49]:
injury['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='blue',
    yTitle='count',
    title='Injured or Dead Text Length Distribution')

In [50]:
caution['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='purple',
    yTitle='count',
    title='Caution and Advice Text Length Distribution')

In [51]:
displacement['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='pink',
    yTitle='count',
    title='Displacement and Evac Text Length Distribution')

In [53]:
nothumanitarian['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='lightblue',
    yTitle='count',
    title='Not Humanitarian Text Length Distribution')

In [54]:
urgentneeds['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='lightgreen',
    yTitle='count',
    title='Urgent Needs Text Length Distribution')

In [55]:
missing['text_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='text length',
    linecolor='black',
    color='lightpurple',
    yTitle='count',
    title='Missing or Found Text Length Distribution')

In [59]:
#source of code : https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    """
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [60]:
pos_unigrams = get_top_n_words(rescue['text_clean'],20)
# neg_unigrams = get_top_n_words(neg['text_clean'],20)
# neutral_unigrams = get_top_n_words(neutral['text_clean'],20)



#for word, freq in top_unigrams:
    #print(word, freq)
df1 = pd.DataFrame(pos_unigrams, columns = ['Text' , 'count'])
df1.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
    kind='bar', yTitle='Count', linecolor='black',color='red', title='Top 20 Unigrams in positve text',orientation='h')

# df2 = pd.DataFrame(neg_unigrams, columns = ['Text' , 'count'])
# df2.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
#     kind='bar', yTitle='Count', linecolor='black', color='green',title='Top 20 Unigrams in negative text',orientation='h')

# df3 = pd.DataFrame(neutral_unigrams, columns = ['Text' , 'count'])
# df3.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
#     kind='bar', yTitle='Count', linecolor='black', title='Top 20 Unigrams in neutral text',orientation='h')

In [62]:
def get_top_n_gram(corpus,ngram_range,n=None):
    vec = CountVectorizer(ngram_range=ngram_range,stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [63]:
pos_bigrams = get_top_n_gram(rescue['text_clean'],(2,2),20)
# neg_bigrams = get_top_n_gram(neg['text_clean'],(2,2),20)
# neutral_bigrams = get_top_n_gram(neutral['text_clean'],(2,2),20)



#for word, freq in top_bigrams:
    #print(word, freq)
df1 = pd.DataFrame(pos_bigrams, columns = ['Text' , 'count'])
df1.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
    kind='bar', yTitle='Count', linecolor='black',color='red', title='Top 20 Bigrams in positve text',orientation='h')

# df2 = pd.DataFrame(neg_bigrams, columns = ['Text' , 'count'])
# df2.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
#     kind='bar', yTitle='Count', linecolor='black', color='green',title='Top 20 Bigrams in negative text',orientation='h')

# df3 = pd.DataFrame(neutral_bigrams, columns = ['Text' , 'count'])
# df3.groupby('Text').sum()['count'].sort_values(ascending=True).iplot(
#     kind='bar', yTitle='Count', linecolor='black', title='Top 20 Bigrams in neutral text',orientation='h')

In [64]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [65]:
# Tokenizing the first text
print(train['text'][10]) # original sentence
print(tokenizer.tokenize(train['text'][10], add_special_tokens=True))
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train['text'][10])))

Keyword arguments {'add_special_tokens': True} not recognized.


Are we sure we dont need more help? #FortMacFire
['are', 'we', 'sure', 'we', 'don', '##t', 'need', 'more', 'help', '?', '#', 'fort', '##mac', '##fire']
[2024, 2057, 2469, 2057, 2123, 2102, 2342, 2062, 2393, 1029, 1001, 3481, 22911, 10273]


In [66]:
tokenizer.sep_token, tokenizer.sep_token_id
tokenizer.cls_token, tokenizer.cls_token_id

('[SEP]', 102)

('[CLS]', 101)

In [67]:
# calculating length of the longest text

max_len = 0

for text in train['text']:

    # Tokenize the text and add special tokens i.e `[CLS]` and `[SEP]`
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))


print('Max length: ', max_len)

Max length:  314


In [70]:
text = train['text'].values
target = train['target'].values

input_ids = []
attention_masks = []
token_type_ids = []

for i in range(len(text)):
    encoded = tokenizer.encode_plus(
      target[i],
      text[i],
      add_special_tokens=True,
      max_length=350,
      pad_to_max_length=True,
      return_token_type_ids=True,
      return_attention_mask=True,
      return_tensors='pt'
    )
    
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
    token_type_ids.append(encoded['token_type_ids'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
token_type_ids = torch.cat(token_type_ids, dim=0)

print('Original text: ',text[10])
print(len(input_ids[10]))
print(input_ids[10])
print(attention_masks[10])
print(token_type_ids[10])

Original text:  Are we sure we dont need more help? #FortMacFire
350
tensor([  101,  5343,  1035,  6951,  2075,  1035,  2030,  1035, 13445,  1035,
         3947,   102,  2024,  2057,  2469,  2057,  2123,  2102,  2342,  2062,
         2393,  1029,  1001,  3481, 22911, 10273,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,   