In [1]:
# import the libraries
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re, string
import nltk
import spacy
import warnings
warnings.filterwarnings('ignore')

In [None]:

train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
sample_df=pd.read_csv('sample_submission.csv')

In [None]:
# check train_df
train_df.head()

In [None]:
# check the shape
train_df.shape

In [None]:
# check the missing data
train_df.isnull().sum()

In [None]:
# check the duplicated data
train_df.duplicated().sum()

In [None]:
# count rows whose 'keyword' value already appeared in an earlier row
train_df['keyword'].duplicated().sum()

In [None]:
train_df['location'].duplicated().sum()

In [None]:
train_df['text'].duplicated().sum()

In [None]:
# check test_df
test_df.head()

In [None]:
# check the shape
test_df.shape

In [None]:
# check the missing data
test_df.isnull().sum()

In [None]:
# check the duplicated data
test_df.duplicated().sum()

In [None]:
test_df['keyword'].duplicated().sum()

In [None]:
test_df['location'].duplicated().sum()

In [None]:
test_df['text'].duplicated().sum()

In [None]:
# check test_df
test_df.head()

In [None]:
# check the shape
test_df.shape

In [None]:
# check the missing data
test_df.isnull().sum()

In [None]:
# check the duplicated data
test_df.duplicated().sum()

In [None]:
import matplotlib.pyplot as plt

# Pair every wedge with the name of the class it actually represents.
# value_counts() orders its result by frequency, so a fixed label list is only
# correct while class 0 happens to be the majority; sorting by class label and
# looking the names up removes that hidden assumption.
target_names = {0: 'Non-disaster', 1: 'Disaster'}
target_counts = train_df['target'].value_counts().sort_index()
target_labels = [target_names.get(label, str(label)) for label in target_counts.index]

plt.pie(target_counts, labels=target_labels, autopct='%0.2f')
plt.legend()  # Adds a legend
plt.title('Distribution of Disaster Tweets')  # Adds a descriptive title
plt.show()

In [None]:
# seed handed to both WordCloud instances below
random_state = 4041

# import the wordcloud library
from wordcloud import WordCloud

# collect the tweets of each class, then merge them into one space-separated string
non_disaster_mask = train_df['target'] == 0
non_disaster_text = [''.join(tweet) for tweet in train_df.loc[non_disaster_mask, 'text']]
non_disaster_strings = ' '.join(str(tweet) for tweet in non_disaster_text)

disaster_mask = train_df['target'] == 1
disaster_text = [''.join(tweet) for tweet in train_df.loc[disaster_mask, 'text']]
disaster_strings = ' '.join(str(tweet) for tweet in disaster_text)

# rendering options shared by both clouds; only the non-disaster cloud sets a background colour
cloud_options = dict(width=800, height=400, max_words=500, random_state=random_state)
non_disaster_cloud = WordCloud(background_color='white', **cloud_options).generate(non_disaster_strings)
disaster_cloud = WordCloud(**cloud_options).generate(disaster_strings)

In [None]:
# draw the two generated clouds side by side
fig, axes = plt.subplots(1, 2, figsize = (20,20))

# (cloud, title) for the left and right panel respectively
panels = [
    (non_disaster_cloud, 'Non-disaster Tweets'),
    (disaster_cloud, 'Disaster Tweets'),
]
for ax, (cloud, title) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')  # turn the axis off
    ax.set_title(title, fontsize=16)

# show the figure
plt.show()

# Text Preprocessing, Part 1

- Remove URLs


In [None]:
def remove_url(text):
    """Return `text` with every http(s)://, ftp:// and file:// URL deleted."""
    url_pattern = r'((?:https?|ftp|file)://[-\w\d+=&@#/%?~|!:;\.,]*)'
    return re.sub(url_pattern, '', text)

# First cleaning step: the result goes into a new 'text_cleaned' column, so the
# raw 'text' column is left untouched. Both splits are processed the same way.
train_df['text_cleaned'] = train_df['text'].apply(remove_url)
test_df['text_cleaned'] = test_df['text'].apply(remove_url)

# Remove HTML Tags

In [None]:
def remove_HTML(text):
    """Return `text` with every `<...>` span (shortest match) deleted."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

train_df['text_cleaned'] = train_df['text_cleaned'].apply(remove_HTML)
test_df['text_cleaned'] = test_df['text_cleaned'].apply(remove_HTML)

# Remove Character References

In [None]:
def remove_references(text):
    """Return `text` with HTML character references deleted.

    Three forms are removed, each with an optional trailing ';':
      * named references such as '&amp;' or '&gt'
      * decimal references such as '&#39;'
      * hexadecimal references such as '&#x27;'

    Numeric references have to be handled here: if they are left in, the
    later digit-removal step strips only the digits and leaves '&#;' behind.

    NOTE: as before, the named form also matches an '&' directly followed by
    letters in ordinary text (e.g. 'R&B' becomes 'R'); that behaviour is kept
    so existing results for named references do not change.
    """
    reference_pattern = r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z]+);?'
    return re.sub(reference_pattern, '', text)

train_df['text_cleaned'] = train_df['text_cleaned'].apply(remove_references)
test_df['text_cleaned'] = test_df['text_cleaned'].apply(remove_references)


# Remove Non-printable Characters



In [None]:
string.printable

def remove_non_printable(text):
    """Return `text` keeping only the characters found in `string.printable`."""
    kept_chars = filter(lambda ch: ch in string.printable, text)
    return ''.join(kept_chars)

train_df['text_cleaned'] = train_df['text_cleaned'].apply(remove_non_printable)
test_df['text_cleaned'] = test_df['text_cleaned'].apply(remove_non_printable)


# Remove Numeric Values
Remove numeric values, including mixtures of alphabetical characters and numeric values such as 'M194', '5km'.

In [None]:
def remove_num(text):
    """Return `text` with every run of word characters that contains a digit deleted."""
    digit_token = re.compile(r'\w*\d+\w*')
    return digit_token.sub('', text)

train_df['text_cleaned'] = train_df['text_cleaned'].apply(remove_num)
test_df['text_cleaned'] = test_df['text_cleaned'].apply(remove_num)

train_df.tail()

test_df.tail()

"""# Feature Engineering
Below are 10 features we're going to create:

- Number of sentences
- Number of words
- Number of characters
- Number of hashtags
- Number of mentions
- Number of all caps words
- Average length of words
- Number of proper nouns (PROPN)
- Number of non-proper nouns (NOUN)
- Percentage of characters that are punctuation

# Number of Sentences
"""

In [None]:
import nltk
nltk.download('punkt_tab')

# Number of sentences per tweet: length of the list returned by nltk's
# sent_tokenize. Computed on the raw 'text' column, not on 'text_cleaned'.
train_df['sent_count'] = train_df['text'].apply(nltk.tokenize.sent_tokenize).apply(len)
test_df['sent_count'] = test_df['text'].apply(nltk.tokenize.sent_tokenize).apply(len)

# Number of words per tweet: length of the token list returned by nltk's
# word_tokenize, again on the raw 'text' column.
train_df['word_count'] = train_df['text'].apply(nltk.tokenize.word_tokenize).apply(len)
test_df['word_count'] = test_df['text'].apply(nltk.tokenize.word_tokenize).apply(len)


# Number of Characters

In [None]:
# create a new feature for the number of characters excluding white spaces
train_df['char_count'] = train_df['text'].apply(lambda x: len(x) - x.count(" "))
test_df['char_count'] = test_df['text'].apply(lambda x: len(x) - x.count(" "))

In [None]:
# Number of Hashtags

In [None]:
# define a function that returns the number of hashtags in a string
def hash_count(string):
    """Count the whitespace-separated tokens of `string` that begin with '#'."""
    total = 0
    for token in string.split():
        if token.startswith('#'):
            total += 1
    return total

# create a new feature for the number of hashtags
train_df['hash_count'] = train_df['text'].apply(hash_count)
test_df['hash_count'] = test_df['text'].apply(hash_count)

# Number of Mentions

In [None]:

# define a function that returns the number of mentions in a string
def ment_count(string):
    """Count the whitespace-separated tokens of `string` that begin with '@'."""
    return sum(token.startswith('@') for token in string.split())

# create a new feature for the number of mentions
train_df['ment_count'] = train_df['text'].apply(ment_count)
test_df['ment_count'] = test_df['text'].apply(ment_count)

# Number of All Caps Words

In [None]:
import re

def all_caps_count(string):
    """Return the number of all-caps words in `string`.

    A word counts when it is a run of two or more ASCII uppercase letters
    delimited by word boundaries. Matching is done over the whole text rather
    than on whitespace-split tokens, so attached punctuation or a leading
    '#'/'@' no longer hides a word: 'FIRE!' and '#BREAKING' are both counted.
    Single capitals ('I', 'A') and mixed tokens ('iPhone', 'ABC123') are not.

    NOTE(review): no cell in view assigns this function's result to a
    DataFrame column, although "Number of all caps words" is listed among the
    planned features -- confirm the feature is created elsewhere.
    """
    return len(re.findall(r'\b[A-Z]{2,}\b', string))

# Average Length of words

In [None]:
# define a function that returns the average length of words
def avg_word_len(string):
    """Return the mean length of the whitespace-separated words in `string`.

    The result is rounded to two decimals. Text with no words at all (empty
    or whitespace-only) yields 0.0 instead of raising ZeroDivisionError.
    """
    words = string.split()
    if not words:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return round(sum(map(len, words)) / len(words), 2)

# create a new feature for the average length of words
train_df['avg_word_len'] = train_df['text'].apply(avg_word_len)
test_df['avg_word_len'] = test_df['text'].apply(avg_word_len)

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

# define a function using nltk that returns the number of proper nouns in the text
def propn_count_nltk(text):
    """Count the tokens of `text` that nltk's POS tagger labels 'NNP' or 'NNPS'."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return sum(1 for _, tag in tagged_tokens if tag in ('NNP', 'NNPS'))


# create a new feature for the number of proper nouns
train_df['propn_count_nltk'] = train_df['text'].apply(propn_count_nltk)
test_df['propn_count_nltk'] = test_df['text'].apply(propn_count_nltk)

In [None]:

# check the results
train_df[['id', 'text', 'text_cleaned', 'propn_count_nltk']].head()

In [None]:
# test how nltk worked with the first text
# (named `sample_text`, not `string`, so the stdlib `string` module imported at
# the top of the notebook is not overwritten for cells that use it)
sample_text = "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
print([(token, tag) for (token, tag) in nltk.pos_tag(nltk.word_tokenize(sample_text)) if tag == 'NNP'])

In [None]:
# test how nltk works with the first text after lowercasing it
# (named `sample_text`, not `string`, so the stdlib `string` module imported at
# the top of the notebook is not overwritten for cells that use it)
sample_text = "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
print([(token, tag) for (token, tag) in nltk.pos_tag(nltk.word_tokenize(sample_text.lower())) if tag == 'NNP'])

In [None]:
# load the model
nlp = spacy.load('en_core_web_sm')

# check the same sentence with spaCy
# (named `sample_text`, not `string`, so the stdlib `string` module imported at
# the top of the notebook is not overwritten for cells that use it)
sample_text = "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
print([(token.text, token.pos_) for token in nlp(sample_text) if token.pos_=='PROPN'])

In [None]:

# define a function that returns number of proper nouns with spaCy
def propn_count(text, model=nlp):
    """Count the tokens of `model(text)` whose `pos_` attribute equals 'PROPN'."""
    return sum(1 for token in model(text) if token.pos_ == 'PROPN')

# create a new feature for numbers of proper nouns
train_df['propn_count'] = train_df['text'].apply(propn_count)
test_df['propn_count'] = test_df['text'].apply(propn_count)

In [None]:
# remove 'propn_count_nltk' columns
train_df = train_df.drop(['propn_count_nltk'], axis=1)
test_df = test_df.drop(['propn_count_nltk'], axis=1)


In [None]:

# check the results
train_df[['id', 'text', 'text_cleaned', 'propn_count']].head()

In [None]:
# Number of Non-proper Nouns (NOUN)

In [None]:
# define a function that returns number of non-proper nouns
def noun_count(text, model=nlp):
    """Count the tokens of `model(text)` whose `pos_` attribute equals 'NOUN'."""
    noun_total = 0
    for token in model(text):
        if token.pos_ == 'NOUN':
            noun_total += 1
    return noun_total

# create a new feature for numbers of non-proper nouns
train_df['noun_count'] = train_df['text'].apply(noun_count)
test_df['noun_count'] = test_df['text'].apply(noun_count)

# Percentage of Characters that are Punctuation

In [None]:
import string

# define a function that returns the percentage of punctuation
def punc_per(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Punctuation means membership in `string.punctuation`. The percentage is
    rounded to two decimals; 0 is returned when there is no punctuation or no
    non-space character.
    """
    non_space_chars = len(text) - text.count(" ")
    punc_chars = sum(ch in string.punctuation for ch in text)
    if punc_chars == 0 or non_space_chars == 0:
        return 0
    return round(punc_chars / non_space_chars * 100, 2)

# create a new feature for the percentage of punctuation in text
train_df['punc_per'] = train_df['text'].apply(punc_per)
test_df['punc_per'] = test_df['text'].apply(punc_per)

In [None]:

# check the results
train_df.tail()

In [None]:
# check the results
test_df.tail()
