### Loading Data  

In [1]:
# Mount google drive to import data
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# unzip the folder from drive to local space on colab
!unzip -q "/content/gdrive/MyDrive/Workshop/nlp-getting-started.zip" 

###Preprocessing

In [3]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string
from nltk.stem.porter import PorterStemmer
import numpy as np

#Read data in to a pandas dataframe
df = pd.read_csv('/content/train.csv')

#drop the columns which have Null values
df = df.dropna(how="any", axis=1)

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    #Lower Case
    text = str(text).lower()

    #Remove links starting with https/www
    text = re.sub('https?://\S+|www\.\S+', '', text)

    #Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    #Remove new line character
    text = re.sub('\n', '', text)

    #Remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)

    return text

#load the english language stop words list
stop_words = stopwords.words('english')

#We can also add more stopwords according to our data/problem
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

#Define a function to remove the stop words from the corpus
def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

#Initialize the Porter Stemmer Object
stemmer = PorterStemmer()

#define the function to stem the words in the corpus
def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)

    # Remove stopwords
    text = remove_stopwords(text)

    # Stemm all the words in the sentence
    text = stemm_text(text)
    
    return text

df['text_clean'] = df['text'].apply(preprocess_data)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,id,text,target,text_clean
0,1,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada
2,5,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...
3,6,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...


### Word2vec
Word2vec is vectorized representation of text. It is a great way to incorporate the context in the tasks related to the language modeling e.g. next word prediction, text similarity, recommendation system, chat bots etc.

<img src='https://drive.google.com/uc?id=1AJ0ih-X4AOIlMsGjPZma8PlnjWh6-ek-' height='300px' width='600px' align='center'>

Points to ponder upon:


*   Straight Red column
*   Blue Column

This incorporates the understanding of semantics of human language into the neural network.

**Another Example**<br>
Sentence 1: The **child** said he would grow up to be a doctor.
<br>
Sentence 2: The **kid** said he would grow up to be a doctor.
<br>
The embedding of kid and child would be similar in the vector space.
<br>
<br>
* The deep learning technique Word2vec can be acheived through 2 algorithms:
1. **CBOW**: Predicts the target word from the context.
2. **SkipGram**: Predicts the context using the target word.

<br>
We can prepare the data using the aforementioned algorithms, train them using neural networks and get the embeddings. 

Other Models:
* GLOVE
* FASTEXT





In [4]:
#Import gensim to load the word2vec model
import gensim
from gensim.models import Word2Vec

In [5]:
#Tokenize the preprocessed text
words_in_sentences=[]
for i in df['text_clean']:
    words_in_sentences.append(i.split())

In [6]:
# Actual tweet vs. tokenized tweet

print(df['text_clean'][0])
print(words_in_sentences[0])

deed reason earthquak may allah forgiv us
['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us']


In [7]:
# Train the word2vec model
# We are passing the tokenized tweets
# We have set the window size 5
# Minimum count =1

w2v_model = gensim.models.Word2Vec(words_in_sentences,
                                   window=5,
                                   min_count=1)

In [8]:
# it represents all of the words that our Word2Vec model learned a vector for. 
# Or put another way, it's all of the words that appeared in the training data at least.

# Total size of the vocabulary
print(len(w2v_model.wv.index2word))

# let's print first 5 words of the vocabulary
print(w2v_model.wv.index2word[:10])

13735
['like', 'fire', 'get', 'amp', 'bomb', 'new', 'via', 'one', 'peopl', 'go']


In [9]:
# Generate embeddings for each word in the dataset
# Generate the sentences which now have the embeddings in place of the tokens

embeddings = []
for sequence in words_in_sentences:
  embedding = []
  for word in sequence:
    vector = w2v_model.wv[word]
    embedding.append(vector)
  embeddings.append(embedding)

In [10]:
#--------------TODO-------------#

# check the length of the embedding vector and compare it to the total number of tweets

#INSERT YOUR CODE HERE

#check the length of first tweet and the learned embeddings

#INSERT YOUR CODE HERE

len(embeddings[0])
print(words_in_sentences[0])



['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us']


In [11]:
#Vocabulary of learned word2vec
words=set(w2v_model.wv.index2word)

In [12]:
# Similar words to token "earthquak" in our corpus
w2v_model.wv.most_similar('earthquak')

[('say', 0.9980059862136841),
 ('see', 0.9979246854782104),
 ('storm', 0.9978454113006592),
 ('amp', 0.9978289604187012),
 ('evacu', 0.9978169798851013),
 ('via', 0.9978048801422119),
 ('go', 0.9978014230728149),
 ('like', 0.9977470636367798),
 ('one', 0.9977437257766724),
 ('know', 0.9977187514305115)]

In [13]:
# Similar words to token "forest" in our corpus
w2v_model.wv.most_similar('forest')

[('fire', 0.9981693029403687),
 ('emerg', 0.9981561303138733),
 ('amp', 0.9981023669242859),
 ('dont', 0.9980595707893372),
 ('make', 0.9980258345603943),
 ('say', 0.9980043172836304),
 ('flood', 0.998002827167511),
 ('time', 0.9979925751686096),
 ('go', 0.9979838728904724),
 ('get', 0.9979726076126099)]

In [14]:
#-----------------TODO---------------#

# Retrieve the similar words of "deed" and observe the difference in the similarity scores

# INSERT YOUR CODE HERE

Average Word2Vec for converting a given Sentence into numerical vector.
Note: Word2Vec is capable of providing an embedding for a given word but not for a sentence.

Average Word2Vec is a technique in which the average of word embeddings of all the words given in a sentence is used as the numerical vector for a given sentence.

### **Reading Material**

#### **Wrod2vec**
* https://www.kaggle.com/code/andreshg/nlp-glove-bert-tf-idf-lstm-explained/notebook#5.-Vectorization
* https://github.com/krishnaik06/NLP-Live/blob/main/Day%205-%20NLP%20Word2vec%20And%20AvgWord2vec.ipynb
* https://github.com/dilipvaleti/Binary-Classification-using-word2vect/blob/main/Classification%20using%20word2vect.ipynb

