In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
my_path = 'C:\\Users\\shrey\\Desktop\\ML-DL-NLP\\Projects\\twitter'

In [3]:
train = pd.read_csv(f'{my_path}/train/train_E6oV3lV.csv')

print(train.shape)
train.head(10)

(31962, 3)


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [4]:
#Number of Words

train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" "))) # split and get the length of the sentence when whitespace is found
train[['tweet','word_count']].head()


Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [5]:
#Number of Characters
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','word_count', 'char_count']].head()

Unnamed: 0,tweet,word_count,char_count
0,@user when a father is dysfunctional and is s...,21,102
1,@user @user thanks for #lyft credit i can't us...,22,122
2,bihday your majesty,5,21
3,#model i love u take with u all the time in ...,17,86
4,factsguide: society now #motivation,8,39


In [6]:
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [7]:
"""We will also extract another feature which will calculate the average word length of each tweet. 
This can also potentially help us in improving our model.

Here, we simply take the sum of the length of all the words and divide it by the total length of the tweet:"""

def avg_word(sentence):
    words = sentence.split()
    return (sum(len(x) for x in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [8]:
"""Number of stopwords

Generally, while solving an NLP problem, the first thing we do is to remove the stopwords.
But sometimes calculating the number of stopwords can also give us some extra information which we might have been losing before.

Here, we have imported stopwords from NLTK, which is a basic NLP library in python."""

from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [9]:
""" Number of special characters

One more interesting feature which we can extract from a tweet is calculating the number of 
hashtags or mentions present in it. This also helps in extracting extra information from our text data.

Here, we make use of the ‘starts with’ function because hashtags (or mentions) always appear at the beginning of a word."""

train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [10]:
"""Number of numerics

Just like we calculated the number of words, we can also calculate the number of numerics which are present in the tweets. 
It does not have a lot of use, but this is still a useful feature that should be run while doing similar exercises.""" 

train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [11]:
'''Number of Uppercase words

Anger or rage is quite often expressed by writing in UPPERCASE words which makes this 
a necessary operation to identify those words.'''

train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [12]:
""" Lower case

The first pre-processing step which we will do is transform our tweets into lower case. 
This avoids having multiple copies of the same words. For example, while calculating the word count, 
‘Analytics’ and ‘analytics’ will be taken as different words."""

train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [13]:
'''Removing Punctuation

The next step is to remove punctuation, as it doesn’t add any extra information while treating text data. 
Therefore removing all instances of it will help us reduce the size of the training data.'''

train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [14]:
""" Removal of Stop Words

As we discussed earlier, stop words (or commonly occurring words) should be removed from the text data. 
For this purpose, we can either create a list of stopwords ourselves or we can use predefined libraries."""

from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [15]:
 """Common word removal

Previously, we just removed commonly occurring words in a general sense. 
We can also remove commonly occurring words from our text data First, let’s check the 
10 most frequently occurring words in our text data then take call to remove or retain."""

freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10] #Naive Method
freq

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [16]:
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [17]:
"""Rare words removal

Similarly, just as we removed the most common words, this time let’s remove rarely 
occurring words from the text. Because they’re so rare, the association between them and 
other words is dominated by noise. You can replace rare words with a more general form and 
then this will have higher counts"""

freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq

soiree               1
ððððððððmaternity    1
carriefisher         1
fulk                 1
nuff                 1
ufc207               1
mit                  1
bihdaycelebration    1
thatd                1
tucked               1
dtype: int64

In [18]:
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [19]:
"""Spelling correction

We’ve all seen tweets with a plethora of spelling mistakes. Our timelines are often filled 
with hastly sent tweets that are barely legible at times.

In that regard, spelling correction is a useful pre-processing step because this also 
will help us in reducing multiple copies of words. For example, “Analytics” and “analytcs” 
will be treated as different words even if they are used in the same sense.

To achieve this we will use the textblob library. If you are not familiar with it, 
you can check my previous article on ‘NLP for beginners using textblob’.
"""

from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [20]:
"""Tokenization

Tokenization refers to dividing the text into a sequence of words or sentences. 
In our example, we have used the textblob library to first transform our tweets into a blob and then converted them into a series of words."""

TextBlob(train['tweet'][0]).words #check for all the tokens of 1st tweet

WordList(['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run'])

In [21]:
"""Stemming

Stemming refers to the removal of suffices, like “ing”, “ly”, “s”, etc. 
by a simple rule-based approach. For this purpose, we will use PorterStemmer from the 
NLTK library."""

from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0            father dysfunct selfish drag kid dysfunct run
1        thank lyft credit cant use caus dont offer whe...
2                                           bihday majesti
3                                  model take urð ðððð ððð
4                                  factsguid societi motiv
5        22 huge fan fare big talk leav chao pay disput...
6                                     camp tomorrow dannyâ
7        next school year year examsð cant think school...
8        land allin cav champion cleveland clevelandcavali
9                                               welcom gr8
10       ireland consum price index mom climb previou 0...
11       selfish orlando standwithorlando pulseshoot or...
12                      get see daddi today 80day gettingf
13       cnn call michigan middl school build wall chan...
14       comment australia opkillingbay seashepherd hel...
15                ouchjunior angryðgot7 junior yugyoem omg
16                                 thank paner thank pos

In [22]:
 """Lemmatization

Lemmatization is a more effective option than stemming because it converts the word into 
its root word, rather than just stripping the suffices. 
It makes use of the vocabulary and does a morphological analysis to obtain the root word.
Therefore, we usually prefer using lemmatization over stemming."""

from textblob import Word
train['tweet'] = train['tweet'][:].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [23]:
"""N-grams

N-grams are the combination of multiple words used together. Ngrams with N=1 are called 
unigrams. Similarly, bigrams (N=2), trigrams (N=3) and so on can also be used.

Unigrams do not usually contain as much information as compared to bigrams and trigrams. 
The basic principle behind n-grams is that they capture the language structure, 
like what letter or word is likely to follow the given one. The longer the n-gram 
(the higher the n), the more context you have to work with. Optimum length really depends 
on the application – if your n-grams are too short, you may fail to capture important 
differences. On the other hand, if they are too long, you may fail to capture the 
“general knowledge” and only stick to particular cases.

So, let’s quickly extract bigrams from our tweets using the ngrams function of the textblob library."""

TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [24]:
"""Term frequency

Term frequency is simply the ratio of the count of a word present in a sentence, to the length of the sentence.

Therefore, we can generalize term frequency as:

TF = (Number of times term T appears in the particular row) / (number of terms in that row)

Below, I have tried to show you the term frequency table of a tweet.

Scikit-learn’s Tfidftransformer and Tfidfvectorizer aim to do the same thing, 
which is to convert a collection of raw documents to a matrix of TF-IDF features

"""

tf1 = (train['tweet'][:]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,selfish,10.0
1,father,903.0
2,dysfunction,3.0
3,drag,9.0
4,dysfunctional,1.0
5,run,117.0
6,kid,257.0
7,use,114.0
8,van,7.0
9,cant,802.0


In [25]:
"""Inverse Document Frequency

The intuition behind inverse document frequency (IDF) is that a word is not of much use 
to us if it’s appearing in all the documents.

Therefore, the IDF of each word is the log of the ratio of the total number of 
rows to the number of rows in which that word is present.

IDF = log(N/n), where, N is the total number of rows and n is the number of rows in 
which the word was present.

Inverse Document Frequency (IDF): is a scoring of how rare the word is across documents. 
IDF is a measure of how rare a term is. Rarer the term, more is the IDF score.

So, let’s calculate IDF for the same tweets for which we calculated the term frequency."""

for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,selfish,10.0,7.664253
1,father,903.0,3.475609
2,dysfunction,3.0,9.679156
3,drag,9.0,6.346951
4,dysfunctional,1.0,10.372303
5,run,117.0,4.642203
6,kid,257.0,4.567168
7,use,114.0,3.552287
8,van,7.0,5.236505
9,cant,802.0,3.538194


In [26]:
""" Term Frequency – Inverse Document Frequency (TF-IDF)

TF-IDF is the multiplication of the TF and IDF which we calculated above."""

tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,selfish,10.0,7.664253,76.642528
1,father,903.0,3.475609,3138.474606
2,dysfunction,3.0,9.679156,29.037467
3,drag,9.0,6.346951,57.122562
4,dysfunctional,1.0,10.372303,10.372303
5,run,117.0,4.642203,543.137774
6,kid,257.0,4.567168,1173.762178
7,use,114.0,3.552287,404.960674
8,van,7.0,5.236505,36.655532
9,cant,802.0,3.538194,2837.631778


In [27]:
"""We can see that the TF-IDF has penalized words like ‘don’t’, ‘can’t’, and ‘use’ 
because they are commonly occurring words. However, it has given a high weight to 
“disappointed” since that will be very useful in determining the sentiment of the tweet.

We don’t have to calculate TF and IDF every time beforehand and then multiply it to 
obtain TF-IDF. Instead, sklearn has a separate function to directly obtain it:"""

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,2))
train_vect = tfidf.fit_transform(train['tweet'])

train_vect

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 120495 stored elements in Compressed Sparse Row format>

In [28]:
""" Bag of Words

Bag of Words (BoW) refers to the representation of text which describes the presence of 
words within the text data. The intuition behind this is that two similar text fields 
will contain similar kind of words, and will therefore have a similar bag of words. Further, that from the text alone we can learn something about the meaning of the document.

For implementation, sklearn provides a separate function for it as shown below:"""

from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128378 stored elements in Compressed Sparse Row format>

In [29]:
"""Sentiment Analysis

If you recall, our problem was to detect the sentiment of the tweet.
So, before applying any ML/DL models (which can have a separate feature detecting the 
sentiment using the textblob library), let’s check the sentiment of the first few tweets."""

train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.0, 0.0)
4                    (0.0, 0.0)
Name: tweet, dtype: object

In [30]:
"""Above, you can see that it returns a tuple representing polarity and subjectivity of 
each tweet. Here, we only extract polarity as it indicates the sentiment as value nearer 
to 1 means a positive sentiment and values nearer to -1 means a negative sentiment. 
This can also work as a feature for building a machine learning model."""

train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['tweet','sentiment']].head()

Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drag kid dysfunct...,-0.3
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0


In [31]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = './glove.6B.50d.txt/glove.6B.50d.txt'

In [32]:
word2vec_output_file = 'glove.6B.100d.txt.word2vec'

In [33]:
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 50)

In [34]:
from gensim.models import KeyedVectors # load the Stanford GloVe model

model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [35]:
model['go']

array([ 1.4828e-01,  1.7761e-01,  4.2346e-01, -3.1489e-01,  3.2273e-01,
       -7.2413e-01, -7.8955e-01,  4.9214e-01, -2.0693e-01, -5.5088e-04,
       -4.7877e-01,  2.8853e-01, -5.7376e-01,  2.7217e-01,  1.1129e+00,
        5.7808e-01,  6.9321e-01, -2.8652e-01, -5.4545e-02, -6.1826e-01,
        1.7227e-01,  2.9263e-01,  3.8184e-01,  6.2186e-01,  5.5461e-01,
       -1.7411e+00, -2.8802e-01, -1.7140e-01,  7.4743e-01, -1.0135e+00,
        3.3596e+00,  1.1370e+00, -1.0028e+00,  1.7685e-01, -6.1795e-03,
       -6.3491e-02,  1.9077e-01,  4.4046e-02,  3.8228e-01, -4.1607e-01,
       -5.0359e-01, -8.3803e-02,  1.7508e-01,  4.0420e-01,  7.7324e-02,
        1.7415e-01,  1.2541e-01, -2.1820e-01,  1.2971e-01,  3.2953e-01],
      dtype=float32)

In [36]:
model['away']

array([ 0.34176  , -0.32715  ,  0.66209  , -0.71138  ,  0.28488  ,
       -0.19242  , -0.85185  ,  0.56403  , -0.13852  , -0.06717  ,
       -0.42702  , -0.20546  , -0.70012  , -0.13799  ,  0.29457  ,
        0.1881   ,  0.50458  , -0.14432  , -0.73977  , -0.63253  ,
        0.06105  ,  0.55907  ,  0.45083  ,  0.16689  ,  0.55929  ,
       -1.924    ,  0.48437  ,  0.66656  ,  0.89432  , -1.0412   ,
        3.1784   ,  1.0617   , -0.15902  ,  0.0067243, -0.35329  ,
        0.39728  , -0.44211  ,  0.41718  ,  0.38365  , -0.39747  ,
       -0.15511  ,  0.21717  ,  0.047058 ,  0.3904   , -0.20639  ,
        0.075575 ,  0.09143  , -1.0418   ,  0.24466  , -1.1117   ],
      dtype=float32)

In [37]:
(model['go'] + model['away'])

array([ 0.49004   , -0.14953999,  1.0855501 , -1.02627   ,  0.60761   ,
       -0.91655   , -1.6414    ,  1.05617   , -0.34544998, -0.06772088,
       -0.90579   ,  0.08307   , -1.27388   ,  0.13418001,  1.40747   ,
        0.76618   ,  1.19779   , -0.43084002, -0.794315  , -1.25079   ,
        0.23332   ,  0.85169995,  0.83267   ,  0.78875005,  1.1139    ,
       -3.6651    ,  0.19634998,  0.49515998,  1.6417501 , -2.0547    ,
        6.538     ,  2.1987    , -1.1618199 ,  0.1835743 , -0.3594695 ,
        0.333789  , -0.25134   ,  0.461226  ,  0.76593   , -0.81354   ,
       -0.6587    ,  0.133367  ,  0.222138  ,  0.7946    , -0.12906599,
        0.24972501,  0.21684   , -1.26      ,  0.37437   , -0.78217006],
      dtype=float32)

In [38]:
(model['go'] + model['away'])/2

array([ 0.24502   , -0.07477   ,  0.54277503, -0.513135  ,  0.303805  ,
       -0.458275  , -0.8207    ,  0.528085  , -0.17272499, -0.03386044,
       -0.452895  ,  0.041535  , -0.63694   ,  0.06709   ,  0.703735  ,
        0.38309   ,  0.598895  , -0.21542001, -0.3971575 , -0.625395  ,
        0.11666   ,  0.42584997,  0.416335  ,  0.39437503,  0.55695   ,
       -1.83255   ,  0.09817499,  0.24757999,  0.82087505, -1.02735   ,
        3.269     ,  1.09935   , -0.58090997,  0.09178715, -0.17973475,
        0.1668945 , -0.12567   ,  0.230613  ,  0.382965  , -0.40677   ,
       -0.32935   ,  0.0666835 ,  0.111069  ,  0.3973    , -0.064533  ,
        0.12486251,  0.10842   , -0.63      ,  0.187185  , -0.39108503],
      dtype=float32)