# Words similarity analysis
This notebook was an attempt to check if we could avoid deep learning and predict sentiment based on similarity of word vectors to either positive or negative ones.

In [1]:
from itertools import chain
from pathlib import Path
import re
import string
from typing import List, Tuple, Union

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
import pandas as pd

In [2]:
data_ = pd.read_csv('/nlp/data/Tweets.csv')[['text', 'airline_sentiment']]
messages = data_['text']
sentiment = data_['airline_sentiment']
# work only on a sample - it should be enough to find out which words are the most popular in the negative or positive tweets.
data_ = data_.iloc[:2000]
messages = messages.iloc[:2000]
sentiment = sentiment.iloc[:2000]

## Removing punctuation marks from phrases
At the moment we don't bother with contraction and simply treat `aren't` as `aren` etc.

In [3]:
def remove_punctation_marks(msg: str) -> List[str]:
    return list(filter(None, re.split(r'[\W]', msg)))

def lower_str_list(str_list: List[str]) -> List[str]:
    return [w.lower() for w in str_list]

# few examples
for msg in messages[:3]:
    print(remove_punctation_marks(msg))
    print(lower_str_list(remove_punctation_marks(msg)))


['VirginAmerica', 'What', 'dhepburn', 'said']
['virginamerica', 'what', 'dhepburn', 'said']
['VirginAmerica', 'plus', 'you', 've', 'added', 'commercials', 'to', 'the', 'experience', 'tacky']
['virginamerica', 'plus', 'you', 've', 'added', 'commercials', 'to', 'the', 'experience', 'tacky']
['VirginAmerica', 'I', 'didn', 't', 'today', 'Must', 'mean', 'I', 'need', 'to', 'take', 'another', 'trip']
['virginamerica', 'i', 'didn', 't', 'today', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip']


## Find most popular words in negative, positive, and neutral tweets
I'll skip all stop words from the list from   [here](https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt)

They are far too popular to be inforamtive what sentiment the tweet has.

In [10]:
with open('/nlp/data/stopwords.txt') as f:
    stop_words = f.read().splitlines()
stop_words.remove('not') # we want to have negation words - can be helpful for negative sentiment
stop_words.remove('no')

In [5]:
data = data_.copy()
data['airline_sentiment'] = data_['airline_sentiment'].replace({'negative': -1, 'neutral': 0, 'positive': 1})


In [6]:
def get_most_popular_words_from_series(s: pd.Series, how_many=50, stop_words=[]) -> pd.Series:
    all_words = []
    for phrase in s:
        all_words = chain(all_words, remove_punctation_marks(phrase))
    all_words = lower_str_list(all_words)
    result = pd.Series(all_words).value_counts()
    result = result[~result.index.isin(stop_words)]
    return result[:how_many]

## The most popular words for given class
Chose a few words which are the most popular for a given class

In [11]:
# negative
neg_df = data[data['airline_sentiment'] == -1]
get_most_popular_words_from_series(neg_df['text'], stop_words=stop_words)

flight          2943
united          2899
usairways       2375
t               2263
americanair     2110
not             1357
no              1326
southwestair    1214
jetblue         1052
get              988
s                945
cancelled        926
now              833
service          750
2                735
hours            649
just             622
help             619
hold             614
customer         614
time             598
plane            532
m                521
delayed          508
amp              503
still            492
us               480
call             462
co               455
hour             452
flightled        448
one              442
http             437
will             435
bag              420
flights          420
gate             411
ve               399
don              388
late             378
back             375
need             373
phone            369
3                347
waiting          341
please           328
thanks           315
4            

In [20]:
negative_vectors = [
    'cancelled', 'get', 'service', 'not', 'no', 'now', 'hour', 'hold', 'delayed', 'still',
    'call', 'gate', 'late', 'bag', 'need', 'waiting', 'please', 'airline'
]

In [12]:
# positive
pos_df = data[data['airline_sentiment'] == 1]
get_most_popular_words_from_series(pos_df['text'], stop_words=stop_words)

thanks           611
jetblue          595
southwestair     576
united           528
thank            455
flight           381
t                355
americanair      355
usairways        276
great            236
co               233
http             217
s                208
just             179
service          162
virginamerica    156
love             136
will             116
customer         114
get              114
guys             110
much             109
good             109
best             105
awesome          100
got              100
time              97
now               90
us                87
help              84
today             83
amp               82
airline           81
amazing           78
not               78
back              73
m                 73
flying            70
crew              70
gate              65
fly               64
re                63
no                63
made              63
appreciate        62
like              61
ll                61
please       

In [19]:
positive_vectors = [
    'thank', 'great', 'appreciate', 'love', 'guys', 'much', 'good', 'best', 'awesome', 'us', 'amazing'
]

In [15]:
# neutral
pos_df = data[data['airline_sentiment'] == 0]
get_most_popular_words_from_series(pos_df['text'], stop_words=stop_words)

jetblue          748
united           737
t                732
southwestair     671
flight           615
co               526
http             501
americanair      499
s                403
usairways        402
get              238
will             217
please           182
virginamerica    177
just             173
help             170
flights          169
need             164
thanks           157
not              152
dm               132
m                130
no               118
2                115
now              114
tomorrow         109
us               107
fleek            107
know             104
fleet            103
cancelled        101
amp               98
time              98
way               94
change            88
one               80
fly               79
new               79
flying            78
back              78
like              77
today             77
number            76
check             74
see               71
go                71
thank             69
got          

In [16]:
# there is nothing explicitly characteristic so I will skip neutral for now

## Load glove similarity model
We don't want to work on the bag of words model due to its lack of similarity information between the words (e.g. word *hotel* is orthogonal to word *motel*).  
The alternative to the bag of words is using word representation like the [GloVe](https://nlp.stanford.edu/projects/glove/).  
It is encoding word into *n* dimensional vector (using PCA) which includes similarity of words based on its context (i.e. do they occur next to each other).  


In [17]:
glove_file = datapath('/nlp/data/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [21]:
def get_average_similarity(model: KeyedVectors, word: str, vectors: List[str]) -> Union[float, None]:
    """
    Predict average similarity betweeen a word and a vector of sentimental words.
    """
    similarity = 0
    for vec in vectors:
        try:
            similarity += model.similarity(word, vec)
        except KeyError as e:
            return None
    return similarity / len(vectors)


def get_pharse_average_similarity(
        phrase: List[str], model: KeyedVectors, vectors: List[str]) -> Tuple[float, float]:
    """
    Get average positive and negative similarity
    """
    vec_sim = 0
    words_count = 0
    for word in phrase:
        if word not in model.vocab:
            continue
        vec_sim += get_average_similarity(model, word, vectors)
        words_count += 1
    if vec_sim == 0:
        return np.nan
    return vec_sim / words_count


## Check similarity between class words and tweet text
If similarity between the words in a tweet text and the most popular words in a class is sufficient enough to distinct the negative class from the positive one we could ommit the deep learning and save computing power for more sophisticated tasks ;)

In [22]:
data_['positive_similarity'] = data_['text'].apply(get_pharse_average_similarity, args=(model, positive_vectors))
data_['negative_similarity'] = data_['text'].apply(get_pharse_average_similarity, args=(model, negative_vectors))
data_

Unnamed: 0,text,airline_sentiment,positive_similarity,negative_similarity
0,@VirginAmerica What @dhepburn said.,0,0.305686,0.305942
1,@VirginAmerica plus you've added commercials t...,1,0.289627,0.306438
2,@VirginAmerica I didn't today... Must mean I n...,0,0.309066,0.321683
3,@VirginAmerica it's really aggressive to blast...,-1,0.290075,0.300603
4,@VirginAmerica and it's a really big bad thing...,-1,0.321671,0.322343
5,@VirginAmerica seriously would pay $30 a fligh...,-1,0.284316,0.292499
6,"@VirginAmerica yes, nearly every time I fly VX...",1,0.284237,0.286410
7,@VirginAmerica Really missed a prime opportuni...,0,0.283590,0.295685
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",1,0.326240,0.319640
9,"@VirginAmerica it was amazing, and arrived an ...",1,0.304602,0.309279


# Summary
Unfortunately the similarity to classes is ambiguous and we can see clearly that it is not enough to distinct correct class from the other ones.  
**The next step is using deep learning and RNN for this task.**