# Semantic and Sentiment analysis

- understand semantic word vectors
- understand sentiment analysis
- leverage Sentiment analysis for txt classification 

In [1]:
# inorder to use Spacy's embedded word vectors we must download the larger 
# spacy english models 
# full details @ https://spacy.io/usage/models

- how word vectors are created
- word2vec is a two layer neural net that processes text
- input is text corpus and output is set of vectors
- purpose : group similar words together in the vecotr space

- trains words aganist neighbor words
- either use context to predict a target word (CBOW)
- or user word to predict a target context

- cosine similarity used to see the similarity
- we can perform vector arthemetic with word vectors- (new_vector = king -man + woman)
- new vector closest to queen

## Word vectors with Spacy and python

In [2]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_lg')

In [5]:
nlp(u'lion').vector

array([  1.2746  ,   0.46242 ,  -1.1829  ,  -5.2661  ,  -2.7128  ,
         1.8521  ,  -0.94273 ,   2.1865  ,   6.503   ,   0.6704  ,
         1.5361  ,   2.5992  ,  -0.36233 ,   4.3965  ,  -6.5644  ,
         1.6141  ,  -1.2897  ,   2.1184  ,  -0.63654 ,  -3.4572  ,
        -4.3771  ,   4.2074  ,  -3.6411  ,  -0.97214 ,   1.3253  ,
        -2.3125  ,  -3.6531  ,  -2.8398  ,   2.7913  ,  -1.53    ,
        -2.9984  ,  -2.6357  ,   0.50615 ,  -2.6925  ,   4.3401  ,
        -5.6017  ,   0.045691,   4.3832  ,  -0.19535 ,  -1.0751  ,
         0.32172 ,   2.4395  ,   4.6638  ,   3.4471  ,  -3.3847  ,
        -1.8238  ,   0.70212 ,   0.58557 ,   5.0032  ,  -3.1072  ,
         1.2364  ,   7.4595  ,   0.057368,   1.0111  ,  -1.0827  ,
         0.69113 ,   2.8009  ,  -3.4383  ,  -1.0599  ,  -2.2627  ,
        -5.149   ,  -5.0636  ,   3.1405  ,   1.0793  ,  -0.72892 ,
        -3.9939  ,  -0.69551 ,  -0.55767 ,   3.2555  ,  -2.9449  ,
         4.7114  ,   1.6388  ,   1.3828  ,   1.4255  ,  -3.233

In [6]:
# doc and span objects also have vectors, they are derived from avg of token
# vectors
# document to vector

In [8]:
nlp('The qucick box fox jumped').vector.shape

(300,)

In [17]:
tokens = nlp(u'lion cat pet')

In [18]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0


In [19]:
tokens = nlp(u'like love hate')
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

like like 1.0
like love 0.5212638974189758
like hate 0.5065141320228577
love like 0.5212638974189758
love love 1.0
love hate 0.5708349943161011
hate like 0.5065141320228577
hate love 0.5708349943161011
hate hate 1.0


In [20]:
# sometimes it's good to represent the dimension as eucledian norm 
# vector.norm

In [22]:
nlp.vocab.vectors.shape

(514157, 300)

In [50]:
tokens = nlp(u"dog cat nargle Arun")

In [51]:
# OOV stands for out of vocabulary

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 75.254234 False
cat True 63.188496 False
nargle False 0.0 True
Arun True 40.841415 False


In [61]:
from scipy import spatial

In [62]:
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [63]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [64]:
# king - man + woman ---> new vector similar to queen, princess, highness

In [65]:
new_vector = king-man+woman

In [85]:
computed_similarity = []
vect = []
for s in nlp.vocab.vectors:
    vocab = nlp.vocab[s]

for word in nlp.vocab:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarity.append((word, similarity))

In [86]:
computed_similarity = sorted(computed_similarity, key = lambda item: -item[1])

In [87]:
print([t[0].text for t in computed_similarity[:10]])

['king', 'kings', 'princes', 'consort', 'monarch', 'princesses', 'princess', 'ruler', 'consorts', 'rulers']


### we are getting wrong ans because of the updated version does not check with all
### correct it by:
 for s in nlp.vocab.vectors:
    
    vocab = nlp.vocab[s]

## Sentiment Analysis

## VADER Sentiment with Python and NLTK

- we don't have labels
- vader, sentiment reasoning
- nltk package avaiable
- every word has a -ve and +ve score
- understand phrase , so don't love recognized

In [89]:
import nltk

In [90]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...


True

In [91]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [92]:
sid = SentimentIntensityAnalyzer()

In [93]:
a = 'This is a good movie'

In [95]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [96]:
a = 'this was the most awesome, best movie EVER MADE'

In [97]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.441, 'pos': 0.559, 'compound': 0.8712}

In [98]:
a = 'this was the WORST movie that has ever disgraced the screen'

In [100]:
sid.polarity_scores(a)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [101]:

# amazon review

In [102]:
import pandas as pd

In [104]:
df = pd.read_csv('resources/amazonreviews.tsv', sep='\t')

In [105]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [106]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [107]:
df.dropna(inplace= True)

In [110]:
# removing blanks

blanks = []
for i, lb, rv in df.itertuples():
    # (index, label, review)
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)

In [111]:
blanks

[]

In [113]:
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [114]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [119]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

In [120]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [121]:
df['compound'] = df['scores'].apply(lambda d: d['compound'])

In [122]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [123]:
df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score >=0 else 'neg')

In [124]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [126]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [127]:
accuracy_score(df['label'], df['comp_score'])

0.7097

In [128]:
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [129]:
print(confusion_matrix(df['label'], df['comp_score']))

[[2629 2468]
 [ 435 4468]]


## Sentiment Analysis

- sentiment analysis

In [149]:
import numpy as np
import pandas as pd

In [150]:
df = pd.read_csv('resources/moviereviews.tsv', sep='\t')

In [151]:
df.dropna(inplace = True)

In [152]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [153]:
blanks = []
index = 0

for reviews in df['review']:
    if reviews.isspace()== True:
        blanks.append(index)
    index = index +1 

In [154]:
blanks

[57,
 71,
 146,
 150,
 280,
 304,
 310,
 320,
 339,
 347,
 423,
 496,
 626,
 668,
 803,
 838,
 963,
 1063,
 1278,
 1430,
 1466,
 1498,
 1504,
 1730,
 1818,
 1871,
 1958]

In [155]:
df.drop(index=blanks, inplace = True)

In [156]:
df.shape

(1938, 2)

In [157]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [158]:
sid = SentimentIntensityAnalyzer()

In [160]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda d:d['compound'])
df['comp_score'] =df['compound'].apply(lambda score: 'pos' if score >= 0 else 'neg')

In [161]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [163]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [165]:
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.43      0.54       969
         pos       0.59      0.83      0.69       969

    accuracy                           0.63      1938
   macro avg       0.66      0.63      0.62      1938
weighted avg       0.66      0.63      0.62      1938



In [166]:
print(confusion_matrix(df['label'], df['comp_score']))

[[419 550]
 [161 808]]



# Assesment

In [167]:
# perform vector arthemetic on your own words

In [169]:
import numpy as np
import pandas as pd
import spacy

In [170]:
nlp = spacy.load('en_core_web_lg')

In [178]:
words = nlp(u'pen cold winter dessert')

In [179]:
for token1 in words:
    for token2 in words:
        print(token1.text, token2.text, token1.similarity(token2))

pen pen 1.0
pen cold 0.08533231168985367
pen winter -0.06380052864551544
pen dessert 0.03481709957122803
cold pen 0.08533231168985367
cold cold 1.0
cold winter 0.5590035915374756
cold dessert 0.24725329875946045
winter pen -0.06380052864551544
winter cold 0.5590035915374756
winter winter 1.0
winter dessert 0.2659265697002411
dessert pen 0.03481709957122803
dessert cold 0.24725329875946045
dessert winter 0.2659265697002411
dessert dessert 1.0


In [180]:
from scipy import spatial

In [204]:
vec = nlp.vocab['dessert'].vector
score =[]

cosine_similarity = lambda vec1, vec2: 1-spatial.distance.cosine(vec1, vec2)


    
# dist = spatial.distance(vec1, vec2)

In [215]:
for s in nlp.vocab.vectors:
    vocab = nlp.vocab[s]

for s in nlp.vocab:
    if s.is_alpha:
        sc = cosine_similarity(vec, s.vector)
        score.append((sc, s))
    

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [216]:
score[:10]

[(nan, <spacy.lexeme.Lexeme at 0x23604127400>),
 (0.009182250127196312, <spacy.lexeme.Lexeme at 0x23604127480>),
 (0.04615941271185875, <spacy.lexeme.Lexeme at 0x236041273c0>),
 (-0.025823593139648438, <spacy.lexeme.Lexeme at 0x23604127540>),
 (-0.0299764946103096, <spacy.lexeme.Lexeme at 0x23604127180>),
 (nan, <spacy.lexeme.Lexeme at 0x236041275c0>),
 (nan, <spacy.lexeme.Lexeme at 0x23604127640>),
 (-0.11177358031272888, <spacy.lexeme.Lexeme at 0x23604127680>),
 (-0.015846258029341698, <spacy.lexeme.Lexeme at 0x23604127700>),
 (-0.029810747131705284, <spacy.lexeme.Lexeme at 0x23604127580>)]

In [217]:
sorted_score = sorted(score, key = lambda x: -x[0])

In [219]:
for i in range(10):
    print(sorted_score[i][1].text)

nuthin
cheesecake
Dessert
savory
delicioso
custard
chocolatey
salads
snacky
Desserts


In [221]:
bed = nlp.vocab['bed']
blanket = nlp.vocab['blanket']
light = nlp.vocab['light']

vect = bed.vector+blanket.vector-light.vector

In [222]:
vec_score = []

for s in nlp.vocab:
    if s.has_vector:
        if s.is_alpha:
            sc = cosine_similarity(vect, s.vector)
            vec_score.append((sc, s))
            

In [223]:
sorted_score = sorted(vec_score, key = lambda x: -x[0])
    

In [224]:
for i in range(10):
    print(sorted_score[i][1].text)

bed
couch
cot
pillowtop
sofa
crib
Asofa
bedste
sofabed
bunk


CHALLENGE : write a function that takes 3 strings, performs a+b+c arthmetic and returns a top-ten result

In [230]:
def vector_math(a,b,c):
    bed = nlp.vocab[a]
    blanket = nlp.vocab[b]
    light = nlp.vocab[c]

    vect = bed.vector+blanket.vector-light.vector
    vec_score = []

    for s in nlp.vocab:
        if s.has_vector:
            if s.is_alpha:
                sc = cosine_similarity(vect, s.vector)
                vec_score.append((sc, s))
    sorted_score = sorted(vec_score, key = lambda x: -x[0])
    for i in range(20):
        print(sorted_score[i][1].text)
    
    

In [231]:
vector_math('king', 'man', 'woman')

king
kings
kingi
kingii
prince
ruler
kingly
princes
princeling
princedom
kingsize
kingdom
kinglink
kingship
kingpost
conqueror
knight
princelings
kingsnake
kinglet


## Task #2 Perform VADER sentiment analysis

write code to return a set of SentimentIntensityAnalyzer polarity scores

In [239]:
review = 'The movie was ok, the actors performed average, and the story directed descent'

In [240]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [241]:
sid = SentimentIntensityAnalyzer()

In [242]:
sid.polarity_scores(review)

{'neg': 0.0, 'neu': 0.845, 'pos': 0.155, 'compound': 0.296}

In [243]:
# write func

In [244]:
def review_rating(string):
    scores = sid.polarity_scores(string)
    if scores['pos'] > 0.2:
        print('good')
    else:
        print('not good')

In [245]:
review_rating(review)

not good
