### semantics and word verctors

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
# vector form of a word
nlp(u'lion').vector

array([ 1.8963e-01, -4.0309e-01,  3.5350e-01, -4.7907e-01, -4.3311e-01,
        2.3857e-01,  2.6962e-01,  6.4332e-02,  3.0767e-01,  1.3712e+00,
       -3.7582e-01, -2.2713e-01, -3.5657e-01, -2.5355e-01,  1.7543e-02,
        3.3962e-01,  7.4723e-02,  5.1226e-01, -3.9759e-01,  5.1333e-03,
       -3.0929e-01,  4.8911e-02, -1.8610e-01, -4.1702e-01, -8.1639e-01,
       -1.6908e-01, -2.6246e-01, -1.5983e-02,  1.2479e-01, -3.7276e-02,
       -5.7125e-01, -1.6296e-01,  1.2376e-01, -5.5464e-02,  1.3244e-01,
        2.7519e-02,  1.2592e-01, -3.2722e-01, -4.9165e-01, -3.5559e-01,
       -3.0630e-01,  6.1185e-02, -1.6932e-01, -6.2405e-02,  6.5763e-01,
       -2.7925e-01, -3.0450e-03, -2.2400e-02, -2.8015e-01, -2.1975e-01,
       -4.3188e-01,  3.9864e-02, -2.2102e-01, -4.2693e-02,  5.2748e-02,
        2.8726e-01,  1.2315e-01, -2.8662e-02,  7.8294e-02,  4.6754e-01,
       -2.4589e-01, -1.1064e-01,  7.2250e-02, -9.4980e-02, -2.7548e-01,
       -5.4097e-01,  1.2823e-01, -8.2408e-02,  3.1035e-01, -6.33

In [4]:
# vector form of a sentence
nlp(u'The quick brown fox jumped.').vector

array([-1.72348157e-01,  1.13993334e-02, -5.07186651e-02,  3.04736700e-02,
        1.02041634e-02,  1.36314198e-01, -1.43308327e-01, -1.56106679e-02,
        8.28423277e-02,  1.67149007e+00, -3.76483351e-01,  8.27163458e-04,
        3.39448266e-02, -1.83933794e-01, -2.17978001e-01,  2.91569922e-02,
        7.38043338e-02,  1.03464663e+00, -6.53124973e-02, -3.02769482e-01,
       -1.21348329e-01, -2.10911278e-02, -5.04233642e-03, -1.49620354e-01,
        7.19222352e-02, -1.14348307e-02, -2.94718355e-01, -9.38479975e-02,
        2.39413325e-02, -2.36319661e-01, -1.38145790e-01,  1.62768334e-01,
        8.31211656e-02, -1.94691680e-02,  3.39896716e-02, -9.83766690e-02,
        1.40833361e-02,  4.41392362e-02, -3.44334841e-02, -1.59693331e-01,
        1.34939671e-01, -5.46016656e-02,  4.42504995e-02, -1.69036329e-01,
        1.77600995e-01,  1.24370992e-01, -2.35389009e-01, -1.37921795e-02,
        1.09435163e-01, -3.67643349e-02, -1.80684015e-01,  5.96696734e-02,
        1.74656641e-02, -

In [5]:
nlp(u'fox').vector.shape, nlp(u'The quick brown fox jumped.').vector.shape

((300,), (300,))

In [6]:
# words has close or relative meanings have great similarity
tokens = nlp(u'lion cat pet')
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, ' - ' , token2.text, ' -> ',token1.similarity(token2))

lion  -  lion  ->  1.0
lion  -  cat  ->  0.5265437
lion  -  pet  ->  0.39923772
cat  -  lion  ->  0.5265437
cat  -  cat  ->  1.0
cat  -  pet  ->  0.7505456
pet  -  lion  ->  0.39923772
pet  -  cat  ->  0.7505456
pet  -  pet  ->  1.0


In [7]:
tokens = nlp(u'like love hate')
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, ' - ' , token2.text, ' -> ',token1.similarity(token2))

like  -  like  ->  1.0
like  -  love  ->  0.657904
like  -  hate  ->  0.6574652
love  -  like  ->  0.657904
love  -  love  ->  1.0
love  -  hate  ->  0.6393099
hate  -  like  ->  0.6574652
hate  -  love  ->  0.6393099
hate  -  hate  ->  1.0


In [8]:
# listing words has_vector norm of a vector and is a word out of vocabulary features
tokens = nlp(u'dog cat nargle')
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [9]:
# finding the vector of 'king - man + woman'
from scipy import spatial
nlp = spacy.load('en_core_web_lg')
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman

In [10]:
computed_similarities = []

for word in nlp.vocab:
    if word.has_vector and word.is_lower and word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))              

In [11]:
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([t[0].text for t in computed_similarities[:10]])

['king', 'woman', 'she', 'who', 'when', 'dare', 'was', 'not', 'had', 'let']


### sentiment analysis

In [12]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [13]:
sid = SentimentIntensityAnalyzer()

In [14]:
comment1 = 'This is a good movie.'
sid.polarity_scores(comment1)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [15]:
comment2 = 'This was the best, most awesome movie EVER MADE!!!'
sid.polarity_scores(comment2)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [16]:
comment3 = 'This was the worst film to ever disgrace the screen.'
sid.polarity_scores(comment3)

{'neg': 0.477, 'neu': 0.523, 'pos': 0.0, 'compound': -0.8074}

In [17]:
import pandas as pd
df = pd.read_csv('sources/amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [18]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [19]:
# dropping blanks and nulls
df.dropna(inplace=True)

blank_rows = []
for index, label, row in df.itertuples():
    if row.isspace():
        blank_rows.append(index)
df.drop(blank_rows, inplace=True)

In [20]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [21]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [22]:
df['compound'] = df['scores'].apply(lambda score: score['compound'])
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [23]:
df['comp_score'] = df['compound'].apply(lambda score: 'pos' if score >= 0 else 'neg')
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [24]:
# accuracy
df[df['label']==df['comp_score']].shape[0]/df.shape[0]

0.7092

In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print('Accuracy score: ')
print(accuracy_score(df['label'], df['comp_score']))
print('-----------------')
print('Classification report: ')
print(classification_report(df['label'], df['comp_score']))

Accuracy score: 
0.7092
-----------------
Classification report: 
              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



### project

In [26]:
import pandas as pd 
import numpy as np

In [27]:
df = pd.read_csv('sources/moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [28]:
df.dropna(inplace=True)

blanks = [] 

for i,lb,rv in df.itertuples(): 
    if type(rv)==str:       
        if rv.isspace():    
            blanks.append(i)    

df.drop(blanks, inplace=True)

In [29]:
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [30]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [31]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])

df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')

df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [32]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print('Accuracy score')
print(accuracy_score(df['label'],df['comp_score']))
print('-----')
print('Classification report')
print(classification_report(df['label'],df['comp_score']))
print('-----')
print('Confusion matrix')
print(confusion_matrix(df['label'],df['comp_score']))

Accuracy score
0.6357069143446853
-----
Classification report
              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

-----
Confusion matrix
[[427 542]
 [164 805]]
