In [25]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [7]:
nlp('The lion lives in the jungle').vector.shape

(300,)

In [8]:
tokens = nlp(u'lion cats pet')

In [9]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2)) 

lion lion 1.0
lion cats 0.37899526953697205
lion pet 0.20031584799289703
cats lion 0.37899526953697205
cats cats 1.0
cats pet 0.6664726138114929
pet lion 0.20031584799289703
pet cats 0.6664726138114929
pet pet 1.0


In [10]:
tokens = nlp(u'like love hate')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2)) 

like like 1.0
like love 0.5212638974189758
like hate 0.5065140724182129
love like 0.5212638974189758
love love 1.0
love hate 0.5708349943161011
hate like 0.5065140724182129
hate love 0.5708349943161011
hate hate 1.0


In [27]:
len(nlp.vocab.vectors)

514157

In [13]:
tokens = nlp(u'dog cat haroon')

for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)

dog True 75.254234 False
cat True 63.188496 False
haroon False 0.0 True


In [23]:
from scipy import spatial

cosine_simalarity = lambda vec1,vec2 : 1 - spatial.distance.cosine(vec1,vec2)

In [28]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [29]:
new_vector = king-man+woman

In [30]:
computed_similarities = []

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_simalarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))

In [31]:
computed_similarities = sorted(computed_similarities,key=lambda item : -item[1])

In [34]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'and', 'that', 'where', 'she', 'they', 'woman', 'there', 'should', 'these']


In [41]:
cosine_simalarity(nlp.vocab['king'].vector,nlp.vocab['throne'].vector)

0.7028458714485168

In [45]:
import nltk
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mohammedharoon/nltk_data...


True

In [46]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [47]:
sid = SentimentIntensityAnalyzer()

In [48]:
a = 'This is a good movie'

In [49]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [50]:
a = 'This was the best, most awesome movie EVER MADE!!'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.433, 'pos': 0.567, 'compound': 0.88}

In [51]:
a = 'This was the WORST movie EVER MADE!!'
sid.polarity_scores(a)

{'neg': 0.474, 'neu': 0.526, 'pos': 0.0, 'compound': -0.7519}

In [52]:
import numpy as nlp
import pandas as pd

df = pd.read_csv('./TextFiles/amazonreviews.tsv',sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [59]:
df['label'].value_counts()

label
neg    5097
pos    4903
Name: count, dtype: int64

In [60]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [61]:
blanks = []

for ind,lab,rev in df.itertuples():
    if type(rev) == str:
        if rev.isspace():
            blanks.append(ind)

In [62]:
blanks

[]

In [71]:
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [72]:
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [73]:
df['scores'] = df['review'].apply(lambda review : sid.polarity_scores(review))

In [74]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [77]:
df['compound'] = df['scores'].apply(lambda score : score['compound'])

In [78]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [79]:
df['comp_score'] = df['compound'].apply(lambda score : 'pos' if score >= 0 else 'neg')

In [80]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [81]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

accuracy_score(df['label'],df['comp_score'])

0.7097

In [84]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [85]:
print(confusion_matrix(df['label'],df['comp_score']))

[[2629 2468]
 [ 435 4468]]


In [86]:
df = pd.read_csv('./TextFiles/moviereviews.tsv',sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [91]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [92]:
df.dropna(inplace=True)

In [94]:
blanks = []
for index,label,review in df.itertuples():
    if type(review) == str:
        if review.isspace():
            blanks.append(index)
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [96]:
df.drop(blanks,inplace=True)

In [98]:
df['label'].value_counts()

label
neg    969
pos    969
Name: count, dtype: int64

In [99]:
df['polarity'] = df['review'].apply(lambda review : sid.polarity_scores(review))
df.head()

Unnamed: 0,label,review,polarity
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [100]:
df['scores'] = df['polarity'].apply(lambda dict : dict['compound'])
df['predn'] = df['scores'].apply(lambda score : 'pos' if score>=0 else 'neg')
df.head()

Unnamed: 0,label,review,polarity,scores,predn
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [101]:
accuracy_score(df['label'],df['predn'])

0.6357069143446853