In [63]:
# loading small english model
import spacy
nlp = spacy.load('en_core_web_md')

In [64]:
# Version
spacy.__version__

'3.7.2'

In [65]:
# Check components in the nlp pipeline
nlp.components

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x17b71b940>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x17b71be20>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15d692200>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x17b71b2e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15e09d8c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15e0ab640>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15d6925f0>)]

In [66]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x17b71b940>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x17b71be20>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15d692200>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15e09d8c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15e0ab640>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15d6925f0>)]

In [67]:
# Get component names
nlp.component_names

['tok2vec',
 'tagger',
 'parser',
 'senter',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [68]:
# Using SpaCy Textblob: Sentiment Analysis
from spacytextblob.spacytextblob import SpacyTextBlob

In [69]:
# Adding sentiment pipe to nlp pipeline
nlp.add_pipe("spacytextblob")

<spacytextblob.spacytextblob.SpacyTextBlob at 0x29bc69cd0>

In [70]:
# Recheck our pipeline
nlp.components

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x17b71b940>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x17b71be20>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x15d692200>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x17b71b2e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x15e09d8c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x15e0ab640>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x15d6925f0>),
 ('spacytextblob', <spacytextblob.spacytextblob.SpacyTextBlob at 0x29bc69cd0>)]

In [71]:
sample_text = "I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy."

In [72]:
doc = nlp(sample_text)

In [73]:
print(doc)

I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.


In [74]:
# Check Sentiment
print(doc._.polarity)

-0.125


In [75]:
# Subjectivity
doc._.subjectivity

0.9

In [76]:
# Check Assessment: list for each token
doc._.assessments

[(['really', 'horrible'], -1.0, 1.0, None),
 (['worst', '!'], -1.0, 1.0, None),
 (['really', 'good'], 0.7, 0.6000000000000001, None),
 (['happy'], 0.8, 1.0, None)]

In [77]:
# Testing polarity determination within spacy
best_text = "This was the best song in the album!"
not_the_best_text = "This was not the best song in the album."
text = "This was a song in the album."
not_the_worst_text = "This was not the worst song in the album."
worst_text = "This was the worst song in the album!"

# applying nlp pipeline to text
best = nlp(best_text)
not_the_best = nlp(not_the_best_text)
neutral = nlp(text)
not_the_worst = nlp(not_the_worst_text)
worst = nlp(worst_text)

In [78]:
print(best._.polarity)
print(not_the_best._.polarity)
print(neutral._.polarity)
print(not_the_worst._.polarity)
print(worst._.polarity)

1.0
1.0
0.0
-1.0
-1.0


In [79]:
# spacy seems to have trouble with determining the degree of polarity
# Note to self: Make a pipe to remove stop words when applied to text

In [80]:
print(best._.subjectivity)
print(not_the_best._.subjectivity)
print(neutral._.subjectivity)
print(not_the_worst._.subjectivity)
print(worst._.subjectivity)

0.3
0.3
0.0
1.0
1.0


In [81]:
text = "John loves eating apples when he works at Apple" # Note: loves and love behave differently, implement lemmatize?
docx = nlp(text)

In [82]:
docx._.polarity

0.0

In [83]:
docx._.subjectivity

0.0

In [84]:
docx._.assessments

[]

In [85]:
# load in two "happy" songs and two "sad" songs

# Blue Skies by Frank Sinatra
blue_skies = nlp(
    "Blue skies, smiling at me \
    Nothing but blue skies do I see \
    Blue days, all of them gone \
    Nothing but blue skies from now on \
    Never saw the sun shining so bright \
    Never saw things looking so right \
    Noticing the days hurrying by \
    When you're in love, my how they fly like \
    Bluebirds singing a song \
    Nothing but bluebirds all day long \
    Blue skies, smiling at me \
    Nothing but blue skies do I see \
    Blue days, all of them gone \
    Nothing but blue skies from now on"
)

# What a Wonderful World by Louis Armstrong
wond_world = nlp(
    "I see trees of green \
    Red roses too \
    I see them bloom \
    For me and you \
    And I think to myself \
    What a wonderful world \
    I see skies of blue \
    And clouds of white \
    The bright blessed day \
    The dark sacred night \
    And I think to myself \
    What a wonderful world \
    The colors of the rainbow \
    So pretty in the sky \
    Are also on the faces \
    Of people going by \
    I see friends shaking hands \
    Saying, \"How do you do?\" \
    They\'re really saying \
    I love you \
    I hear babies cry \
    I watch them grow \
    They\'ll learn much more \
    Than I\'ll ever know \
    And I think to myself \
    What a wonderful world \
    Yes, I think to myself \
    What a wonderful world \
    Ooh, yes"
)

# The Sound of Silence by Simon and Garfunkel
silence = nlp(
    "Hello darkness, my old friend \
    I\'ve come to talk with you again \
    Because a vision softly creeping \
    Left its seeds while I was sleeping \
    And the vision that was planted in my brain \
    Still remains \
    Within the sound of silence \
    In restless dreams I walked alone \
    Narrow streets of cobblestone \
    \'Neath the halo of a street lamp \
    I turned my collar to the cold and damp \
    When my eyes were stabbed by the flash of a neon light \
    That split the night \
    And touched the sound of silence \
    And in the naked light I saw \
    Ten thousand people, maybe more \
    People talking without speaking \
    People hearing without listening \
    People writing songs that voices never share \
    No one dared \
    Disturb the sound of silence \
    \"Fools\" said I, \"You do not know \
    Silence like a cancer grows \
    Hear my words that I might teach you \
    Take my arms that I might reach you\" \
    But my words like silent raindrops fell \
    And echoed in the wells of silence \
    And the people bowed and prayed \
    To the neon god they made \
    And the sign flashed out its warning \
    In the words that it was forming \
    And the sign said, \"The words of the prophets \
    Are written on the subway walls \
    And tenement halls \
    And whispered in the sounds of silence\" "
)

# Mad World by Tears for Fears
mad_world = nlp(
    "All around me are familiar faces \
    Worn out places, worn out faces \
    Bright and early for their daily races \
    Going nowhere, going nowhere \
    Their tears are filling up their glasses \
    No expression, no expression \
    Hide my head I want to drown my sorrow \
    No tomorrow, no tomorrow \
    And I find it kind of funny \
    I find it kind of sad \
    The dreams in which I\'m dying are the best I\'ve ever had \
    I find it hard to tell you \'cause I find it hard to take \
    When people run in circles it\'s a very, very \
    Mad world \
    Mad world \
    Mad world \
    Mad world \
    Children waiting for the day they feel good \
    Happy birthday, happy birthday \
    Made to feel the way that every child should \
    Sit and listen, sit and listen \
    Went to school and I was very nervous \
    No one knew me, no one knew me \
    Hello teacher tell me what's my lesson \
    Look right through me, look right through me \
    And I find it kind of funny \
    I find it kind of sad \
    The dreams in which I'm dying are the best I've ever had \
    I find it hard to tell you 'cause I find it hard to take \
    When people run in circles it\'s a very, very \
    Mad world \
    Mad world \
    Mad world \
    Mad world \
    And I find it kind of funny \
    I find it kind of sad \
    The dreams in which I\'m dying are the best I\'ve ever had \
    I find it hard to tell you \'cause I find it hard to take \
    When people run in circles it\'s a very, very \
    Mad world \
    Mad world \
    Halargian world \
    Mad world"
)

In [86]:
print(mad_world.similarity(wond_world))

0.9570395874858139


In [87]:
print(mad_world._.polarity)
print(mad_world._.subjectivity)

0.02164115646258506
0.7647959183673472


In [88]:
print(wond_world._.polarity)
print(wond_world._.subjectivity)

0.4142857142857143
0.5642857142857143


In [89]:
print(silence._.polarity)
print(silence._.subjectivity)

0.1357142857142857
0.3964285714285714


In [90]:
print(blue_skies._.polarity)
print(blue_skies._.subjectivity)

0.17197802197802198
0.31043956043956045


In [91]:
mad_world._.assessments

[(['familiar'], 0.375, 0.5, None),
 (['bright'], 0.7000000000000001, 0.7999999999999999, None),
 (['early'], 0.1, 0.3, None),
 (['daily'], 0.0, 0.0, None),
 (['kind'], 0.6, 0.9, None),
 (['funny'], 0.25, 1.0, None),
 (['kind'], 0.6, 0.9, None),
 (['sad'], -0.5, 1.0, None),
 (['best'], 1.0, 0.3, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None),
 (['very', 'very', 'mad'], -0.8125, 1.0, None),
 (['mad'], -0.625, 1.0, None),
 (['mad'], -0.625, 1.0, None),
 (['mad'], -0.625, 1.0, None),
 (['good'], 0.7, 0.6000000000000001, None),
 (['happy'], 0.8, 1.0, None),
 (['happy'], 0.8, 1.0, None),
 (['very'], 0.2, 0.3, None),
 (['right'], 0.2857142857142857, 0.5357142857142857, None),
 (['right'], 0.2857142857142857, 0.5357142857142857, None),
 (['kind'], 0.6, 0.9, None),
 (['funny'], 0.25, 1.0, None),
 (['kind'], 0.6, 0.9, None),
 (['sad'], -0.5, 1.0, None),
 (['best'], 1.0, 0.3, None),
 (['hard'], -0.2916666666666667, 0.5

In [92]:
wond_world._.assessments

[(['green'], -0.2, 0.3, None),
 (['red'], 0.0, 0.0, None),
 (['wonderful'], 1.0, 1.0, None),
 (['blue'], 0.0, 0.1, None),
 (['white'], 0.0, 0.0, None),
 (['bright'], 0.7000000000000001, 0.7999999999999999, None),
 (['dark'], -0.15, 0.4, None),
 (['wonderful'], 1.0, 1.0, None),
 (['pretty'], 0.25, 1.0, None),
 (['really'], 0.2, 0.2, None),
 (['love'], 0.5, 0.6, None),
 (['much', 'more'], 0.5, 0.5, None),
 (['wonderful'], 1.0, 1.0, None),
 (['wonderful'], 1.0, 1.0, None)]

In [93]:
best  = nlp("Monday is the best day of the week!")
worst = nlp("Monday is the worst day of the week!")

best.similarity(worst)

# Idea, remove shared words between songs?

0.9855474863186682

In [94]:
print(best._.polarity)
print(worst._.polarity)

1.0
-1.0


In [113]:
songs = [
    ["Blue Skies", blue_skies],
    ["What a Wonderful World", wond_world],
    ["The Sound of Silence", silence],
    ["Mad World", mad_world]
]

# Raw comparison without any adjustments
for i in range(len(songs)):
    s1_name = songs[i][0]
    s1_song = songs[i][1]
    for j in range(i + 1, len(songs)):
        s2_name = songs[j][0]
        s2_song = songs[j][1]
        song_pair = s1_name + ' <-> ' + s2_name
        print(f'{song_pair: ^50} | polarity: {s1_song._.polarity - s2_song._.polarity: >20} | similarity: {s1_song.similarity(s2_song)}')

      Blue Skies <-> What a Wonderful World        | polarity: -0.24230769230769234 | similarity: 0.8942985693398715
       Blue Skies <-> The Sound of Silence         | polarity: 0.036263736263736274 | similarity: 0.879295268353476
             Blue Skies <-> Mad World              | polarity:  0.15033686551543693 | similarity: 0.8973674854927597
 What a Wonderful World <-> The Sound of Silence   | polarity:   0.2785714285714286 | similarity: 0.8683464537820359
       What a Wonderful World <-> Mad World        | polarity:  0.39264455782312924 | similarity: 0.9570395874858139
        The Sound of Silence <-> Mad World         | polarity:  0.11407312925170064 | similarity: 0.8872459668576334


In [120]:
import numpy as np
songs = [
    ["Blue Skies", blue_skies],
    ["What a Wonderful World", wond_world],
    ["The Sound of Silence", silence],
    ["Mad World", mad_world]
]

# Raw comparison without any adjustments
for i in range(len(songs)):
    s1_name = songs[i][0]
    s1_song = songs[i][1]

    for j in range(i + 1, len(songs)):
        s2_name = songs[j][0]
        s2_song = songs[j][1]

        song_pair = s1_name + ' <-> ' + s2_name
        ideal = np.array([1, 0])
        cosine_mood = np.array([
            s1_song.similarity(s2_song),
            s1_song._.polarity - s2_song._.polarity
        ])
        difference = ideal - cosine_mood

        print(f'{song_pair: ^50} | distance: {np.hypot(difference[0], difference[1])}')

      Blue Skies <-> What a Wonderful World        | distance: 0.2643592445803196
       Blue Skies <-> The Sound of Silence         | distance: 0.12603448262148412
             Blue Skies <-> Mad World              | distance: 0.18202913548958435
 What a Wonderful World <-> The Sound of Silence   | distance: 0.30811474656058274
       What a Wonderful World <-> Mad World        | distance: 0.394987779344513
        The Sound of Silence <-> Mad World         | distance: 0.160393736807723


In [121]:
# Removing Stop Words
no_stop_songs = [0]*len(songs)
for i in range(len(songs)):
    song = songs[i][1]
    new_song = nlp(' '.join([str(t) for t in song if not t.is_stop]))
    no_stop_songs[i] = [songs[i][0], new_song]

no_stop_songs 

[['Blue Skies',
  Blue skies , smiling      blue skies      Blue days , gone      blue skies      saw sun shining bright      saw things looking right      Noticing days hurrying      love , fly like      Bluebirds singing song      bluebirds day long      Blue skies , smiling      blue skies      Blue days , gone      blue skies],
 ['What a Wonderful World',
  trees green      Red roses      bloom           think      wonderful world      skies blue      clouds white      bright blessed day      dark sacred night      think      wonderful world      colors rainbow      pretty sky      faces      people going      friends shaking hands      Saying , " ? "      saying      love      hear babies cry      watch grow      learn      know      think      wonderful world      Yes , think      wonderful world      Ooh , yes],
 ['The Sound of Silence',
 ['Mad World',
  familiar faces      Worn places , worn faces      Bright early daily races      Going , going      tears filling glasses      

In [123]:
import numpy as np

# Similarity Distances after filtering stop words
for i in range(len(songs)):
    s1_name = songs[i][0]
    s1_song = songs[i][1]

    s1_no_name = no_stop_songs[i][0]
    s1_no_song = no_stop_songs[i][1]

    for j in range(i + 1, len(songs)):
        s2_name = songs[j][0]
        s2_song = songs[j][1]

        s2_no_name = no_stop_songs[j][0]
        s2_no_song = no_stop_songs[j][1]

        song_pair = s1_name + ' <-> ' + s2_name
        ideal = np.array([1, 0])
        cosine_mood = np.array([
            s1_no_song.similarity(s2_no_song),
            s1_song._.polarity - s2_song._.polarity
        ])
        difference = ideal - cosine_mood
        print(f'{song_pair: ^50} | distance: {round(np.hypot(difference[0], difference[1]), 3)}')

      Blue Skies <-> What a Wonderful World        | distance: 0.308
       Blue Skies <-> The Sound of Silence         | distance: 0.271
             Blue Skies <-> Mad World              | distance: 0.383
 What a Wonderful World <-> The Sound of Silence   | distance: 0.31
       What a Wonderful World <-> Mad World        | distance: 0.412
        The Sound of Silence <-> Mad World         | distance: 0.252
