# Similarity between different hashtags using spaCy

In [1]:
# Load the raw tweet dump, one record per line.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open("Data.txt", "r") as data_file:
    Data = data_file.read().splitlines()
Data[0]

'2009-07-05 16:10:09\tlimeseed\tRT Djjask MJ Billie Dream Jask Edit Enjoy play with love Free and sour #Michael Jackson #housemusic\tdjjask mj billi dream jask edit enjoy play love free sour jackson'

In [2]:
# Hashtags whose tweet documents are compared in this notebook.
# Disabled alternative: set(open("hashtags.txt","r").read().splitlines())
hash_list = [
    '#ripmj',
    '#youbelongwithme',
    '#michaeljackson',
    '#taylorswift',
    '#farah',
    '#dead',
]
len(hash_list)

6

### Create a Document containing clean tweets for each hashtag


In [3]:
# Build one "document" per tracked hashtag: the tokens of every clean tweet
# whose raw text contains that hashtag, appended in file order.
from collections import defaultdict

doc_dict = defaultdict(list)

for record in Data:
    # Every record must have exactly four tab-separated fields.
    _stamp, _author, raw_text, clean_text = record.split('\t')
    tokens = clean_text.split()
    for word in raw_text.split():
        # A tag repeated within one tweet contributes its tokens once per mention.
        if word.startswith('#') and word in hash_list:
            doc_dict[word].extend(tokens)

In [4]:
# Sample document for hashtag #ripmj: peek at its first 10 tokens.
doc_dict['#ripmj'][:10]

['mysteri',
 'celeb',
 'to',
 'write',
 'intro',
 'to',
 'michael',
 'jackson',
 'is',
 'memoir']

## Similarity between documents using spacy

In [5]:
# Import spaCy and load its English model into the shared `nlp` pipeline.
# NOTE(review): the 'en' shortcut presumably requires an older spaCy release
# with the English model linked; newer releases expect a full package name
# such as 'en_core_web_sm' -- confirm against the installed spaCy version.
import spacy

nlp = spacy.load('en')

### Sentence to vector

In [6]:
# Example: process a short two-sentence text with the pipeline.
doc = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")

In [7]:
# Display the vector spaCy assigns to the whole example text
# (presumably the average of its token vectors -- see spaCy's Doc.vector docs).
doc.vector

array([  7.16891706e-01,   8.81015956e-01,   1.05671249e-01,
         5.12764812e-01,   2.55639702e-01,  -3.74948978e-02,
        -1.07981110e+00,   5.93306720e-01,   1.37239087e+00,
        -2.70559996e-01,   4.87302154e-01,  -6.09425604e-01,
         7.14151442e-01,   2.86402196e-01,   5.55537522e-01,
        -6.27292335e-01,  -9.60178375e-01,   4.80861783e-01,
        -8.03113461e-01,  -1.39905259e-01,   1.14121020e+00,
        -1.04586089e+00,  -8.09619963e-01,   1.78098693e-01,
        -1.03611588e-01,   4.56884235e-01,   9.95423421e-02,
         3.69559079e-01,   4.30800647e-01,   1.58107567e+00,
        -1.60123900e-01,   2.34150648e-01,  -4.88874435e-01,
         6.39671862e-01,   6.01233058e-02,  -9.82226506e-02,
        -6.80491149e-01,  -1.35961354e+00,  -1.26623952e+00,
         6.90772951e-01,   9.69657004e-01,   3.32755357e-01,
         4.61152881e-01,   8.71967793e-01,  -1.06316102e+00,
         5.41103005e-01,   1.22881830e+00,  -3.31587158e-02,
         1.55729401e+00,

In [8]:
# Token-level similarity: position 0 is "Apples" and position 2 is "oranges"
# in the example sentence.
apple, orange = doc[0], doc[2]
print(apple.similarity(orange))

0.519023


In [9]:
# Back to the Michael Jackson example: show the first three tracked hashtags.
hash_list[:3]

['#ripmj', '#youbelongwithme', '#michaeljackson']

In [10]:
# Convert a hashtag's token document into one text string for spaCy.
def s2u(hsh):
    """Return the document of hashtag ``hsh`` as a single space-joined text.

    Parameters
    ----------
    hsh : str
        Hashtag key into ``doc_dict``, e.g. ``'#ripmj'``.

    Returns
    -------
    Text string (``unicode`` on Python 2, ``str`` on Python 3); empty when
    no tokens were collected for ``hsh``.
    """
    # .get() avoids inserting an empty entry into the defaultdict when the
    # hashtag was never seen.
    text = ' '.join(doc_dict.get(hsh, ()))
    # On Python 2 the joined tokens are a byte string. Decode it explicitly
    # rather than calling the `unicode` builtin, which does not exist on
    # Python 3 and only accepts ASCII bytes on Python 2.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'replace')
    return text

In [11]:
# Run each hashtag's text through the spaCy pipeline.
# The variables are bound in the same order as the tags listed below.
(ripmj, michaeljackson,
 dead, farah_fawcett,
 taylorswift, youbelongwithme) = [
    nlp(s2u(tag))
    for tag in ('#ripmj', '#michaeljackson',
                '#dead', '#farah',
                '#taylorswift', '#youbelongwithme')
]

In [12]:
# Hashtag to vector: first 50 components of the #ripmj document vector.

ripmj.vector[:50]

array([ 0.08098714,  0.32888556,  0.22106627,  0.52919757,  0.43473989,
        0.14844364, -2.13575006, -0.47618693,  0.63527089,  0.75624341,
        0.68723834, -1.27159286,  0.00983675, -0.13590679, -1.61605716,
       -0.6661911 , -1.7092253 , -1.12454951, -1.42362142,  0.1617438 ,
        0.89615762,  0.8162567 ,  0.23041788,  0.36727837, -0.67473179,
       -1.16513574, -0.05817394,  1.83028221, -0.23104751, -0.51598132,
       -0.79086059,  0.34373793,  0.63455135, -0.27843681, -0.7973845 ,
       -0.82402134,  1.72632694, -0.87892234,  0.99042892,  0.83888119,
       -1.20075047, -0.24830444,  1.01165187,  0.21414438, -0.38837963,
        0.11115067, -0.93013382, -0.0334683 ,  0.86103106, -0.47727364], dtype=float32)

### Let's see the similarity between #ripmj and the others.

#### spaCy can clearly tell that #ripmj is more closely related to #dead and #michaeljackson than to a song, #youbelongwithme


In [13]:
# Similarity of the #ripmj document to the #michaeljackson document.
print(ripmj.similarity(michaeljackson))

0.992128974238


In [14]:
# Similarity of the #ripmj document to the #youbelongwithme document.
print(ripmj.similarity(youbelongwithme))

0.829779754703


In [15]:
# Similarity of the #ripmj document to the #dead document.
print(ripmj.similarity(dead))

0.981694389032



### Let's see who's dead

**Interestingly, #taylorswift is more related to #dead than #farah is.**

In [16]:
# Similarity of the #dead document to the #taylorswift document.
print(dead.similarity(taylorswift))

0.957861034282


In [17]:
# Similarity of the #dead document to the #michaeljackson document.
print(dead.similarity(michaeljackson))

0.993253423913


In [18]:
# Similarity of the #dead document to the #farah document.
print(dead.similarity(farah_fawcett))

0.944423525541
