# Word Similarity

#### Dataset link: https://github.com/skathirmani/datasets

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [73]:

# Stemmer shared by the text-cleaning cells further down.
stemmer = PorterStemmer()

# NLTK's English stop-word list plus a few extra tokens to drop
# (including the empty string), merged into one NumPy array.
common_stop_words = nltk.corpus.stopwords.words('english')
custom_stop_words = ['', 'amp', 'rt']
stop_words_all = np.concatenate((common_stop_words, custom_stop_words))

182

### Analysis on Amazon Reviews

In [144]:
# Load the zipped reviews CSV directly from GitHub and replace missing
# review text with empty strings so every 'reviewText' value is a str.
amazon_data_path = 'https://github.com/skathirmani/datasets/raw/master/amazon_reviews_11.zip'
amazon = pd.read_csv(amazon_data_path).assign(
    reviewText=lambda frame: frame['reviewText'].fillna(''),
)

## Similarity Analysis

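Cosine similarity is the cosine of the angle between two vectors:

- cos(0°) = 1 (the vectors point in the same direction)
- cos(90°) = 0 (the vectors are orthogonal)
- cos(180°) = -1 (the vectors point in opposite directions; this cannot occur for non-negative word counts)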

In [152]:
# Normalise the review text: lower-case it, keep only the characters a-z,
# '#', '@' and space, then split every review on single spaces.
docs = amazon['reviewText']
docs = docs.str.lower()
# regex=True must be explicit: since pandas 2.0 the pattern is treated as a
# literal string by default, which would leave the text unchanged.
docs = docs.str.replace('[^a-z#@ ]', '', regex=True)
docs = docs.str.split(' ')
words_rows = docs.tolist()

# A set gives O(1) membership tests; `in` on the NumPy array scans it for
# every single token.
stop_word_set = set(stop_words_all)

words_all = []         # every kept stem, across all reviews
words_rows_clean = []  # kept stems, one list per review
docs_clean = []        # kept stems re-joined into one string per review
for row in words_rows:
    # Stop words are matched against the raw token, i.e. before stemming.
    row_words = [stemmer.stem(word) for word in row if word not in stop_word_set]
    words_rows_clean.append(row_words)
    docs_clean.append(' '.join(row_words))
    words_all.extend(row_words)

# Document-term matrix: one row per review, one column per term, holding the
# term counts produced by CountVectorizer.
model = CountVectorizer()
sparse_matrix = model.fit_transform(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; only fall back to it
# on releases that predate get_feature_names_out().
if hasattr(model, 'get_feature_names_out'):
    feature_names = model.get_feature_names_out()
else:
    feature_names = model.get_feature_names()
dtm = pd.DataFrame(sparse_matrix.toarray(), columns=feature_names)

In [148]:
from sklearn.metrics.pairwise import cosine_similarity

In [161]:
# Pairwise cosine similarity between the per-review count vectors of the
# terms 'nook' and 'amazon'. The result is a 2x2 matrix whose off-diagonal
# entries are the similarity between the two terms.
cosine_similarity([dtm['nook'], dtm['amazon']])

array([[1.        , 0.47979502],
       [0.47979502, 1.        ]])

In [160]:
# Element [0][1] of the 2x2 result is the similarity between the first
# vector ('nook') and the second ('amazon') as a single number.
cosine_similarity([dtm['nook'], dtm['amazon']])[0][1]

0.4797950195876083

In [176]:
def get_similar_words(input_term, top_n=10, term_matrix=None):
    """Return the terms whose usage across documents is closest to *input_term*.

    Every column of the document-term matrix is treated as a vector over the
    documents, and columns are ranked by their cosine similarity to the
    column of ``input_term``.

    Parameters
    ----------
    input_term : str
        Column name (term) to compare every other term against.
    top_n : int, default 10
        Number of terms to return.
    term_matrix : pandas.DataFrame, optional
        Document-term matrix with one column per term. Defaults to the
        module-level ``dtm``.

    Returns
    -------
    numpy.ndarray
        Object array with up to ``top_n`` terms, most similar first, never
        containing ``input_term`` itself.

    Raises
    ------
    KeyError
        If ``input_term`` is not a column of the matrix.
    """
    matrix = dtm if term_matrix is None else term_matrix
    if input_term not in matrix.columns:
        raise KeyError(f'{input_term!r} is not a column of the document-term matrix')

    counts = matrix.to_numpy(dtype=float)
    target_pos = matrix.columns.get_loc(input_term)

    # Cosine similarity of every column against the target column in one
    # matrix-vector product, instead of one similarity call and one
    # DataFrame.append (removed in pandas 2.0) per vocabulary term.
    norms = np.linalg.norm(counts, axis=0)
    # All-zero columns get a similarity of 0 rather than a division by zero.
    norms[norms == 0] = 1.0
    similarities = (counts.T @ counts[:, target_pos]) / (norms * norms[target_pos])

    ranked = pd.DataFrame({'term': matrix.columns, 'cs': similarities})
    ranked = ranked[ranked['term'] != input_term]
    # A stable sort keeps tied terms in column order, so results are
    # reproducible from run to run.
    ranked = ranked.sort_values(by='cs', ascending=False, kind='stable')
    return ranked['term'].head(top_n).to_numpy(dtype=object)

# Ten terms whose per-review count vectors are most similar to that of the
# stemmed term 'tablet'.
get_similar_words('tablet')

array(['app', 'android', 'invit', 'version', 'netflix', 'seven', 'music',
       'pandora', 'bonusbut', 'membershowev'], dtype=object)