# Vectors and Similarity Measures

### Comparing texts by creating vectors based on word counts and using cosine similarity

In [None]:
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# The files we want to vectorize.
# glob returns paths in arbitrary order, so sort them: this keeps the row/column
# order of the matrices and plots built from text_files the same on every run.
text_files = sorted(glob.glob('soderberg-corpus/*.txt'))
print(text_files)

In [None]:
# Converting our texts into a document-term matrix.
# Initialize the vectorizer:
#   input='filename'     -> the vectorizer is given file paths and reads each file itself
#   lowercase=True       -> text is lowercased before it is tokenized
#   stop_words='english' -> words on scikit-learn's built-in English stop word list are dropped
cv = CountVectorizer(input='filename',
                             lowercase=True,
                             stop_words='english')

N.B. This uses the default Scikit-Learn stopwords list. Try it first with the default, but you can use your own custom stopwords list if you want to modify your results.

In [None]:
#Read in your custom stopwords txt file as a list (one stopword per line)
#with open('custom-stopwords.txt', 'r') as f:
    #custom_stopwords = [s.rstrip('\n') for s in f.readlines()]

In [None]:
#Set up count vectorizing with your custom stopwords list
#cv = CountVectorizer(input='filename' , stop_words=custom_stopwords)

In [None]:
# This does the actual vectorization and creates a document-term matrix:
# the vocabulary is learned from the files listed in text_files and
# each term is counted per file.
dtm = cv.fit_transform(text_files)

In [None]:
# Shows the count vector for each document/text in the corpus.
# toarray() turns the matrix into a dense array; large arrays are printed truncated.
print(dtm.toarray())

In [None]:
# Shows every item in the vocabulary paired with its column index in the matrix
cv.vocabulary_.items()

In [None]:
# Return total number of documents and the number of items in the vocabulary
# (the first dimension of the matrix is documents, the second is vocabulary items)
dc, vc = dtm.shape
print('document count:',dc,'vocabulary count:',vc)

In [None]:
# What are our top words across all documents?
# Summing the document-term matrix over axis 0 gives one corpus-wide total
# per vocabulary column.
vocab_sums = dtm.sum(axis=0)
# cv.vocabulary_ maps each term to its column index, so pair every term
# with the total found in that column.
sorted_vocab = [(term, vocab_sums[0, col]) for term, col in cv.vocabulary_.items()]
# Most frequent terms first.
sorted_vocab.sort(key=lambda pair: pair[1], reverse=True)

# Display top twenty words.
# Slicing starts at index 0 so the single most frequent word is included,
# and it does not fail when the vocabulary has fewer than twenty terms.
for term, total in sorted_vocab[:20]:
    print(term, "->", total)

In [None]:
# Assessing similarities between texts
# based on shared term frequencies across documents

# Creating a similarity matrix using cosine similarity:
# every document's count vector is compared with every other document's
cosine_matrix = cosine_similarity(dtm)
cosine_matrix

In [None]:
# Creates a dataframe with cosine similarities between the texts,
# using the file paths in text_files as both the row and the column labels
# The closer to 1 the more similar
cosine_sim = pd.DataFrame(cosine_matrix, 
                                columns=text_files, index=text_files)
cosine_sim

In [None]:
# Visualize the cosine similarity between texts with a heatmap
# The closer to 1 the more similar (based on word counts)
# i.e. texts that share similar words

# A 10x10 inch figure leaves room for a tick label on every text
fig, ax = plt.subplots(figsize=(10,10))

# yticklabels/xticklabels=True label every row and column; annot=False leaves
# the numeric values out of the cells
sns.heatmap(data=cosine_sim, annot=False, yticklabels=True,
           xticklabels=True, ax=ax)

In [None]:
# Visualize the cosine similarity between texts with a clustermap
# Clusters together texts that are most similar
# i.e. texts that share similar words

# annot=False leaves the numeric values out of the cells
sns.clustermap(data=cosine_sim, annot=False)

### Comparing texts by creating vectors based on tf-idf scores and using cosine similarity

In [None]:
# Import the libraries we're going to use
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.spatial.distance import pdist, squareform
import glob
from pathlib import Path 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Data: set up path to files and a variable with file names
directory_path = 'soderberg-corpus'
# glob returns paths in arbitrary order, so sort them: this keeps the row/column
# order of the matrices and plots built from these files the same on every run.
text_files = sorted(glob.glob(f'{directory_path}/*.txt'))
# File names without directory or extension, in the same order as text_files,
# used as readable labels for the texts
text_titles = [Path(text).stem for text in text_files]

In [None]:
# Set up tf-idf vectorizing with custom settings:
#   input='filename'     -> the vectorizer is given file paths and reads each file itself
#   stop_words='english' -> words on scikit-learn's built-in English stop word list are dropped
tfidf_vectorizer = TfidfVectorizer(input='filename', 
                                   stop_words='english')

N.B. This uses the default Scikit-Learn stopwords list. Try it first with the default, but you can use your own custom stopwords list if you want to modify your results.

In [None]:
#Read in your custom stopwords txt file as a list (one stopword per line)
#with open('custom-stopwords.txt', 'r') as f:
    #custom_stopwords = [s.rstrip('\n') for s in f.readlines()]

In [None]:
#Set up tf-idf vectorizing with your custom stopwords list
#tfidf_vectorizer = TfidfVectorizer(input='filename' , stop_words=custom_stopwords)

In [None]:
# Actually do the vectorizing: learn the vocabulary from the files listed in
# text_files and compute a tf-idf score for each term in each file
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [None]:
# Creating a similarity matrix using cosine similarity
# NOTE: cosine_similarity is imported in the import cell of the word-count
# section (from sklearn.metrics.pairwise), not in this section's import cell,
# so that cell must have been run first.
cosine_matrix_tfidf = cosine_similarity(tfidf_vector)
cosine_matrix_tfidf

In [None]:
# Assessing similarities between texts based on tf-idf scores
# Creates a dataframe with cosine similarities between the texts
# calculated from vectors of tf-idf scores for each text;
# rows and columns are labelled with the titles in text_titles
cosine_sim_tfidf = pd.DataFrame(cosine_matrix_tfidf, 
                                columns=text_titles, index=text_titles)
cosine_sim_tfidf

In [None]:
# Visualize the cosine similarities between texts with a heatmap
# based on significance (tf-idf) scores
# (i.e. texts that share distinctive words)

# A 10x10 inch figure leaves room for a tick label on every text
fig, ax = plt.subplots(figsize=(10,10))

# yticklabels/xticklabels=True label every row and column; annot=False leaves
# the numeric values out of the cells
sns.heatmap(data=cosine_sim_tfidf, annot=False, yticklabels=True,
           xticklabels=True, ax=ax)

In [None]:
# Visualize relations between texts with a cluster map of the tf-idf based
# similarities; annot=False leaves the numeric values out of the cells
sns.clustermap(data=cosine_sim_tfidf, annot=False)

### Comparing words using word2vec and cosine similarity

For a given word, what other words share similar semantic space across the corpus?

In [None]:
import gensim
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer

In [None]:
# Collect the full text of every .txt file in the corpus directory
# into all_docs: a list with one string per document
directory_path = 'soderberg-corpus/'
all_docs = [
    txt_path.read_text(encoding='utf-8')
    for txt_path in Path(directory_path).glob("*.txt")
]

In [None]:
# Tokenize the text into sentences (and the sentences into word tokens)
# The pattern \w+ keeps runs of word characters as tokens, so punctuation is dropped
tokenizer = RegexpTokenizer(r'\w+')

def make_sentences(list_txt):
    """Turn a collection of raw texts into one flat list of tokenized sentences.

    Each text is lowercased and split into sentences with sent_tokenize, and
    each sentence is split into word tokens with the module-level tokenizer.
    The sentences of all texts are returned together in a single list, in the
    order the texts were given.
    """
    tokenized_sentences = []
    for raw_text in list_txt:
        tokenized_sentences.extend(
            tokenizer.tokenize(sentence)
            for sentence in sent_tokenize(raw_text.lower())
        )
    return tokenized_sentences

# Build the training data from all documents: a list of sentences,
# each sentence being a list of word tokens
sentences = make_sentences(all_docs)
sentences

In [None]:
# Training the model
# Try playing around with vector_size and min_count
# to see how that affects the model
soderberg_model = gensim.models.Word2Vec(
    sentences,
    min_count=5, # default is 5; ignores all words with total frequency lower than 5
    vector_size=150) # dimensionality of the word vectors; default is 100.

In [None]:
# Find the ten nearest word vectors to 'sun' by cosine similarity
# the closer to 1 the more similar
# NOTE: the query word has to be in the model's vocabulary, i.e. it must occur
# at least min_count times in the corpus
soderberg_model.wv.most_similar('sun', topn=10)

In [None]:
# Find cosine similarity between two given word vectors
# the closer to 1 the more similar
# NOTE: both words have to be in the model's vocabulary, i.e. each must occur
# at least min_count times in the corpus
print(soderberg_model.wv.similarity(w1='sun',w2='god'))