In [1]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Set of ASCII punctuation characters, used to filter punctuation tokens.
exclude = {symbol for symbol in string.punctuation}

# This represents any text as a single "doc-embedding"; we use it both for the query and for the sentences.
# The input should be a string.
def text_embedding(text):
    """Represent a text as one averaged word embedding ("doc-embedding").

    The text is lowercased, split with NLTK's WordPunct tokenizer and reduced
    to purely alphabetic tokens. Every remaining token is looked up in the
    global ``emb_model``; tokens the model does not know are skipped. The
    collected word vectors are averaged component-wise.

    Args:
        text: The input string (a query or a single sentence).

    Returns:
        A float64 numpy array of shape ``(1, dim)`` holding the mean vector,
        or the string ``"Empty"`` when no token of ``text`` has an embedding.
        Callers tell the two cases apart by type.
    """
    # Check whether the embeddings you use were lowercased: ask the model for
    # "barack" and for "Barack". If "Barack" works, comment out the next line.
    text = text.lower()

    # Split the text into single word / punctuation tokens. The module-level
    # helper reuses one shared tokenizer instead of building one per call.
    tokens = nltk.tokenize.wordpunct_tokenize(text)

    # Remove numbers and punctuation.
    tokens = [token for token in tokens if token not in exclude and token.isalpha()]

    # Collect the embedding of every token the model knows.
    word_vectors = []
    for word in tokens:
        try:
            word_vectors.append(emb_model[word])
        except KeyError:
            # Out-of-vocabulary word: it simply does not contribute.
            continue

    if not word_vectors:
        # Sentinel kept as a string because callers check the return type.
        return "Empty"

    # Average all word vectors into one doc-embedding. Accumulating in
    # float64 keeps the result independent of the model's storage dtype.
    # Assumes every vector from emb_model has the same length -- TODO confirm.
    stacked = np.asarray(word_vectors, dtype=np.float64)
    return np.mean(stacked, axis=0).reshape(1, -1)



In [4]:
# Load your word embeddings here: point embeddings_file at the vectors file
# (if it is a .bin file, change binary to True in the call below).

# Small test embeddings:
#embeddings_file = '../../resources/small-embeddings.txt'

# German / English Wikipedia vectors from
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
#embeddings_file = '/Users/federiconanni/Downloads/wiki.de.vec'
#embeddings_file = 'C:/Users/Dr. J/Desktop/wiki.de.vec'
#embeddings_file = 'C:/Users/Dr. J/Desktop/wiki.en.vec'

# In-domain embeddings (switch to this for in-domain runs):
#embeddings_file = 'C:/QTA/topic-detect/in-domain-embeddings.txt'

# Mapped "big five" embeddings.
# Open question from earlier runs: does not work cross-lingually?!
embeddings_file = 'C:/DRJB/QTA Analysis/wordembeddings/edited.wiki.big-five.mapped.vec'

emb_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=False)



In [5]:
# The query can be a list of words on the same fine-grained topic, like
# "people" or "elites". Add more words, separated by spaces, to make it
# more precise.
# query = "volk bürger"

# Focus on the "key key" terms based on the dictionary and let the embeddings
# find more.
#query = "volk"
#query = "bürger einbürgerung steuerzahler gemeinschaft"
#query = "elite politiker establishment herrschend"
#query = "korruption täuschung betrügen verrat schämen skandal wahrheit unehrlich lüge"
#query = "verantwortung glaubwürdigkeit"
#query = "souverän neutral"
#query = "demokratisch referendum volksabstimmung volksinitiative"
#query = "konsens kompromiss"
#query = "repräsentation parlament regierung "
#query = "populisten populismus demagogisch demagogen"

# Populism at its best?
query = "volk elite souverän"

# Efficient flip-side query?
#query = "konsens repräsentation populismus"

# query = "volk elite souverän parlament regierung repräsentation"

#query = "people elite sovereign"

query_emb = text_embedding(query)

# text_embedding returns the string "Empty" when none of the query words has
# an embedding. Fail here with a clear message instead of passing the string
# on to the similarity computation further down.
if isinstance(query_emb, str):
    raise ValueError(
        "None of the query words in %r was found in the embedding model; "
        "check the query language and casing against the loaded embeddings."
        % query
    )

# Path to the folder that holds your manifestos as text documents
# (keep the trailing slash).
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"

# Only Germany, for less output.
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"

# German texts of the last three elections for Germany, Austria and
# Switzerland - 6X manifestos from polidoc.net
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/polidoc_allgerman/"
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/en/"

# Big five.
collection_path = "C:/DRJB/QTA Analysis/big_five_08_18/"


In [6]:
# Dictionary of documents (for example manifestos): file name -> list of
# [sentence, sentence embedding] pairs.
collection = {}

# Loop over the folder.
for filename in os.listdir(collection_path):
    file_path = os.path.join(collection_path, filename)

    # Skip sub-directories; only regular files can be read as documents.
    if not os.path.isfile(file_path):
        continue

    # Read the file as UTF-8; the context manager closes the handle again.
    with codecs.open(file_path, "r", "utf-8") as handle:
        raw_text = handle.read()

    # Split the document into sentences and embed each one exactly once.
    # Sentences without any known word come back as the "Empty" string
    # sentinel and are dropped.
    embedded_sentences = []
    for sent in nltk.sent_tokenize(raw_text):
        sent_emb = text_embedding(sent)
        if isinstance(sent_emb, str):
            continue
        embedded_sentences.append([sent, sent_emb])

    collection[filename] = embedded_sentences

In [7]:
# Information retrieval: for every document, rank its sentences by cosine
# similarity to the query embedding and print the best matches.

# Number of sentences printed per document; raise it (e.g. to 20) for more output.
top_n = 3

for filename, sentences in collection.items():

    if sentences:
        # Stack the (1, dim) sentence embeddings into one matrix so that the
        # similarity to the query is computed in a single call per document.
        sent_matrix = np.vstack([sent_emb for _, sent_emb in sentences])
        scores = cosine_similarity(query_emb, sent_matrix)[0]
        ranking = [[sent, score] for (sent, _), score in zip(sentences, scores)]
    else:
        # No sentence of this document could be embedded; nothing to rank.
        ranking = []

    # Most similar sentences first.
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    print (filename)
    for sent, score in ranking[:top_n]:
        print (sent, score)
    print (" \n")

31110.000.2007.1.1.txt
 

31110.000.2012.2.1.txt
 

31111.000.2007.1.1.txt
 

31320.000.2007.1.1.txt
 

31320.000.2012.1.1.txt
 

31320.000.2017.1.1.txt
 

31430.000.2017.1.1.txt
 

31624.000.2007.1.1.txt
 

31626.000.2007.1.1.txt
 

31626.000.2012.1.1.txt
 

31627.000.2017.1.1.txt
 

31720.000.2007.1.1.txt
 

31720.000.2017.1.1.txt
 

31951.000.2017.1.1.txt
 

32230.000.2018.1.1.txt
 

32340.000.2008.1.1.txt
 

32340.000.2013.1.1.txt
 

32340.000.2018.1.1.txt
 

32350.000.2018.1.1.txt
 

32404.000.2008.1.1.txt
 

32430.000.2013.1.1.txt
 

32440.000.2018.1.1.txt
 

32530.000.2008.1.1.txt
 

32530.000.2013.1.1.txt
 

32610.000.2018.1.1.txt
 

32730.000.2018.1.1.txt
 

32902.000.2008.1.1.txt
 

32993.000.2008.1.1.txt
 

32993.000.2013.1.1.txt
 

32995.000.2008.1.1.txt
 

32997.000.2013.1.1.txt
 

32997.000.2018.1.1.txt
 

33025.000.2016.1.1.txt
 

33095.000.2016.1.1.txt
 

33096.000.2015.1.1.txt
 

33096.000.2016.1.1.txt
 

33097.000.2016.1.1.txt
 

33098.000.2015.1.1.txt
 

33098.000.20