In [1]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import subprocess


# Set of the characters in string.punctuation; text_embedding drops any token found in this set.
exclude = set(string.punctuation)

# Represents any text as a single "doc embedding"; used for both the query and the sentences.
# The input should be a string.
def text_embedding(text):
    """Return the averaged word embedding of ``text``.

    The text is lowercased, split with NLTK's ``WordPunctTokenizer`` and
    reduced to purely alphabetic tokens that are not in ``exclude``. Each
    remaining token is looked up in the module-level ``emb_model``; tokens
    whose lookup raises ``KeyError`` are ignored.

    Returns a NumPy array reshaped to ``(1, -1)`` that holds the
    per-dimension mean of the word vectors found, or the string ``"Empty"``
    when no token could be looked up.
    """
    # Lowercasing assumes the embedding vocabulary is lowercased. Check by
    # asking the model for "barack" and "Barack"; if "Barack" works, remove
    # this step.
    lowered = text.lower()

    # Split into single words, then drop numbers and punctuation.
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(lowered)
    words = [tok for tok in tokens if tok.isalpha() and tok not in exclude]

    # Collect the vector of every word the model knows.
    vectors = []
    for word in words:
        try:
            vectors.append(emb_model[word])
        except KeyError:
            pass

    # Nothing to average: signal it with the sentinel string callers test for.
    if not vectors:
        return "Empty"

    # Column-wise mean over all word vectors gives the doc embedding.
    mean_vector = [float(sum(dim)) / len(dim) for dim in zip(*vectors)]
    return np.array(mean_vector).reshape(1, -1)
    
def clean(text):
    """Replace every newline, carriage return and tab in ``text`` with a space.

    The characters are replaced rather than deleted so that words separated
    only by a line break stay separate tokens instead of being glued
    together. Runs of such characters yield runs of spaces, which the
    tokenizers used later ignore.

    Returns the cleaned string.
    """
    for whitespace_char in ("\n", "\r", "\t"):
        text = text.replace(whitespace_char, " ")
    return text



In [2]:
# import here your word-embeddings - put the path to the file (if it's .bin change the binary to True)
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('../../resources/small-embeddings.txt', binary=False)
# german wikipedia from https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
# Path to the word-embedding file in word2vec text format (binary=False below).
embeddings_path = 'C:/Users/Dr. J/Desktop/wiki.de.vec'
emb_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

# switch for in-domain 
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('in-domain-embeddings.txt', binary=False)
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('C:/QTA/topic-detect/in-domain-embeddings.txt', binary=False)

# big five - not working 
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('U:/topfish-master/embeddingspaces/wiki.big-five.mapped.vec', binary=False)

In [15]:
#this can be a list of words on the same fine-grained topic, like "people", "elites"
# add more words after a space to make it more precise
# query = "volk bürger"

# focus on "key key" terms based on dictionary, let the embeddings find more 
#query = "volk"
#query = "bürger einbürgerung steuerzahler gemeinschaft"
#query = "elite politiker establishment herrschend"
#query = "korruption täuschung betrügen verrat schämen skandal wahrheit unehrlich lüge"
#query = "verantwortung glaubwürdigkeit"
#query = "souverän neutral"
#query = "demokratisch referendum volksabstimmung volksinitiative"
#query = "konsens kompromiss"
#query = "repräsentation parlament regierung"
#query = "populisten populismus demagogisch demagogen"

# query = "volk elite souverän"
# query = "volk elite politiker establishment herrschend souverän"
# query = "konsens kompromiss repräsentation populisten populismus"

# populism at its best?
# query = "volk elite souverän"

# efficient flip-side query? 
query = "konsens repräsentation populismus"

# text_embedding returns the string "Empty" when none of the query words is
# found in the embedding model. Stop here with a clear message instead of
# failing later when the query embedding is compared with the sentences.
query_emb = text_embedding(query)
if isinstance(query_emb, str):
    raise ValueError("None of the query words were found in the embedding model: " + repr(query))


In [16]:
# add the path to the folder where you have your manifestos as text documents

# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"
# collection_path = "../../resources/deu2017/"

# only Germany for less output 
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"

# all German manifestos from polidoc as of 180911
# collection_path = "U:/QTA PA/polidoc_allgerman/"

# German texts last three elections for Germany, Austria and Switzerland - 63 manifestos from polidoc.net
# Keep the trailing slash: the loading loop builds file paths as collection_path + filename.
collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/polidoc_allgerman/"

# full sample eight countries all manifestos on polidoc (will be updated)
# collection_path = "U:/QTA PA/polidoc big five 180910/"

In [17]:
# this will be a dictionary of documents, for example manifestos, divided in sentences, which are represented as sentence embeddings
# Maps each file name to a list of [sentence, sentence_embedding] pairs.
collection = {}

# Loop over the folder. Only names that end in ".txt" are read (a plain
# substring test would also match e.g. "notes.txt.bak"); sorting gives a
# reproducible processing order.
for filename in sorted(x for x in os.listdir(collection_path) if x.endswith(".txt")):
    print (filename)

    # Note the encoding: "utf-8-sig" drops a leading byte-order mark if present.
    # The with-block closes the file handle as soon as the text has been read.
    with codecs.open(collection_path+filename,"r","utf-8-sig") as infile:
        content = infile.read()

    # remove line breaks
    content = clean(content)

    # split the document into sentences
    sentences = nltk.sent_tokenize(content)

    # Embed every sentence exactly once. text_embedding returns the string
    # "Empty" for a sentence without any known word; those are skipped.
    embedded = []
    for sent in sentences:
        sent_emb = text_embedding(sent)
        if not isinstance(sent_emb, str):
            embedded.append([sent, sent_emb])
    collection[filename] = embedded

41113.000.2009.1.1.txt
41113.000.2013.1.1.txt
41113.000.2017.1.1.txt
41223.000.2009.1.1.txt
41223.000.2013.1.1.txt
41223.000.2017.1.1.txt
41320.000.2009.1.1.txt
41320.000.2013.1.1.txt
41320.000.2017.1.1.txt
41420.000.2013.1.1.txt
41420.000.2017.1.1.txt
41521.000.2009.1.1.txt
41521.000.2013.1.1.txt
41521.000.2017.1.1.txt
41523.000.2017.1.1.txt
41702.000.2009.1.1.txt
41702.000.2013.1.1.txt
41950.000.2009.1.1.txt
41950.000.2013.1.1.txt
41953.000.2013.1.1.txt
41953.000.2017.1.1.txt
42110.000.2008.1.1.txt
42110.000.2013.1.1.txt
42110.000.2017.1.1.txt
42220.000.2013.1.1.txt
42220.000.2017.1.1.txt
42320.000.2008.1.1.txt
42320.000.2013.1.1.txt
42320.000.2017.1.1.txt
42420.000.2013.1.1.txt
42420.000.2017.1.1.txt
42421.000.2008.1.1.txt
42422.000.2013.1.1.txt
42450.000.2013.1.1.txt
42450.000.2017.1.1.txt
42520.000.2008.1.1.txt
42520.000.2013.1.1.txt
42520.000.2017.1.1.txt
42951.000.2008.1.1.txt
42952.000.2008.1.1.txt
43110.000.2007.1.1.txt
43110.000.2011.1.1.txt
43110.000.2015.1.1.txt
43120.000.2

In [18]:
# Maximum number of top-ranked sentences kept per document (the retrieval loop slices ranking[:max_sent]).
#max_sent = 50
max_sent = 20

# Cosine-similarity cut-off. Only the commented-out threshold variant of the
# retrieval loop reads this value; the active max_sent variant does not use it.

threshold = 0.55
#threshold = 0.30

In [19]:
# now, the information retrieval part


# Folder that receives one "filtered-<filename>" file per document.
output_dir = "C:/Users/Dr. J/Desktop/topic-output/"
# Create the folder up front so opening the output files cannot fail on a missing directory.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for filename,sentences in collection.items():

    # cosine similarity between the query embedding and each sentence embedding
    ranking = [[sent, cosine_similarity(query_emb,sent_emb)[0][0]] for sent, sent_emb in sentences]
    # rank the sentences, most similar first
    ranking.sort(key=lambda x: x[1],reverse=True)

    # use this if you want to use max_sent
    selected = [sent for sent, score in ranking[:max_sent]]

    # use this if you want to use the cosine similarity threshold (comment out the max_sent line)
    #selected = [sent for sent, score in ranking if score > threshold]

    # Text layout: a leading space, then every selected sentence followed by a single space.
    out = " " + "".join(sent + " " for sent in selected)

    # Save the selected sentences (so that you can use TopFish / Wordfish).
    # The first line of each file is "DE" - presumably the language tag the
    # scaler expects; confirm against the TopFish input format.
    # The with-block closes the file even if the write fails.
    with codecs.open(output_dir+"filtered-"+filename,"w","UTF-8") as output:
        output.write("DE\n"+out)
    
    # note: the filtered files are already written as UTF-8 (see the codecs.open call above)



In [54]:
# add yours
#topfish_path = "/Users/federiconanni/topfish/scaler.py"
topfish_path = "M:/topfish-master/scaler_j.py"
#topfish_emb_path = "/Users/federiconanni/topfish/wiki.big-five.mapped.vec"
topfish_emb_path = "M:/topfish-master/embeddingspaces/wiki.big-five.mapped.vec"
out_file = "C:/Users/Dr. J/Desktop/output2"

# NOTE(review): the retrieval cell writes its files to
# "C:/Users/Dr. J/Desktop/topic-output/", while this relative path depends on
# the current working directory - confirm both point at the same folder.
topfish_input_dir = "topic-output/"

# Pass the arguments as a list and without a shell: out_file contains a space
# ("Dr. J"), which a concatenated shell command line splits into two arguments.
# The call stays the last expression so the notebook shows the return code.
subprocess.call(["python", topfish_path, topfish_input_dir, topfish_emb_path, out_file])
# not running, I do it separately

1

In [20]:
# Same value as the out_file passed to the TopFish call; presumably repeated so the
# results cell can run when the scaler was started outside the notebook - confirm.
out_file = "C:/Users/Dr. J/Desktop/output2"

In [21]:
# Read the scaler output: one whitespace-separated record per line. The second
# field is assumed to be a numeric score (as in the recorded output).
# The with-block closes the file; blank lines are skipped so they cannot raise
# an IndexError in the sort key.
with open(out_file,"r") as scaling_file:
    scaling = [line.split() for line in scaling_file.read().strip().split("\n") if line.strip()]

# Sort by the numeric value of the score. Comparing the raw strings would order
# negative numbers and exponent notation (e.g. "1e-05") incorrectly.
scaling.sort(key=lambda x: float(x[1]))
for el in scaling:
    print (" ".join(el))

filtered-41320.000.2009.1.1.txt 0.0
filtered-43120.000.2015.1.1.txt 0.04728506188869085
filtered-43520.000.2011.1.1.txt 0.14044908243667079
filtered-41113.000.2009.1.1.txt 0.1587914003730955
filtered-41950.000.2009.1.1.txt 0.15886547104348359
filtered-41523.000.2017.1.1.txt 0.17676665438037886
filtered-42520.000.2017.1.1.txt 0.18959254452311142
filtered-41521.000.2013.1.1.txt 0.19317640129247451
filtered-42320.000.2008.1.1.txt 0.19922711890125794
filtered-42520.000.2008.1.1.txt 0.2013186874190862
filtered-43530.000.2015.1.1.txt 0.2616945658566007
filtered-41953.000.2017.1.1.txt 0.27731381532509036
filtered-42320.000.2013.1.1.txt 0.28170931854455333
filtered-41420.000.2017.1.1.txt 0.28574264370325225
filtered-41320.000.2013.1.1.txt 0.2918947031068171
filtered-43954.000.2015.1.1.txt 0.3015327043224242
filtered-43520.000.2007.1.1.txt 0.3093197615353843
filtered-41950.000.2013.1.1.txt 0.3290802067937827
filtered-41113.000.2017.1.1.txt 0.34470495049818006
filtered-41521.000.2017.1.1.txt 0.3