In [1]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import subprocess


exclude = set(string.punctuation)

# this represent any text as a single "doc-embedding" we use it both for the query and the sentences
# input should be a string
def text_embedding(text):
    
    #you should check in the embeddings you use if the words have been lowercased or not. 
    #try ask the embedding for "barack" and for "Barack"
    # if the Barack works, then comment the following line
    text = text.lower()
    
    # we tokenize the text in single words
    text = nltk.tokenize.WordPunctTokenizer().tokenize(text)
    
    # we remove numbers and punctuation
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    
    doc_embed = []
    
    # for each word we get the embedding and we append it to a list
    for word in text:
            try:
                embed_word = emb_model[word]
                doc_embed.append(embed_word)
            except KeyError:
                continue
    # we average the embeddings of all the words, getting an overall doc embedding
    if len(doc_embed)>0:
        avg = [float(sum(col))/len(col) for col in zip(*doc_embed)]

        avg = np.array(avg).reshape(1, -1)

        # the output is a doc-embedding
        return avg
    else:
        return "Empty"
    
def clean(text):
    text = text.replace("\n","").replace("\r","").replace("\t","")
    return text



In [2]:
#import here your word-embeddings - put the path to the file (if it's .bin change the binary to True)
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('../../resources/small-embeddings.txt', binary=False)

# german wikipedia from https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('in-domain-embeddings.txt', binary=False)
emb_model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/Dr. J/Desktop/wiki.de.vec', binary=False)

# switch for in-domain 
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('C:/QTA/topic-detect/in-domain-embeddings.txt', binary=False)



In [53]:
#this can be a list of words on the same fine-grained topic, like "people", "elites"
# add more words after a space to make it more precise
# query = "volk bürger"

# focus on "key key" terms based on dictionary, let the embeddings find more 
#query = "volk"
#query = "bürger einbürgerung steuerzahler gemeinschaft"
#query = "elite politiker establishment herrschend"
#query = "korruption täuschung betrügen verrat schämen skandal wahrheit unehrlich lüge"
#query = "verantwortung glaubwürdigkeit"
#query = "souverän neutral"
#query = "demokratisch referendum volksabstimmung volksinitiative"
#query = "konsens kompromiss"
#query = "repräsentation parlament regierung "
#query = "populisten populismus demagogisch demagogen"

query = "volk elite souverän"

query_emb = text_embedding(query)

# add the path to the folder where you have your manifestos as text documents
collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"

# only Germany for less output 
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"
# collection_path = "../../resources/deu2017/"

In [54]:
# this will be a dictionary of documents, for example manifestos, divided in sentences, which are represented as sentence embeddings
collection = {}

# you loop over the folder
for filename in [x for x in os.listdir(collection_path) if ".txt" in x]:
    # you open each file
    # note encoding 
    content = codecs.open(collection_path+filename,"r","utf-8").read()
    
    #remove breaklines
    content = clean(content)
    
    # you split it in sentences
    content = nltk.sent_tokenize(content)
    
    # you represent each sentence in each document as a word-embedding, which captures the meaning of the sentence
    content = [[sent, text_embedding(sent)] for sent in content if type(text_embedding(sent))!= str]
    collection[filename] = content

In [55]:
# filter on how many sentences you want to retrieve
#max_sent = 50
max_sent = 20

# filter on the cosine similarity

threshold = 0.50
#threshold = 0.30

In [56]:
# now, the information retrieval part


for filename,sentences in collection.items():
    
    # compare the cosine similarity between the embedding of the query and each sentence embedding
    ranking = [[sent, cosine_similarity(query_emb,sent_emb)[0][0]] for sent, sent_emb in sentences]
    # you rank them, based on the similarity
    ranking.sort(key=lambda x: x[1],reverse=True)
    
    # use this if you want to use max_sent
    out = " "
    for sent, score in ranking[:max_sent]:
        out += sent+" "
    
    # use this if you want to use cosine similarity trheshold (comment max_sent part)
    #out = " "
    #for sent, score in ranking:
    #    if score > threshold:
    #        out += sent+" "   

    # save selected sentences in files (so that you can use TopFish / Wordfish)
    #output = open("topic-output/filtered-"+filename,"w")
    output = open("C:/Users/Dr. J/Desktop/topic-output/filtered-"+filename,"w")
    output.write("DE\n"+out)
    output.close()



In [62]:
# add yours
#topfish_path = "/Users/federiconanni/topfish/scaler.py"
topfish_path = "U:/topfish-master/scaler_j.py"
#topfish_emb_path = "/Users/federiconanni/topfish/wiki.big-five.mapped.vec"
topfish_emb_path = "U:/topfish-master/embeddingspaces/wiki.big-five.mapped.vec"
out_file = "C:/Users/Dr. J/Desktop/output2"
subprocess.call("python "+topfish_path+" topic-output/ "+topfish_emb_path+" "+out_file, shell=True)

1

In [61]:
scaling = open(out_file,"r").read().strip().split("\n")
scaling = [x.split() for x in scaling]
scaling.sort(key=lambda x: x[1])
for el in scaling:
    print (" ".join(el))

filtered-de_cdu.txt 0.0
filtered-de_spd.txt 0.42394955439046234
filtered-ch_sp.txt 0.47209514913226625
filtered-de_afd.txt 0.7593411488510238
filtered-ch_svp.txt 1.0
