In [15]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import subprocess


exclude = set(string.punctuation)

# this represent any text as a single "doc-embedding" we use it both for the query and the sentences
# input should be a string
def text_embedding(text,lang):
    
    lang = lang.lower()
    
    #you should check in the embeddings you use if the words have been lowercased or not. 
    #try ask the embedding for "barack" and for "Barack"
    # if the Barack works, then comment the following line
    text = text.lower()
    
    # we tokenize the text in single words
    text = nltk.tokenize.WordPunctTokenizer().tokenize(text)
    
    # we remove numbers and punctuation
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    
    doc_embed = []
    
    # for each word we get the embedding and we append it to a list
    for word in text:
            try:
                embed_word = emb_model[lang+"__"+word]
                doc_embed.append(embed_word)
            except KeyError:
#                print ("not found:", word)
                continue
    # we average the embeddings of all the words, getting an overall doc embedding
    if len(doc_embed)>0:
        avg = [float(sum(col))/len(col) for col in zip(*doc_embed)]

        avg = np.array(avg).reshape(1, -1)

        # the output is a doc-embedding
        return avg
    else:
        return "Empty"
    
def clean(text):
    text = text.replace("\n","").replace("\r","").replace("\t","")
    return text

In [5]:
emb_model = gensim.models.KeyedVectors.load_word2vec_format('../../TextScaling/topfish/edited.wiki.big-five.mapped.vec', binary=False)


In [11]:

query = "elite politiker establishment herrschend"

lang = "DE"

res = emb_model.wv.most_similar(positive=['de__volk'], topn=50)


query_emb = text_embedding(query, lang)

# add the path to the folder where you have your manifestos as text documents
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"

# only Germany for less output 
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"
#collection_path = "../resources/deu2017/"

collection_path = "/Users/federiconanni/Dropbox/University/research/sparserhetoric/polidoc_bigfive_longitude/"



not found: herrschend


In [16]:
# this will be a dictionary of documents, for example manifestos, divided in sentences, which are represented as sentence embeddings
collection = {}

# Be cafeful here, you need to carefully map the language of the document so that I know which embeddings to use
# for the moment I use the first number in the filename

lang_map ={"4":"DE","5":"EN"}

for filename in [x for x in os.listdir(collection_path) if ".txt" in x]:
    lang = lang_map[filename[0]]
    print (filename, lang)

    # you open each file
    # note encoding 
    content = codecs.open(collection_path+filename,"r","utf-8").read()
    
    #remove breaklines
    content = clean(content)
    
    # you split it in sentences
    content = nltk.sent_tokenize(content)
    
    # you represent each sentence in each document as a word-embedding, which captures the meaning of the sentence
    content = [[sent, text_embedding(sent,lang)] for sent in content if type(text_embedding(sent,lang))!= str]
    collection[filename] = content

42420.000.2013.1.1.txt DE
41950.000.2009.1.1.txt DE
53202.000.2016.1.1.txt EN
53320.000.2007.1.1.txt EN
43530.000.2007.1.1.txt DE
51210.000.2010.1.1.txt EN
53110.000.2011.1.1.txt EN
51401.000.2010.1.1.txt EN
41420.000.2013.1.1.txt DE
51320.000.2017.1.1.txt EN
53951.000.2011.1.1.txt EN
42520.000.2017.1.1.txt DE
43810.000.2015.1.1.txt DE
42110.000.2013.1.1.txt DE
41113.000.2017.1.1.txt DE
43520.000.2007.1.1.txt DE
51620.000.2015.1.1.txt EN
53206.000.2016.1.1.txt EN
42421.000.2008.1.1.txt DE
51902.000.2015.1.1.txt EN
43320.000.2015.1.1.txt DE
53320.000.2016.1.1.txt EN
53621.000.2016.1.1.txt EN
43531.000.2007.1.1.txt DE
53620.000.2007.1.1.txt EN
43110.000.2011.1.1.txt DE
53520.000.2016.1.1.txt EN
43520.000.2015.1.1.txt DE
53220.000.2011.1.1.txt EN
41950.000.2013.1.1.txt DE
43810.000.2007.1.1.txt DE
41320.000.2017.1.1.txt DE
42320.000.2017.1.1.txt DE
41953.000.2017.1.1.txt DE
42450.000.2013.1.1.txt DE
51421.000.2010.1.1.txt EN
51620.000.2017.1.1.txt EN
42110.000.2008.1.1.txt DE
51902.000.20

In [17]:
# filter on how many sentences you want to retrieve
max_sent = 50

# filter on the cosine similarity

threshold = 0.50

In [19]:
# now, the information retrieval part


for filename,sentences in collection.items():
    
    lang = lang_map[filename[0]]
    # compare the cosine similarity between the embedding of the query and each sentence embedding
    ranking = [[sent, cosine_similarity(query_emb,sent_emb)[0][0]] for sent, sent_emb in sentences]
    # you rank them, based on the similarity
    ranking.sort(key=lambda x: x[1],reverse=True)
    
    # use this if you want to use max_sent
    out = " "
    for sent, score in ranking[:max_sent]:
        out += sent+" "
    
    # use this if you want to use cosine similarity trheshold (comment max_sent part)
    #out = " "
    #for sent, score in ranking:
    #    if score > threshold:
    #        out += sent+" "   

    # save selected sentences in files (so that you can use TopFish / Wordfish)
    output = open("topic-output/filtered-"+filename,"w")
    output.write(lang+"\n"+out)
    output.close()



In [21]:
# add yours

import shutil
shutil.rmtree('topic-output/.ipynb_checkpoints/')

topfish_path = "/Users/federiconanni/topfish/scaler.py"
topfish_emb_path = "/Users/federiconanni/topfish/wiki.big-five.mapped.vec"
out_file = "topic-scaling.txt"
subprocess.call("python "+topfish_path+" topic-output/ "+topfish_emb_path+" "+out_file, shell=True)

scaling = open(out_file,"r").read().strip().split("\n")
scaling = [x.split() for x in scaling]
scaling.sort(key=lambda x: x[1])
for el in scaling:
    print (" ".join(el))

filtered-51902.000.2017.1.1.txt 0.0
filtered-51902.000.2015.1.1.txt 0.0336582162474567
filtered-51301.000.2010.1.1.txt 0.048262262995620205
filtered-51903.000.2015.1.1.txt 0.06110037600156924
filtered-53202.000.2016.1.1.txt 0.14043733852124254
filtered-53220.000.2011.1.1.txt 0.14800291974196017
filtered-53520.000.2016.1.1.txt 0.15781466739111957
filtered-51330.000.2010.1.1.txt 0.16058377329110307
filtered-53620.000.2011.1.1.txt 0.17499753490746917
filtered-53520.000.2011.1.1.txt 0.17689637006279205
filtered-53620.000.2016.1.1.txt 0.1781059608907141
filtered-51101.000.2017.1.1.txt 0.19128128622247212
filtered-51320.000.2010.1.1.txt 0.19136855188626278
filtered-53621.000.2016.1.1.txt 0.19158794860509307
filtered-53951.000.2011.1.1.txt 0.19280910374109825
filtered-51620.000.2015.1.1.txt 0.19286259372929912
filtered-53620.000.2007.1.1.txt 0.19585305265628986
filtered-51421.000.2010.1.1.txt 0.1971705271354065
filtered-53951.000.2016.1.1.txt 0.197300943377592
filtered-51951.000.2017.1.1.txt 