In [11]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import subprocess


# Set of the ASCII punctuation characters from string.punctuation;
# text_embedding() discards every token that is a member of this set.
exclude = set(string.punctuation)

# This represents any text as a single "doc-embedding"; we use it both for the query and for the sentences.
# The input should be a string.
def text_embedding(text,lang):
    """Average the word embeddings of `text` into one document embedding.

    The text is lowercased, tokenized with nltk's WordPunctTokenizer and
    reduced to purely alphabetic tokens that are not in `exclude`. Every
    remaining word is looked up in the global `emb_model` under the key
    "<lang>__<word>" (with `lang` lowercased); words without an entry are
    skipped silently.

    Parameters:
        text: the string to embed (a query or a single sentence).
        lang: language code used as key prefix, e.g. "DE" or "EN".

    Returns:
        A numpy array of shape (1, n_dimensions) holding the column-wise mean
        of the word vectors that were found, or the string "Empty" when no
        word of the text has an embedding.
    """
    key_prefix = lang.lower() + "__"

    # Check whether the words in your embeddings are lowercased: ask the model
    # for "barack" and for "Barack". If "Barack" works, do not lowercase here.
    lowered = text.lower()

    # split into single words, then drop numbers and punctuation
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(lowered)
    words = [tok for tok in tokens if tok not in exclude and tok.isalpha()]

    # collect the vector of every word the model knows
    vectors = []
    for word in words:
        try:
            vectors.append(emb_model[key_prefix + word])
        except KeyError:
            # out-of-vocabulary word: ignore it
            continue

    # nothing to average: signal it with the sentinel string
    if not vectors:
        return "Empty"

    # mean of each dimension over all word vectors, as a single-row 2-D array
    column_means = [float(sum(column)) / len(column) for column in zip(*vectors)]
    return np.array(column_means).reshape(1, -1)
    
def clean(text):
    """Return `text` with all newline, carriage-return and tab characters deleted."""
    # NOTE(review): the characters are deleted, not replaced by a space, so two
    # words separated only by a line break end up joined together.
    for unwanted in ("\n", "\r", "\t"):
        text = text.replace(unwanted, "")
    return text

In [12]:
# Word vectors in word2vec text format; text_embedding() looks words up in this
# model under keys of the form "<lang>__<word>".
# Alternative location: '../../../TextScaling/topfish/edited.wiki.big-five.mapped.vec'
embeddings_file = 'C:/Users/Dr. J/Desktop/edited.wiki.big-five.mapped.vec'

# binary=False: the file is read as plain text, not as the binary word2vec format
emb_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

In [13]:

# Query: a few space-separated keywords describing the topic to retrieve.
# Alternative queries:
# query = "elite politiker establishment herrschend"
# query = "people elite sovereign"
# query = "volk elite souverän"
# query = "parliament government representation"
query = "regierung parlament repräsentation"

# Language of the query; text_embedding() lowercases it and uses it as key prefix.
#lang = "EN"
lang = "DE"

query_emb = text_embedding(query, lang)

# text_embedding() returns the string "Empty" when none of the query words has an
# embedding. Stop here with a clear message instead of failing later, when the
# query embedding is compared with the sentence embeddings.
if isinstance(query_emb, str):
    raise ValueError("No embedding found for any word of the query %r (lang=%r)" % (query, lang))

# Path to the folder where you have your manifestos as text documents.
# Other locations:
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"
# only Germany for less output
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"
#collection_path = "../resources/deu2017/"
#collection_path = "/Users/federiconanni/Dropbox/University/research/sparserhetoric/polidoc_bigfive_longitude/"
collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/polidoc_bigfive_longitude/"


In [14]:
# Dictionary of documents (for example manifestos), divided into sentences:
# filename -> list of [sentence, sentence embedding] pairs.
collection = {}

# Be careful here: the language of each document must be mapped correctly so that
# the right embeddings are used. For the moment the first character of the
# filename decides the language.
lang_map ={"4":"DE","5":"EN"}

# only names that really end in ".txt" (not e.g. "x.txt.bak")
for filename in [x for x in os.listdir(collection_path) if x.endswith(".txt")]:
    lang = lang_map[filename[0]]
    print (filename, lang)

    # read the whole file as UTF-8; the with-block closes the handle again
    with codecs.open(os.path.join(collection_path, filename),"r","utf-8") as infile:
        content = infile.read()

    # remove line breaks and tabs
    content = clean(content)

    # split the document into sentences
    sentences = nltk.sent_tokenize(content)

    # Represent each sentence by its averaged word embedding. The embedding is
    # computed once per sentence; sentences for which text_embedding() returns
    # the "Empty" string (no known word) are left out.
    embedded = []
    for sent in sentences:
        sent_emb = text_embedding(sent,lang)
        if not isinstance(sent_emb, str):
            embedded.append([sent, sent_emb])
    collection[filename] = embedded

41113.000.2009.1.1.txt DE
41113.000.2013.1.1.txt DE
41113.000.2017.1.1.txt DE
41223.000.2009.1.1.txt DE
41223.000.2013.1.1.txt DE
41223.000.2017.1.1.txt DE
41320.000.2009.1.1.txt DE
41320.000.2013.1.1.txt DE
41320.000.2017.1.1.txt DE
41420.000.2009.1.1.txt DE
41420.000.2013.1.1.txt DE
41420.000.2017.1.1.txt DE
41521.000.2009.1.1.txt DE
41521.000.2013.1.1.txt DE
41521.000.2017.1.1.txt DE
41523.000.2017.1.1.txt DE
41702.000.2009.1.1.txt DE
41702.000.2013.1.1.txt DE
41950.000.2009.1.1.txt DE
41950.000.2013.1.1.txt DE
41953.000.2013.1.1.txt DE
41953.000.2017.1.1.txt DE
42110.000.2008.1.1.txt DE
42110.000.2013.1.1.txt DE
42110.000.2017.1.1.txt DE
42220.000.2013.1.1.txt DE
42220.000.2017.1.1.txt DE
42320.000.2008.1.1.txt DE
42320.000.2013.1.1.txt DE
42320.000.2017.1.1.txt DE
42420.000.2008.1.1.txt DE
42420.000.2013.1.1.txt DE
42420.000.2017.1.1.txt DE
42421.000.2008.1.1.txt DE
42422.000.2013.1.1.txt DE
42450.000.2013.1.1.txt DE
42450.000.2017.1.1.txt DE
42520.000.2008.1.1.txt DE
42520.000.20

In [15]:
# Maximum number of top-ranked sentences kept per document by the retrieval loop.
max_sent = 40

# Minimum cosine similarity to the query; only referenced by the threshold-based
# selection, which is commented out in the retrieval loop.

threshold = 0.50

In [16]:
# now, the information retrieval part

import shutil

# One location is used for cleaning, creating and writing, so that the folder
# that gets emptied is the same folder the filtered files are written to.
output_dir = "C:/Users/Dr. J/Desktop/topic-output/"

# Start from an empty output folder so that files of a previous run are not kept.
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    # nothing to remove on the first run
    pass
except Exception as e:
    # best effort: report the problem and carry on
    print (e)

os.makedirs(output_dir, exist_ok=True)


for filename,sentences in collection.items():

    lang = lang_map[filename[0]]
    # compare the cosine similarity between the embedding of the query and each sentence embedding
    ranking = [[sent, cosine_similarity(query_emb,sent_emb)[0][0]] for sent, sent_emb in sentences]
    # rank the sentences, most similar first
    ranking.sort(key=lambda x: x[1],reverse=True)

    # use this if you want to use max_sent
    out = " " + "".join(sent+" " for sent, score in ranking[:max_sent])

    # use this if you want to use the cosine similarity threshold (comment out the max_sent line)
    #out = " " + "".join(sent+" " for sent, score in ranking if score > threshold)

    # save the selected sentences in files (so that you can use TopFish / Wordfish);
    # the first line of each file is the language code of the document
    with codecs.open(output_dir+"filtered-"+filename,"w","UTF-8") as output:
        output.write(lang+"\n"+out)



In [None]:
# add yours

topfish_emb_path = "/Users/federiconanni/topfish/wiki.big-five.mapped.vec"
topfish_path = "/Users/federiconanni/topfish/scaler.py"
out_file = "topic-scaling.txt"

# NOTE(review): the retrieval cell writes its files to an absolute Desktop folder,
# not to the relative "topic-output/" folder passed here - confirm which one is meant.
# The arguments are passed as a list (no shell), so paths that contain spaces are
# not split. check_call raises CalledProcessError when the scaler exits with a
# non-zero status, so a stale out_file from an earlier run is never read.
subprocess.check_call(["python", topfish_path, "topic-output/", topfish_emb_path, out_file])

with open(out_file,"r") as scaling_file:
    scaling = [line.split() for line in scaling_file.read().strip().split("\n")]

# drop blank lines, which split() turns into empty lists
scaling = [row for row in scaling if row]

# Sort by score as a number, not as text ("10.0" would sort before "9.0" as text).
# Assumes the second column of every line is the numeric score - TODO confirm
# against the scaler's output format.
scaling.sort(key=lambda x: float(x[1]))
for el in scaling:
    print (" ".join(el))

In [23]:
# alternative JB machine
# Runs the TopFish scaling in-process instead of through a subprocess.

import os
# the project modules imported below are resolved relative to this folder
os.chdir('M:\\topfish-master')

from embeddings import text_embeddings
import nlp
from helpers import io_helper
from sts import simple_sts 
from sys import stdin
import argparse
from datetime import datetime

# language code -> language name; used below to normalise the first line of each input file
supported_lang_strings = {"en" : "english", "fr" : "french", "de" : "german", "es" : "spanish", "it" : "italian"}

parser = argparse.ArgumentParser(description='Performs text scaling (assigns a score to each text on a linear scale).')
parser.add_argument('-d', '--datadir', default='C:\\Users\\Dr. J\\Desktop\\topic-output', help='A path to the directory containing the input text files for scaling (one score will be assigned per file).')
parser.add_argument('-e','--embs', default='M:\\topfish-master\\embeddingspaces\\wiki.big-five.mapped.vec', help='A path to the file containing pre-trained word embeddings')
parser.add_argument('-o', '--output', default='C:\\Users\\Dr. J\\Desktop\\output2',  help='Path for scaling results.')
parser.add_argument('-s', '--stopwords', default='M:\\topfish-master\\stopwords\\bigfive.txt', help='Path for stopwords')

# Inside a notebook sys.argv holds the kernel's own arguments (e.g. "-f <connection file>"),
# which parse_args() rejects with "unrecognized arguments". parse_known_args()
# ignores them, so the defaults above apply; the options still work from a command line.
args, _unknown_args = parser.parse_known_args()

# each loaded entry is indexed as (filename, text)
files = io_helper.load_all_files(args.datadir)
filenames = [x[0] for x in files]
texts = [x[1] for x in files]

# the first line of every text is its language, either as code ("de") or as name ("german")
languages = [x.split("\n", 1)[0].strip().lower() for x in texts]
# normalise to the language name; an unsupported code raises KeyError
langs = [(l if l in supported_lang_strings.values() else supported_lang_strings[l]) for l in languages]

stopwords = io_helper.load_lines(args.stopwords)

predictions_serialization_path = args.output

embeddings = text_embeddings.Embeddings()
embeddings.load_embeddings(args.embs, limit = 1000000, language = 'default', print_loading = True, skip_first_line = True)

nlp.scale_efficient(filenames, texts, langs, embeddings, predictions_serialization_path, stopwords)
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " Scaling completed.", flush = True)


ModuleNotFoundError: No module named 'tensorflow'