In [6]:
import codecs, nltk, string, os, gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Punctuation characters that are dropped from tokenized text.
exclude = {symbol for symbol in string.punctuation}

# This represents any text as a single "doc-embedding"; we use it both for the query and for the sentences.
# The input should be a string.
def text_embedding(text):
    """Represent a text as one averaged word embedding (a "doc-embedding").

    The text is lowercased, split with NLTK's ``WordPunctTokenizer``, and
    reduced to purely alphabetic tokens that are not punctuation. Each
    remaining token is looked up in the module-level ``emb_model``; tokens
    whose lookup raises ``KeyError`` are skipped. The vectors that were found
    are averaged element-wise.

    Parameters
    ----------
    text : str
        The query or sentence to embed.

    Returns
    -------
    numpy.ndarray or str
        A float64 array of shape ``(1, dim)`` holding the mean vector, or the
        string ``"Empty"`` when no token produced an embedding. Callers
        distinguish the two cases by checking for ``str``.
    """
    # You should check whether the words in the embeddings you use have been
    # lowercased or not: try asking the embedding for "barack" and for "Barack".
    # If "Barack" works, comment out the following line.
    text = text.lower()

    # Tokenize the text into single words.
    tokens = nltk.tokenize.WordPunctTokenizer().tokenize(text)

    # Remove numbers and punctuation.
    tokens = [token for token in tokens if token not in exclude and token.isalpha()]

    # Collect the embedding of every word the model knows.
    doc_embed = []
    for word in tokens:
        try:
            doc_embed.append(emb_model[word])
        except KeyError:
            # No embedding available for this word; leave it out of the average.
            continue

    if not doc_embed:
        # Sentinel kept for backward compatibility with existing callers.
        return "Empty"

    # Average all word vectors in one vectorized step (accumulating in
    # float64) and return the result as a single-row 2-D array.
    return np.mean(doc_embed, axis=0, dtype=np.float64).reshape(1, -1)

In [7]:
# Load your word embeddings here: point `fname` at the vectors file
# (switch binary to True if the file is a .bin).
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('../../resources/small-embeddings.txt', binary=False)

# German Wikipedia vectors from https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('/Users/federiconanni/Downloads/wiki.de.vec', binary=False)
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/Dr. J/Desktop/wiki.de.vec', binary=False)
emb_model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='C:/Users/Dr. J/Desktop/wiki.en.vec',
    binary=False,
)

# Alternative: switch to the in-domain embeddings.
#emb_model = gensim.models.KeyedVectors.load_word2vec_format('C:/QTA/topic-detect/in-domain-embeddings.txt', binary=False)



In [8]:
# The query can be a list of words on the same fine-grained topic, like "people", "elites".
# Add more words, separated by spaces, to make it more precise.
# query = "volk bürger"

# Focus on the core key terms from the dictionary and let the embeddings find more.
#query = "volk"
#query = "bürger einbürgerung steuerzahler gemeinschaft"
#query = "elite politiker establishment herrschend"
#query = "korruption täuschung betrügen verrat schämen skandal wahrheit unehrlich lüge"
#query = "verantwortung glaubwürdigkeit"
#query = "souverän neutral"
#query = "demokratisch referendum volksabstimmung volksinitiative"
#query = "konsens kompromiss"
#query = "repräsentation parlament regierung "
#query = "populisten populismus demagogisch demagogen"

# populism at its best?
#query = "volk elite souverän"

# efficient flip-side query?
#query = "konsens repräsentation populismus"

# query = "volk elite souverän parlament regierung repräsentation"

query = "people elite sovereign"

query_emb = text_embedding(query)

# text_embedding returns a string sentinel instead of an array when none of
# the query words has an embedding. Fail here with a clear message rather than
# later, inside the similarity computation.
if isinstance(query_emb, str):
    raise ValueError(
        "None of the query words were found in the embedding model: %r" % query
    )

# Add the path to the folder where you have your manifestos as text documents.
# collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/deu2017/"

# only Germany for less output
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/germany17/"

# German texts last three elections for Germany, Austria and Switzerland - 63 manifestos from polidoc.net
#collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/polidoc_allgerman/"
collection_path = "C:/Users/Dr. J/Dropbox/sparserhetoric/en/"

In [9]:
# Dictionary of documents (for example manifestos): each file name maps to a
# list of [sentence, sentence_embedding] pairs.
collection = {}

# Loop over the folder. os.listdir returns entries in arbitrary order, so sort
# them to keep the processing (and later printing) order reproducible.
for filename in sorted(os.listdir(collection_path)):
    # os.path.join works whether or not collection_path ends with a separator.
    # Note the encoding: files are read as UTF-8. The context manager closes
    # the file handle as soon as the text has been read.
    with codecs.open(os.path.join(collection_path, filename), "r", "utf-8") as doc_file:
        raw_text = doc_file.read()

    # Split the document into sentences and represent each one as an averaged
    # word embedding. The embedding is computed once per sentence.
    content = []
    for sent in nltk.sent_tokenize(raw_text):
        sent_emb = text_embedding(sent)
        # text_embedding returns a string when no word of the sentence has an
        # embedding; such sentences are left out.
        if not isinstance(sent_emb, str):
            content.append([sent, sent_emb])
    collection[filename] = content

In [10]:
# Information retrieval: for every document, score each sentence against the
# query and show the best matches.

for filename, sentences in collection.items():

    # Pair every sentence with the cosine similarity between the query
    # embedding and its own embedding, ordered from most to least similar.
    ranking = sorted(
        ([text, cosine_similarity(query_emb, vector)[0][0]] for text, vector in sentences),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(filename)
    # Widen the slice (e.g. ranking[:20]) to get more sentences as output.
    for sent, score in ranking[:3]:
        print(sent, score)
    print(" \n")

51101.000.2010.1.1.txt
• Require all police forces to have equality
and diversity liaison officers whose remit
is to tackle, and take preventive action on,
crimes against LGBTI people, people from
ethnic minorities (including refugees and
asylum seekers) and disabled people. 0.6644086353270502
27

consumers and
citizens, public
and private

Most people living now are much richer than
people living in 1950. 0.6546856021686847
• Provide more rights for homeless people,
giving local authorities the same duties with
regard to single people and childless couples
as to families, and ending the practice of
declaring people ‘intentionally homeless’. 0.6413565476817209
 

51101.000.2017.1.1.txt
A CITIZENS’ DEMOCRACY

The Green Party believes that politics should be done by people – not to them. 0.6254089371842244
- Stop declaring people as
‘intentionally homeless’ and give Local Authorities the same duties towards single people and childless couples as to families. 0.6248138303105859
O

51903.000.2010.1.1.txt
66% of people have identified fear of crime as being the main problem
facing older people today. 0.6138030103001059
International
Affairs

We believe that too many powers have been ceded by our
national Government to the European Union. 0.6126521700468074
Families who lose loved ones deserve the fullest support from
the Government. 0.6115694453750534
 

51903.000.2015.1.1.txt
Create a society based
on fairness and
opportunity for everyone
FROM WESTMINSTER, WE WANT:
- Welfare Reform to incentivise work but also
protect those most in need;
- The removal of the bedroom tax at a
national level;
- A national entitlement to enhanced support for
members of the Armed Forces across the
United Kingdom;
- Victims and Members of the Armed Forces,
serving and retired to be given protected status,
akin to the protection for groups under Section
75 of the Northern Ireland Act;
- A UK wide definition of a victim which excludes
perpetrators;
- To ensure economic migrants must hav

53203.000.2011.1.1.txt
Meanwhile billions is being taken from working people and given to bankers, builders and international speculators. 0.6416320017330007
We are committed to building a mass left alternative to unite working people, whether public or private sector, Irish or migrant, with the unemployed, welfare recipients, pensioners and students in the struggle to change society. 0.6256079576948765
The ULA will be standing candidates throughout the country and we are inviting all people, campaigns and groups that want to fight for real change and who agree with our demands to become part of the Alliance. 0.5970491572562528
 

53204.000.2016.1.1.txt
- The richest 13 people in Ireland are billionaires with a wealth of €38 billion between them. 0.6497577389736149
- For a united struggle for jobs, decent housing, decent pay, against water charges etc to unite all ordinary people regardless of their race, ethnicity or nationality. 0.629389219100563
Global solidarity
The AAA stands in s

53951.000.2016.1.1.txt
At its core is a belief that all people are equal and all nations are sovereign. 0.6694780774926028
Accountability in politics
People have become justifiably disillusioned with politicians and the political system. 0.6526410381695372
The overall community and voluntary sector is worth billions to the economy, employs tens of thousands of people, and encompasses around 7,500 charitable, community and voluntary groups, which provide essential services to children, older people, people with disabilities or ill health, drug users, women and Travellers. 0.6463327781226964
 

