# Automatic Political Stance Detection

A combination of topic modelling techniques and sentiment analysis applied to British newspaper articles written about the topic Brexit

![Pipeline](images/pipeline_ANLP_project.png)

In [1]:
%%html
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>

In [13]:
# All imports
import re, string, gensim, os
from nltk.tokenize import word_tokenize
from subprocess import Popen, PIPE, STDOUT
from nltk.corpus import stopwords
from gensim import corpora, models
from pprint import pprint
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import numpy as np
from itertools import groupby
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Preprocessing

To make our data suitable for use with the topic modelling techniques & sentiment analysis, we first extract the headlines and articles, using regular expressions. This removes all metadata that is included in the downloads from LexisNexis. We then tokenise and lemmatise the articles and remove stopwords.

### Cleaning

In [None]:
with open("sample_for_notebook.txt") as f_raw:
    print("Raw data\n")
    print(f_raw.read())

In [None]:
def extract_headlines(cleaned_article, output_file_hl):
    '''Extracts just the headline from the raw data and saves it to a text file. 
    Used in conjunction with the function clean_corpora.'''
    
    with open(output_file_hl, "a") as output_hl:
        hl = re.sub(r"(BYLINE:.*|SECTION:.*|BODY: .*)$", "", cleaned_article)
        print(hl.strip(), file=output_hl)
        
def clean_corpora(input_file, output_file, output_file_headlines=None, guardian=False, sun=False, telegraph=False, guardian_hl=False, 
                  sun_hl=False, telegraph_hl=False):
    '''Takes the raw LexisNexis files as input and extracts the headlines & articles and just headlines (optional), 
    writes them to a text file. Due to subtle formatting differences between the newspapers, 
    there are slight differences in the preprocessing'''
    
    open(output_file, 'w').close() #clears output file
    
    text = open(input_file).read()
    split_text = text.strip().split("All Rights Reserved")
    
    with open(output_file, "a") as output:
        for un_article in split_text:
            un_article1 = un_article.replace("\n", " ")
            article3 = un_article1.replace("\\", "")
            article2 = re.sub(r"(BYLINE: .* [0-9]* words|SECTION: .* LETTER|SECTION: .* words)", "", article3)
            article = re.sub(r"(LOAD-DATE:.*|LOAD-DATE)$", "", article2)
    
            if guardian:          
                article_g = re.sub(r"^(.*GMT)", "", article)
                print(article_g.strip(), file=output)
                
                if guardian_hl:
                    article_g1 = re.sub(r"^(.*GMT)", "", article3)
                    extract_headlines(article_g1, output_file_headlines)
                               
            if sun:
                article_s = re.sub(r"^(.*National Edition|.*Edition [0-9]*;)", "", article)
                article_s = re.sub(r"([a-zA-Z.]*@the[-]*sun\.co\.uk.*)$", "", article_s)
                print(article_s.strip(), file=output)
            
                if sun_hl:
                    article_s1 = re.sub(r"^(.*National Edition|.*Edition [0-9]*;)", "", article3)
                    extract_headlines(article_s1, output_file_headlines)
                    
            if telegraph:
                article_t1 = re.sub(r"^(.*National Edition|.*Edition [0-9]*;|Scotland)", "", article)
                article_t = re.sub(r"(Copyright [0-9]+ Telegraph Media Group Limited|BYLINE: .* BODY:|[0-9]* of [0-9]* DOCUMENTS|HEADLINE:)", "", article_t1)
                print(article_t.strip(), file=output)
                
                if telegraph_hl:
                    article_t1 = re.sub(r"(HEADLINE:|Scotland)", "", article_t1)
                    extract_headlines(article_t1, output_file_headlines)

In [None]:
clean_corpora("sample_for_notebook.txt", "sample_output.txt", "sample_output_headlines.txt",
             guardian=True, guardian_hl=True)

with open("sample_output.txt") as f, open("sample_output_headlines.txt") as f_hl:
    print("\nHeadline and article\n")
    print(f.read())
    print("\nHeadline\n")
    print(f_hl.read())

### Tokenization

In [14]:
# reads in the corpus, one headline/article per line
# returns an array of headlines/articles
def read_corpus(input_file):
	corpus = []
	with open(input_file) as f:
		for line in f.readlines():
			if line != '\n':
				corpus.append(line.strip())
	return corpus

# the tokenizer provided by nltk is used 
# returns a list of tokenized headlines/articles 
def tokenize_corpus(articles):
	tokenized_articles = []
	for sentence in articles:
		tokenized_articles.append(word_tokenize(sentence))
	return tokenized_articles

# Writes one tokenized article per line in a file
def write_to_file(tokenized_articles, output_file):
	with open(output_file, 'w') as o:
		for article in tokenized_articles:
			o.write(" ".join(article))
			o.write("\n")

In [15]:
sample_corpus = read_corpus("sample_output_headlines.txt")
tokenized_sample = tokenize_corpus(sample_corpus)
write_to_file(tokenized_sample, "tokenized_output_file.txt")
print(tokenized_sample)

[['It', 'wo', "n't", 'be', 'easy', 'to', 'stop', 'Brexit', '.', 'But', 'here', 'are', 'four', 'ways', 'to', 'do', 'it', ';', 'Chip', 'away', ',', 'every', 'day', '.', 'This', 'is', 'a', 'long', 'game', 'but', ',', 'as', 'harsh', 'reality', 'bites', ',', 'time', 'will', 'be', 'on', 'the', 'side', 'of', 'the', 'remainers']]


### Lemmatization

In [16]:
# reads in the corpus, one headline/article per line
# returns an array of headlines/articles
def get_sentences(input_file):
	sentences = []
	with open(input_file) as f:
		for line in f.readlines():
			if line != "\n":
				sentences.append(line.strip())
	return sentences

# the headlines/articles are lemmatized using the Treetagger
# returns a list of lists with the original word, the POS-Tag and the lemma per article
def annotate_articles(sentences):
	annotated_articles = []
	for sentence in sentences:
		sentence.lower()
		p = Popen(['/Applications/Treetagger/cmd/tree-tagger-english'], stdout=PIPE, stdin=PIPE, stderr=PIPE, encoding="utf8")
		out = p.communicate(input=sentence)[0]
		article = []
		annotated_words = out.split("\n")
		for word_anno in annotated_words:
			word_anno = word_anno.split("\t")
			if len(word_anno) == 3:
				annotation = (word_anno[0], (word_anno[1], word_anno[2]))
				article.append(annotation)
		annotated_articles.append(article)
	return annotated_articles

# for each word in an article the lemma is extracted
# if lemma is unknown, the original word is chosen
# return a list of lemmatized articles
def get_lemma(articles):
	lemma_articles = []
	for article in articles:
		lemma_article = []
		for word, anno in article:
			if anno[1] == "<unknown>":
				lemma_article.append(word)
			else:
				lemma_article.append(anno[1])
		lemma_articles.append(lemma_article)
	return lemma_articles

# Writes one lemmatized article per line in a file
def write_to_file(lemma_articles, out_file):
	with open(out_file, "w") as out:
		for article in lemma_articles:
			out.write(" ".join(article))
			out.write("\n")

In [17]:
lemmatized_sentences = get_sentences("tokenized_output_file.txt")
annotated_sample = annotate_articles(lemmatized_sentences)
lemmatised_sample = get_lemma(annotated_sample)
write_to_file(lemmatised_sample, "lemmatised_output_file.txt")
print(lemmatised_sample)

[['it', 'wo', "n't", 'be', 'easy', 'to', 'stop', 'Brexit', '.', 'but', 'here', 'be', 'four', 'way', 'to', 'do', 'it', ';', 'chip', 'away', ',', 'every', 'day', '.', 'this', 'be', 'a', 'long', 'game', 'but', ',', 'as', 'harsh', 'reality', 'bite', ',', 'time', 'will', 'be', 'on', 'the', 'side', 'of', 'the', 'remainers']]


### Stop Word Removal

In [None]:
def remove_stop_words(processed_file, output_file):
    '''Takes the lemmatized, tokenized file as input and removes punctuation, stopwords and other strangely formatted
    words/punctuation'''
    
    stop_words_nltk = list(stopwords.words('english'))
    to_delete = ["brexit", "``", "''", "'s", "·", "wo", "n't", "...", "@card@"]
    
    stop_words = stop_words_nltk + to_delete + list(string.punctuation)

    with open(output_file, "w") as output:
        to_be_filtered = open(processed_file).readlines()
        corpus_list = [line.strip() for line in to_be_filtered]
        corpus_nlist = [each_word.split() for each_word in corpus_list]
        for sentence in range(0, len(corpus_nlist)):
            for word in corpus_nlist[sentence]:
                word = re.sub(r"^(\')", "", word) #getting rid of the ' at the beginning of some words
                if word.lower() not in stop_words:
                    output.write(word)
                    output.write(" ")
            output.write("\n") #line break after each article/hl

In [None]:
remove_stop_words("lemmatised_output_file.txt", "filtered_output_file.txt")
with open("filtered_output_file.txt") as filtered,
    print(filtered.read())

## LDA

To get an interpretation of the content of the newspaper articles, we apply topic modeling to them. 
We use a Latent Dirichlet Allocation (LDA) model from the package gensim. 
This model assumes that our articles were written using a "generative process" in which every word of the article is created out of a distribution of words within a topic. 
In a first step, this distribution is created by pre-defining a number of topics and then building a model over the whole corpus. In a second step, the model is applied to the articles to find the specific topic mixture which best represents the article. These topic mixtures make up our feature vectors for the clustering.

In [22]:
class LDA():
    '''
    This class implements a Latent Dirichlet Allocation. Initializing one instance creates a
    model and saves it to a folder. It also uses this model to create a topic distribution
    vector for every article in the corpora
    '''
    def __init__(self, filename1, filename2, num_topics=10, no_below=20, no_above=0.5):
        self.num_topics = num_topics
        self.corpus = self.createCorpus(filename1, filename2)
        self.model_corpus, self.dictionary = self.createModelCorpus(no_below, no_above)
        try:
            self.model = gensim.models.LdaModel.load("LDA_Models/ldamodel_topics="+str(num_topics)+"_no_above="+str(no_above))
        except:
            self.model = self.train_lda()
            self.model.save("LDA_Models/ldamodel_topics="+str(num_topics)+"_no_above="+str(no_above))

        self.corpus_feature_vectors = self.apply_lda()
        self.final_output = self.createFinalOutput()

    def createCorpus(self, filename1, filename2):
        ''' 
        Creating lists of tokens in a list of articles for further processing
        '''
        with open(filename1,encoding="utf-8") as f1:
            with open(filename2, encoding="utf-8") as f2:
                all_articles = f1.readlines()
                all_articles.extend(f2.readlines())
                corpus = [[token for token in article.strip().split(" ")] for article in all_articles]
        return corpus

    def createModelCorpus(self, no_below, no_above):
        ''' 
        Creating a model corpus and a dictionary for the lda model
        ''' 
        dictionary = corpora.Dictionary(self.corpus)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        model_corpus = [dictionary.doc2bow(token) for token in self.corpus]
        return model_corpus, dictionary

    def train_lda(self):
        ''' 
        Creating a LDA model from the whole corpus
        ''' 
        ldamodel = gensim.models.ldamodel.LdaModel(self.model_corpus, num_topics=self.num_topics, id2word=self.dictionary, passes=10)
        return ldamodel

    def apply_lda(self):
        ''' 
        applying the LDA model to all articles to get the feature vectors
        ''' 
        corpus_feature_vectors = []
        for article in self.model_corpus:
            corpus_feature_vectors.append(self.model[article])
        return corpus_feature_vectors

    def createFinalOutput(self):
        ''' 
        arrange the feature vectors in a list
        ''' 
        output = []
        for vec in self.corpus_feature_vectors:
            vec_dic = dict(vec)
            output.append([(vec_dic[topic] if topic in vec_dic else 0) for topic in range(10)])
        return output

    def get_topics(self):
        ''' 
        show the topics
        ''' 
        return(self.model.show_topics(num_topics=12,num_words=5))

### Applying LDA

Applying LDA and having a look at the 12 topics with the 5 most important words.

In [23]:
filename1 = "Corpora/filtered/filteredjust_hl_article_guardian_lemmatized.txt"
filename2 = "Corpora/filtered/filteredhl_article_tele_lemmatized.txt"

lda = LDA(filename1, filename2, num_topics=12, no_below=20, no_above=0.7)

corpus_feature_vectors = lda.corpus_feature_vectors
output = lda.final_output
lda.get_topics()
output

[(0,
  '0.020*"Scotland" + 0.013*"Scottish" + 0.011*"market" + 0.010*"trade" + 0.010*"business"'),
 (1,
  '0.019*"Labour" + 0.013*"party" + 0.010*"vote" + 0.009*"people" + 0.006*"Corbyn"'),
 (2,
  '0.011*"business" + 0.010*"deal" + 0.010*"financial" + 0.009*"European" + 0.009*"London"'),
 (3,
  '0.019*"vote" + 0.016*"referendum" + 0.009*"Scotland" + 0.009*"MP" + 0.008*"Scottish"'),
 (4,
  '0.010*"May" + 0.010*"deal" + 0.009*"European" + 0.009*"British" + 0.008*"Europe"'),
 (5,
  '0.015*"economy" + 0.012*"growth" + 0.010*"vote" + 0.010*"year" + 0.009*"market"'),
 (6,
  '0.023*"Mr" + 0.017*"Johnson" + 0.010*"May" + 0.010*"Hammond" + 0.009*"Secretary"'),
 (7,
  '0.015*"Ireland" + 0.010*"border" + 0.010*"May" + 0.009*"right" + 0.008*"Northern"'),
 (8,
  '0.025*"May" + 0.014*"Mrs" + 0.012*"European" + 0.011*"Mr" + 0.010*"talk"'),
 (9,
  '0.008*"report" + 0.007*"year" + 0.007*"trade" + 0.006*"work" + 0.006*"civil"'),
 (10,
  '0.017*"MP" + 0.014*"bill" + 0.012*"vote" + 0.009*"deal" + 0.009*"p

In [24]:
output


[[0, 0.9718892, 0, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0.12120435,
  0,
  0.021611527,
  0.5158749,
  0.026185617,
  0,
  0.10589536,
  0,
  0.08440425],
 [0, 0.44612977, 0, 0.05269092, 0.010774608, 0, 0, 0, 0.059226174, 0],
 [0, 0.5206154, 0, 0.12177483, 0.3224256, 0, 0, 0.033750404, 0, 0],
 [0, 0, 0, 0, 0.33307245, 0, 0.26109496, 0, 0.28407776, 0],
 [0, 0.21822652, 0.01572969, 0, 0.18744518, 0, 0, 0.2803045, 0, 0.28320125],
 [0.017394844,
  0.03261329,
  0,
  0,
  0.41975024,
  0.032010913,
  0.08687203,
  0.26521394,
  0,
  0],
 [0, 0.6533546, 0, 0, 0.050445948, 0, 0, 0.078066185, 0, 0],
 [0, 0.010546313, 0.031213995, 0, 0.036904972, 0, 0, 0.5375376, 0.26877075, 0],
 [0, 0.27896813, 0, 0, 0.44386485, 0, 0.11283731, 0.11116003, 0, 0.043329142],
 [0, 0.9432148, 0, 0, 0, 0.053710096, 0, 0, 0, 0],
 [0,
  0.024640767,
  0.0135594355,
  0,
  0.64833385,
  0,
  0,
  0,
  0.030410817,
  0.056834187],
 [0, 0.2377314, 0, 0, 0.56809366, 0, 0, 0.018785512, 0, 0],
 [0, 0.15745334, 0, 0.035168123, 0.130

### K-Means and Visualisation
After each article is assigned a feature vector consisting of 12 values specifying the portion each article deals with one of the 12 (we tested from 6 to 14 number of topics) respective topic. By that we can imagine every article being located somewhere in the 12-dimensional topic space. By using k-means clustering we would like to find different clusters among the articles. Being in one cluster means that the articles in there make use of a similiar topic mix. We evaluate the quality of the cluster by the "elbow" method using the sum of squared intra-cluster distance of the articles. We plot histograms for each cluster showing how much it consists of Guardian or Telegraph articles. 

In [None]:
class kmeans():

    def __init__(self,feature_vector,image_dir,r_state=42):

        self.feature_vector = feature_vector
        self.r_state = r_state
        self.dir = "cluster_images/"+image_dir+"/"
        guardian = np.ones(998)
        sun = np.zeros(876)
        self.both = np.concatenate([guardian, sun])
        self.centroids=None


    def cluster(self,k):
        '''
        partitions the data into k clusters using the k_means algorithm
        '''
        clustering = KMeans(n_clusters=k,random_state=self.r_state)
        labels = clustering.fit_predict(self.feature_vector)
        self.labels = labels

        self.centroids = clustering.cluster_centers_
        return(labels,clustering.inertia_)

    def visualize_data(self,fname):
        '''
        shows a 2-dimensional scatter plot of data with each color representing the cluster and the
        form of the data point (triange/circle) indicates from which newspaper the article is
        '''
        X_reduced = PCA(n_components=2).fit_transform(self.feature_vector)
        fig = plt.figure()

        ax = fig.add_subplot(111)
        ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=self.labels,marker="^", s=self.both*10, edgecolor="red", linewidth=0.3)
        ax.scatter(X_reduced[:,0], X_reduced[:,1], c=self.labels,marker="o", s=((self.both-1)*-1)*10, edgecolor="black", linewidth=0.3)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        #plt.colorbar(scatter)

        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        plt.savefig(self.dir+fname,dpi=600)
        plt.close()

    def plot_elbow(self,k_range,fname,individ_plot=False):
        '''
        plots the intra-cluster distance of all clustr against different number of cluster. Needs the range of k.
        '''
        distorsions = []
        for k in k_range:
            labels, score = self.cluster(k)
            distorsions.append(score)
            if individ_plot:
                self.visualize_data("cluster_"+str(k))

        plt.figure(figsize=(15, 5))
        plt.xlabel("number of clusters: k")
        plt.ylabel("Sum of squared distances of samples to their closest cluster center.")
        plt.plot(k_range, distorsions)
        plt.grid(True)
        plt.savefig(self.dir+fname,dpi=600)
        plt.close()

    def plot_histogram2(self, fname, k):
        '''
        plots the histogram for the distribution of articles in each cluster
        '''
        grouped = [[] for x in range(k)]
        hist = [[] for x in range(k)]
        # the histogram of the data
        verteilung = list(zip(self.both, self.labels))
        verteilung = sorted(verteilung, key=lambda x: x[1])

        for key, group in groupby(verteilung, lambda x: x[1]):
            for thing in group:
                #print(key, thing)
                grouped[key].append(int(thing[0]))
            hist[key].append(np.histogram(grouped[key],bins=2)[0])

        guardian = []
        telegraph = []
        for cluster in hist:
            guardian.append(cluster[0][0])
            telegraph.append(cluster[0][1])

        ind = np.arange(k)  # the x locations for the groups
        width = 0.35  # the width of the bars

        fig, ax = plt.subplots()
        rects1 = ax.bar(ind, guardian, width, color='b')
        rects2 = ax.bar(ind + width, telegraph, width, color='y')

        # add some text for labels, title and axes ticks
        ax.set_ylabel('Frequency in topic clusters')
        ax.set_title('Distribution of Guardian/Telegraph articles in each cluster')
        ax.set_xticks(ind + width / 2)
        ax.set_xticklabels(range(k))

        ax.legend((rects1[0], rects2[0]), ('The Guardian', 'Telegraph'))

        print(str(self.dir + fname))
        plt.savefig(str(self.dir + fname))
        plt.clf()

more_stopwords = ["Mrs", "Mr"]
stopword = stopwords.words("english") + list(string.punctuation) + more_stopwords

    # print(stopword)
filename1 = "Corpora/filtered/filteredjust_hl_article_guardian_lemmatized.txt"
filename2 = "Corpora/filtered/filteredhl_article_tele_lemmatized.txt"

    # no_below : No words which appear in less than X articles
    # no_above : No words which appear in more than X % of the articles
num_topics = 12
no_above = 0.6
lda = LDA(filename1, filename2, stopword, num_topics=num_topics, no_below=20, no_above=no_above)
corpus_feature_vectors = lda.corpus_feature_vectors
output = lda.final_output

k_means = kmeans(output, "num_topics="+str(num_topics)+"_no_above="+str(no_above).replace(".",""))

#print(k_means.centroids)
k_means.plot_elbow(range(4,17,2), "Elbow plot", individ_plot=True)



a cluster projected onto 2 dimensions using PCA looks like that: 
<img src="files\cluster_images\num_topics=12_no_above=06\cluster_6.png">

### Histograms

In [None]:
with open("cluster_images/top10topics.csv", "a") as top:
    top.write("num_topics=" + str(num_topics) + "_no_above=" + str(no_above).replace(".", "") + ",")
    topics = lda.get_topics(num_words=5)
    for topic_words in topics:
        top.write(str(topic_words) + ",")
        top.write("\n")
        for k in [6]:
            k_means = kmeans(output, "histograms")
            labels,score = k_means.cluster(k)
            k_means.plot_histogram2("num_topics="+str(num_topics)+"_no_above="+str(no_above).replace(".","")+"_k="+str(k),k)



Plotting a histogram showing the distribution of Guardian and Telegraph articles in each cluster yields: 
<img src="files\cluster_images\histograms\num_topics=12_no_above=05_k=8.png">

### Investigating the cluster and their centroids:

## Sentiment Analysis


If articles present the same topics, the manner the topics are presented can still differ. Either a negative, positive or neutral sentiment can be expressed towards a topic. To be able to determine whether that is a the case we apply the Vader sentiment analysis system (Hutto& Gilbert, 2014) implemented in the nltk module to the headlines of the articles. We don't ecpect the system to work well if applied to the whole article, as many different sentiments are expressed in a text, the score would probably be balanced.
Vader is a rule-based sentiment anysis system that was originally applied to user generated content. As headlines are only short texts we expected it work fine with headlines. 

In [18]:
snt = SentimentIntensityAnalyzer()

# the nltk vader sentiment analysis system is used to
# get the polarities of the headline sentences
# returns a list of list with the overall, negative, neutral and positive 
# value per headline
def get_sentiment_score(sentences):
	scores = []
	for sentence in sentences:
		score = snt.polarity_scores(sentence)
		#score_values = [ val for key, val in score.items] not right order
		score_values = []
		score_values.append(str(score["compound"]))
		score_values.append(str(score["neg"]))
		score_values.append(str(score["neu"]))
		score_values.append(str(score["pos"]))
		scores.append(score_values)
	return scores

# the scores are written to the output file 
def write_scores_to_file(out_file, scores):
	with open(out_file, "w") as o:
		o.write("Overall score\tNegative\tNeutral\tPositive\n")
		for score in scores:
			o.write("\t".join(score))
			o.write("\n")

In [21]:
lemmatised_sentences = read_corpus("lemmatised_output_file.txt")
scores = get_sentiment_score(lemmatised_sentences)
print(scores)

[['-0.7313', '0.17', '0.83', '0.0']]


### Comparison of manual and automatic score

We manually assigned scores to the 100 article headlines of the guardian and the telegraph. The manual and the automatic scores are compared in two manners: 1. The polarity itself is compared, it is determined whether it lays in a predefined tolerance range. 2. It is determined whether the scored have the same poalrity i.e. being negative (<0), positive (<0) or neutral (=0). 

In [None]:
# only is the overall score is taken for the
# comparison
# it's rounded to one decimal place
# return a list of overall scores
def get_overall_score(scored_file):
	overall_scores = []
	with open(scored_file, 'r') as f:
		for line in f.readlines():
			if line != "\n":
				overall_scores.append(round(float(line.split("\t")[0]), 1))
	return overall_scores

# the manual scores are one score per line
# returns a list of manual scores
def read_manual_sa_score(manual_sa_file):
	manual_scores = []
	with open(manual_sa_file, 'r') as f:
		for line in f.readlines():
			if line != "\n":
				manual_scores.append(float(line.strip()))
	return manual_scores

# compares the automatic and manual scores
# the ok_range is a tolerance range of how much the scores may differ
# return the amount of right and wrong classified sentences
# and the indices of the wrongly classified sentences
def compare_scores(automatic, manual, ok_range):
	right = 0
	wrong = 0
	wrong_indices = []
	ok_range = float(ok_range)
	for i, (m_score, a_score) in enumerate(zip(automatic, manual)):
		if abs(m_score - a_score) <= ok_range: #and (m_score - a_score) >= 0.0:
			right += 1
		else:
			wrong += 1
			wrong_indices.append(i)
	return right, wrong, wrong_indices

# compares the automatic in manual scores in terms of polarity
# if both score are below, above or equal to 0 the have the same 
# polarity
# returns the amount of headlines with same and different polarity 
# and the indices of wrongly classified sentences
def get_score_neg_pos_neut(automatic, manual):
	same_pol = 0
	diff_pol = 0
	wrong_indices = []
	for i, (m_score, a_score) in enumerate(zip(automatic, manual)):
		if (m_score > 0 and a_score > 0) or  (m_score < 0 and a_score < 0) or (m_score == 0 and a_score == 0):
			same_pol += 1
		else:
			diff_pol += 1
			wrong_indices.append(i)
	return same_pol, diff_pol, wrong_indices