# Automatic Political Stance Detection

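This notebook walks through a pipeline for automatic political stance detection on newspaper articles: the raw articles are cleaned, tokenized, lemmatized and stripped of stop words, then modelled with LDA topics and K-Means clustering, and finally scored with a sentiment analyser. (TODO: expand this introduction.)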

![Pipeline](images/pipeline_ANLP_project.png)

In [2]:
%%html
<script>
  // Toggle the visibility of all `div.input` elements (presumably the notebook's
  // code-input areas -- confirm against the rendered page) and update the caption
  // of the #toggleButton control to match.
  // Reads and flips the flag `code_shown`; it is not declared with var/let here and
  // is first assigned in the document-ready handler below.
  function code_toggle() {
    if (code_shown){
      // Code is currently shown: hide it and offer to show it again.
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      // Code is currently hidden: show it and offer to hide it again.
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    // Remember the new state for the next click.
    code_shown = !code_shown
  }

  // Once the document is ready, start with every `div.input` hidden and record
  // that state in `code_shown` (assigned without a declaration).
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>

In [1]:
# All imports
import re, string, gensim, os
from nltk.tokenize import word_tokenize
from subprocess import Popen, PIPE, STDOUT
from nltk.corpus import stopwords
from gensim import corpora, models
from pprint import pprint
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import numpy as np
from itertools import groupby
from tokenize_articles import read_corpus
from nltk.sentiment.vader import SentimentIntensityAnalyzer

ModuleNotFoundError: No module named 'gensim'

## Preprocessing

### Cleaning

In [1]:
def extract_headlines(cleaned_article, output_file_hl):
    '''Append the headline part of a single article to a text file.

    A trailing "BYLINE:...", "SECTION:..." or "BODY: ..." segment that runs to the
    end of the string is cut off; what is left is stripped of surrounding
    whitespace and appended to ``output_file_hl`` as one line.
    Used in conjunction with the function clean_corpora.'''

    trailing_metadata = r"(BYLINE:.*|SECTION:.*|BODY: .*)$"

    with open(output_file_hl, "a") as headline_sink:
        headline = re.sub(trailing_metadata, "", cleaned_article).strip()
        headline_sink.write(headline + "\n")
        
def clean_corpora(input_file, output_file, output_file_headlines=None, guardian=False, sun=False, telegraph=False, guardian_hl=False,
                  sun_hl=False, telegraph_hl=False):
    '''Clean a raw LexisNexis export and append one article per line to ``output_file``.

    The raw text is split on the marker "All Rights Reserved"; each piece is
    flattened to a single line, backslashes are dropped and BYLINE/SECTION/LOAD-DATE
    metadata is removed. Due to subtle formatting differences between the
    newspapers, the remaining clean-up depends on which newspaper flag is set.

    Parameters:
        input_file: path of the raw export to read.
        output_file: path the cleaned articles are appended to (mode "a").
        output_file_headlines: path the headlines are appended to; required when
            any of the ``*_hl`` flags is set.
        guardian, sun, telegraph: select the newspaper-specific clean-up.
        guardian_hl, sun_hl, telegraph_hl: additionally extract the headline of
            each article via ``extract_headlines`` (only evaluated inside the
            matching newspaper branch).

    Raises:
        ValueError: a headline flag is set but ``output_file_headlines`` is None.
    '''

    if (guardian_hl or sun_hl or telegraph_hl) and output_file_headlines is None:
        raise ValueError("output_file_headlines is required when a *_hl flag is set")

    # Read through a context manager so the input handle is closed again.
    with open(input_file) as raw_input:
        text = raw_input.read()
    split_text = text.strip().split("All Rights Reserved")

    with open(output_file, "a") as output:
        for un_article in split_text:
            un_article1 = un_article.replace("\n", " ")
            article3 = un_article1.replace("\\", "")
            article2 = re.sub(r"(BYLINE: .* [0-9]* words|SECTION: .* LETTER|SECTION: .* words)", "", article3)
            article = re.sub(r"(LOAD-DATE:.*|LOAD-DATE)$", "", article2)

            if guardian:
                article_g = re.sub(r"^(.*GMT)", "", article)
                print(article_g.strip(), file=output)

                if guardian_hl:
                    # Headlines are cut from the text that still contains BYLINE/SECTION.
                    article_g1 = re.sub(r"^(.*GMT)", "", article3)
                    extract_headlines(article_g1, output_file_headlines)

            if sun:
                article_s = re.sub(r"^(.*National Edition|.*Edition [0-9]*;)", "", article)
                article_s = re.sub(r"([a-zA-Z.]*@the[-]*sun\.co\.uk.*)$", "", article_s)
                print(article_s.strip(), file=output)

                if sun_hl:
                    article_s1 = re.sub(r"^(.*National Edition|.*Edition [0-9]*;)", "", article3)
                    extract_headlines(article_s1, output_file_headlines)

            if telegraph:
                article_t1 = re.sub(r"^(.*National Edition|.*Edition [0-9]*;|Scotland)", "", article)
                article_t = re.sub(r"(Copyright [0-9]+ Telegraph Media Group Limited|BYLINE: .* BODY:|[0-9]* of [0-9]* DOCUMENTS|HEADLINE:)", "", article_t1)
                # Previously the cleaned Telegraph article was computed but never
                # written; write it like the Guardian and Sun branches do.
                print(article_t.strip(), file=output)

                if telegraph_hl:
                    article_t1 = re.sub(r"(HEADLINE:|Scotland)", "", article_t1)
                    extract_headlines(article_t1, output_file_headlines)

### Tokenization

In [2]:
def read_corpus(input_file):
    """Return the lines of ``input_file`` with surrounding whitespace stripped.

    Lines consisting of a bare newline are skipped; every other line is kept,
    even if it is empty after stripping.
    """
    with open(input_file) as handle:
        return [raw_line.strip() for raw_line in handle if raw_line != '\n']

def tokenize_corpus(articles):
    """Tokenize each string in ``articles`` with ``word_tokenize``.

    Returns a list holding one token list per input string, in input order.
    """
    return [word_tokenize(text) for text in articles]

def write_to_file(tokenized_articles, output_file):
    """Write every token list as one space-joined line to ``output_file``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(" ".join(tokens) + "\n" for tokens in tokenized_articles)

### Lemmatization

In [3]:
def get_sentences(input_file):
    """Read ``input_file`` and return its lines, stripped of surrounding whitespace.

    Lines that are exactly a newline character are left out.
    """
    with open(input_file) as source:
        non_blank = filter(lambda raw_line: raw_line != "\n", source)
        # Materialise the result while the file is still open.
        return list(map(str.strip, non_blank))

def annotate_articles(sentences, tagger_cmd='/Applications/Treetagger/cmd/tree-tagger-english'):
    """Run an external tagger on every text and collect its annotations.

    Each text is sent to the tagger process on stdin. Every output line that
    consists of exactly three tab-separated fields is turned into a tuple
    ``(field0, (field1, field2))``; other lines are ignored.

    Parameters:
        sentences: iterable of strings, one tagger call per string.
        tagger_cmd: the tagger executable, either a path string or an argument
            list. Defaults to the previously hard-coded TreeTagger location so
            existing calls keep working.

    Returns:
        A list with one list of annotation tuples per input string.

    Note: the text is passed to the tagger unchanged. The earlier version called
    ``sentence.lower()`` without using the result, which had no effect; that
    statement was dropped so the tagger input stays exactly as before.
    """
    command = [tagger_cmd] if isinstance(tagger_cmd, str) else list(tagger_cmd)

    annotated_articles = []
    for sentence in sentences:
        with Popen(command, stdout=PIPE, stdin=PIPE, stderr=PIPE, encoding="utf8") as process:
            out = process.communicate(input=sentence)[0]
        article = []
        for output_line in out.split("\n"):
            fields = output_line.split("\t")
            if len(fields) == 3:
                article.append((fields[0], (fields[1], fields[2])))
        annotated_articles.append(article)
    return annotated_articles

def get_lemma(articles):
    """Reduce annotated articles to lists of lemmas.

    Every article is a sequence of ``(word, annotation)`` pairs; the second
    element of the annotation is used as the lemma. When that element is the
    string "<unknown>", the original word is kept instead.
    """
    return [
        [word if anno[1] == "<unknown>" else anno[1] for word, anno in article]
        for article in articles
    ]

def write_to_file(lemma_articles, out_file):
    """Write each article's lemmas as a single space-separated line to ``out_file``.

    Existing content of ``out_file`` is overwritten.
    """
    with open(out_file, "w") as out:
        for lemmas in lemma_articles:
            print(" ".join(lemmas), file=out)

### Stop Word Removal

In [2]:
def remove_stop_words(processed_file, output_file):
    '''Filter stop words and punctuation out of a lemmatized, tokenized file.

    Each input line is split on whitespace. A single leading apostrophe is
    removed from every word; the word is then written to ``output_file``
    (followed by a space) unless its lower-cased form is an NLTK English stop
    word, a punctuation character or one of the extra tokens listed below.
    One output line is written per input line.

    Parameters:
        processed_file: path of the lemmatized, tokenized input file.
        output_file: path of the filtered file (overwritten).
    '''

    to_delete = ["brexit", "``", "''", "'s", "·", "wo", "n't", "...", "@card@"]

    # A set keeps the per-word membership test cheap.
    stop_words = set(stopwords.words('english'))
    stop_words.update(to_delete)
    stop_words.update(string.punctuation)

    leading_apostrophe = re.compile(r"^(\')")

    # Both files are managed by the with-statement, so the input handle is
    # closed as well.
    with open(output_file, "w") as output, open(processed_file) as source:
        for line in source:
            for word in line.strip().split():
                word = leading_apostrophe.sub("", word)  # getting rid of the ' at the beginning of some words
                # Skip tokens that are empty after stripping (a lone apostrophe),
                # which would otherwise leave a stray space in the output.
                if word and word.lower() not in stop_words:
                    output.write(word)
                    output.write(" ")
            output.write("\n")  # line break after each article/hl

## LDA

In [5]:
class LDA():
    '''
    Latent Dirichlet Allocation over two article files.

    Creating an instance reads both files, builds a gensim dictionary and
    bag-of-words corpus, loads a previously saved model from "LDA_Models/" or
    trains and saves a new one, and finally computes a topic distribution vector
    for every article.

    Note that the saved-model file name only contains ``num_topics`` and
    ``no_above``; a model trained with different files, stop words or
    ``no_below`` is reused if it has the same name.

    Attributes set by the constructor:
        num_topics, corpus, model_corpus, dictionary, model,
        corpus_feature_vectors (sparse output of the model per article) and
        final_output (dense vectors of length ``num_topics``).
    '''
    def __init__(self, filename1, filename2, stopwords, num_topics=10, no_below=20, no_above=0.5):
        self.num_topics = num_topics
        self.corpus = self.createCorpus(filename1, filename2, stopwords)
        self.model_corpus, self.dictionary = self.createModelCorpus(no_below, no_above)

        model_path = "LDA_Models/ldamodel_topics="+str(num_topics)+"_no_above="+str(no_above)
        try:
            self.model = gensim.models.LdaModel.load(model_path)
        except Exception:
            # Any failure to load falls back to training, as before, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            self.model = self.train_lda()
            # Make sure the target folder exists before saving.
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            self.model.save(model_path)

        self.corpus_feature_vectors = self.apply_lda()
        self.final_output = self.createFinalOutput()

    def createCorpus(self, filename1, filename2, stopwords):
        '''Read both files (one article per line) and return a list of token lists.

        Lines are stripped and split on single spaces; tokens contained in
        ``stopwords`` are dropped (exact, case-sensitive match).
        '''
        with open(filename1, encoding="utf-8") as f1, open(filename2, encoding="utf-8") as f2:
            all_articles = f1.readlines()
            all_articles.extend(f2.readlines())
        return [[token for token in article.strip().split(" ") if token not in stopwords]
                for article in all_articles]

    def createModelCorpus(self, no_below, no_above):
        '''Build the gensim dictionary and the bag-of-words corpus for the model.

        ``no_below`` and ``no_above`` are passed on to ``Dictionary.filter_extremes``.
        Returns ``(model_corpus, dictionary)``.
        '''
        dictionary = corpora.Dictionary(self.corpus)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        model_corpus = [dictionary.doc2bow(tokens) for tokens in self.corpus]
        return model_corpus, dictionary

    def train_lda(self):
        '''Train and return an LDA model on the whole bag-of-words corpus (10 passes).'''
        return gensim.models.ldamodel.LdaModel(self.model_corpus, num_topics=self.num_topics,
                                               id2word=self.dictionary, passes=10)

    def apply_lda(self):
        '''Apply the model to every article and return the resulting feature vectors.'''
        return [self.model[article] for article in self.model_corpus]

    def createFinalOutput(self):
        '''Turn the sparse (topic, weight) vectors into dense lists.

        Every output vector has exactly ``self.num_topics`` entries; topics that
        are missing from an article's sparse vector get the value 0. (The length
        used to be fixed at 10 regardless of the number of topics.)
        '''
        output = []
        for vec in self.corpus_feature_vectors:
            vec_dic = dict(vec)
            output.append([vec_dic.get(topic, 0) for topic in range(self.num_topics)])
        return output

    def get_topics(self, num_topics=7, num_words=3):
        '''Return the model's topics via ``show_topics``.

        The defaults (7 topics, 3 words each) match the values that were
        previously fixed inside this method.
        '''
        return self.model.show_topics(num_topics=num_topics, num_words=num_words)

# Stop-word list handed to LDA: NLTK's English stop words plus single punctuation characters.
stopword = stopwords.words("english") + list(string.punctuation)

# The two input files that LDA reads and concatenates (filename1 first).
filename1 = "Corpora/filtered/filteredjust_hl_article_guardian_tokenized.txt"
filename2 = "Corpora/filtered/filteredjust_hl_article_tokenized.txt"


# Grid search: one LDA model and one set of cluster plots per
# (num_topics, no_above) combination.
# no_below : drop words which appear in fewer than X articles
# no_above : drop words which appear in more than this share of the articles
#            (the values passed below are fractions between 0.3 and 0.8)
# NOTE(review): `kmeans` is only defined in a later cell of this notebook, so on a
# fresh kernel this cell raises a NameError unless that cell has been run first.
if __name__ == '__main__':
    for num_topics in range(3,8):
        for no_above in [0.3,0.4,0.5,0.6,0.7,0.8]:
            lda = LDA(filename1, filename2 ,stopword, num_topics=num_topics, no_below=20, no_above=no_above)

            corpus_feature_vectors = lda.corpus_feature_vectors
            output = lda.final_output

            #pprint(output[:2])

            # Images go to a folder named after the parameter combination,
            # e.g. "3_0.3"; k from 2 to 9 is clustered and plotted.
            k_means = kmeans(output,str(num_topics)+"_"+str(no_above))
            k_means.plot_elbow(range(2, 10), "Elbow_plot", individ_plot=True)

ModuleNotFoundError: No module named 'gensim'

### K-Means and Visualisation

In [6]:
class kmeans():
    '''
    K-Means clustering of article feature vectors plus plotting helpers.

    The articles are expected in the order "all Guardian articles, then all Sun
    articles". ``self.both`` holds one flag per article: 1 for Guardian, 0 for
    Sun. All images are written below "cluster_images/<image_dir>/".

    Parameters:
        feature_vector: one feature vector per article.
        image_dir: sub-folder name for the generated images.
        r_state: random state passed to KMeans.
        n_guardian, n_sun: number of Guardian / Sun articles; the defaults are
            the corpus sizes that were previously hard-coded.
    '''

    def __init__(self,feature_vector,image_dir,r_state=42,n_guardian=998,n_sun=876):

        self.feature_vector = feature_vector
        self.r_state = r_state
        self.dir = "cluster_images/"+image_dir+"/"
        guardian = np.ones(n_guardian)
        sun = np.zeros(n_sun)
        self.both = np.concatenate([guardian, sun])

    def _ensure_dir(self):
        '''Create the image folder if it does not exist yet.'''
        os.makedirs(self.dir, exist_ok=True)

    def cluster(self,k):
        '''Cluster the feature vectors into ``k`` groups.

        Stores the labels and centroids on the instance and returns
        ``(labels, inertia)``.
        '''
        clustering = KMeans(n_clusters=k,random_state=self.r_state)
        labels = clustering.fit_predict(self.feature_vector)
        self.labels = labels
        self.centroids = clustering.cluster_centers_
        return(labels,clustering.inertia_)

    def source_counts(self, k):
        '''Count Guardian and Sun articles per cluster.

        Requires ``cluster`` to have been called. Returns two lists of length
        ``k``: ``(guardian_counts, sun_counts)``. Clusters without any article
        get a count of 0.
        '''
        labels = np.asarray(self.labels)
        # Only compare positions present in both arrays.
        n = min(len(labels), len(self.both))
        labels = labels[:n]
        is_guardian = self.both[:n] == 1
        guardian = [int(np.sum(is_guardian & (labels == c))) for c in range(k)]
        sun = [int(np.sum(~is_guardian & (labels == c))) for c in range(k)]
        return guardian, sun

    def visualize_data(self,fname):
        '''Save a 2-D PCA scatter plot of the current clustering as ``fname``.

        Requires ``cluster`` to have been called. Guardian articles are drawn as
        triangles, Sun articles as circles; the other group gets marker size 0
        in each call so that it is invisible.
        '''
        X_reduced = PCA(n_components=2).fit_transform(self.feature_vector)
        fig = plt.figure()

        ax = fig.add_subplot(111)
        # `self.both` (previously the undefined name `both`) selects the marker sizes.
        ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=self.labels,marker="^", s=self.both*10, edgecolor="red", linewidth=0.3)
        ax.scatter(X_reduced[:,0], X_reduced[:,1], c=self.labels,marker="o", s=((self.both-1)*-1)*10, edgecolor="black", linewidth=0.3)
        ax.set_xlabel('x')
        ax.set_ylabel('y')

        self._ensure_dir()
        plt.savefig(self.dir+fname,dpi=600)
        plt.close()

    def plot_elbow(self,k_range,fname,individ_plot=False):
        '''Cluster for every k in ``k_range`` and save the inertia curve as ``fname``.

        With ``individ_plot=True`` a scatter plot "cluster_<k>" is saved for
        every k as well.
        '''
        distorsions = []
        for k in k_range:
            labels, score = self.cluster(k)
            distorsions.append(score)
            if individ_plot:
                self.visualize_data("cluster_"+str(k))

        plt.figure(figsize=(15, 5))
        plt.xlabel("number of clusters: k")
        plt.ylabel("Sum of squared distances of samples to their closest cluster center.")
        plt.plot(k_range, distorsions)
        plt.grid(True)
        # The folder may not exist yet when no individual plots were written.
        self._ensure_dir()
        plt.savefig(self.dir+fname,dpi=600)
        plt.close()

    def plot_histogram(self, fname, k):
        '''Save a normalised histogram of the source flags per cluster.

        NOTE(review): this method was marked as not working by its author;
        ``plot_histogram2`` is the variant used elsewhere in the notebook.
        '''
        labels = np.asarray(self.labels)
        n = min(len(labels), len(self.both))
        # One list of 0/1 source flags per cluster.
        grouped = [self.both[:n][labels[:n] == c].astype(int).tolist() for c in range(k)]

        # `density` replaces the `normed` argument, which current Matplotlib
        # versions no longer accept.
        plt.hist(grouped, k, histtype='bar', density=True, alpha=0.75)
        plt.tight_layout()
        self._ensure_dir()
        plt.savefig(self.dir+fname+str(k))
        plt.clf()

    def plot_histogram2(self, fname, k):
        '''Save a grouped bar chart with the Guardian/Sun article counts per cluster.

        Requires ``cluster`` to have been called with the same ``k``.
        '''
        # Counting directly avoids a two-bin histogram, which put the counts
        # into the wrong bin for clusters that contain only one source and
        # failed for empty clusters.
        guardian, sun = self.source_counts(k)

        ind = np.arange(k)  # the x locations for the groups
        width = 0.35  # the width of the bars

        fig, ax = plt.subplots()
        rects1 = ax.bar(ind, guardian, width, color='b')
        rects2 = ax.bar(ind + width, sun, width, color='y')

        # add some text for labels, title and axes ticks
        ax.set_ylabel('Frequency in topic clusters')
        ax.set_title('Distribution of Guardian/Sun articles in each cluster')
        ax.set_xticks(ind + width / 2)
        ax.set_xticklabels(range(k))

        ax.legend((rects1[0], rects2[0]), ('The Guardian', 'Sun'))
        self._ensure_dir()
        fig.savefig(self.dir + fname + str(k))
        plt.close(fig)

ModuleNotFoundError: No module named 'sklearn'

### Histograms

In [7]:
# Extra stop words for this run. LDA.createCorpus compares tokens exactly, so
# only these capitalised spellings are removed.
more_stopwords = ["Mrs", "says", "Mr"]
stopword = stopwords.words("english") + list(string.punctuation) + more_stopwords
#print(stopword)
# Same two input files as in the LDA cell above.
filename1 = "Corpora/filtered/filteredjust_hl_article_guardian_tokenized.txt"
filename2 = "Corpora/filtered/filteredjust_hl_article_tokenized.txt"


# Single model with fixed parameters.
# no_below : drop words which appear in fewer than X articles
# no_above : drop words which appear in more than this share of the articles
lda = LDA(filename1, filename2, stopword, num_topics=7, no_below=20, no_above=0.5)
corpus_feature_vectors = lda.corpus_feature_vectors
output = lda.final_output

# Disabled: per-cluster Guardian/Sun bar charts for k = 3..7.
#for k in [3,4,5,6,7]:
#    k_means = kmeans(output, "histograms")
#    labels,score = k_means.cluster(k)
#    k_means.plot_histogram2("histogram",k)

# Show the topics of the model.
print(lda.get_topics())


ModuleNotFoundError: No module named 'sklearn'

## Sentiment Analysis

In [8]:
# Shared analyser instance (imported from nltk.sentiment.vader); used by
# get_sentiment_score below.
snt = SentimentIntensityAnalyzer()

# The polarity result is a dict; its "compound", "neg", "neu" and "pos" entries are used.
def get_sentiment_score(sentences):
    """Score every text with the module-level analyser ``snt``.

    Returns one list per text holding the "compound", "neg", "neu" and "pos"
    values, in that fixed order, each converted to a string.
    """
    ordered_keys = ("compound", "neg", "neu", "pos")
    all_scores = []
    for text in sentences:
        polarity = snt.polarity_scores(text)
        # Pick the values by key so the column order does not depend on the dict.
        all_scores.append([str(polarity[key]) for key in ordered_keys])
    return all_scores

def write_scores_to_file(out_file, scores):
    """Write a header line followed by one tab-separated line per score list.

    ``scores`` must contain sequences of strings. ``out_file`` is overwritten.
    """
    header = "Overall score\tNegative\tNeutral\tPositive\n"
    with open(out_file, "w") as sink:
        sink.write(header)
        sink.writelines("\t".join(row) + "\n" for row in scores)

### Comparison of manual and automatic score

In [9]:

def get_overall_score(scored_file):
    """Read the overall score (first tab-separated column) of every line.

    Lines that are a bare newline are skipped. If the first non-blank line does
    not start with a number, it is treated as a header and skipped; this is the
    case for files produced by ``write_scores_to_file``, whose first line is a
    column header. Any later non-numeric value still raises ``ValueError``.

    Returns:
        A list of floats rounded to one decimal place.
    """
    overall_scores = []
    first_content_line = True
    with open(scored_file, 'r') as f:
        for line in f:
            if line == "\n":
                continue
            first_field = line.split("\t")[0]
            try:
                value = float(first_field)
            except ValueError:
                if first_content_line:
                    # Header row: nothing to convert.
                    first_content_line = False
                    continue
                raise
            first_content_line = False
            overall_scores.append(round(value, 1))
    return overall_scores

def read_manual_sa_score(manual_sa_file):
    """Read one manually assigned score per line and return them as floats.

    Lines that are exactly a newline character are skipped.
    """
    with open(manual_sa_file, 'r') as handle:
        return [float(raw_line.strip()) for raw_line in handle if raw_line != "\n"]

def compare_scores(automatic, manual, ok_range):
    """Compare automatic and manual scores pairwise.

    A pair counts as right when the absolute difference between the two scores
    is at most ``ok_range``. (The difference used to be compared with its sign,
    so an automatic score far below the manual one was always accepted.)

    Parameters:
        automatic: sequence of automatically computed scores.
        manual: sequence of manually assigned scores, in the same order.
        ok_range: tolerated absolute difference; converted with ``float``.

    Returns:
        ``(right, wrong, wrong_indices)`` where ``wrong_indices`` lists the
        positions of the pairs outside the tolerated range. Surplus elements of
        the longer sequence are ignored.
    """
    ok_range = float(ok_range)
    right = 0
    wrong_indices = []
    for i, (a_score, m_score) in enumerate(zip(automatic, manual)):
        if abs(a_score - m_score) <= ok_range:
            right += 1
        else:
            wrong_indices.append(i)
    return right, len(wrong_indices), wrong_indices