In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re, string, csv, urllib, datetime, operator
import signal
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict
from time import sleep


In [2]:
def prepare_article(soup):
    article = soup.find("div", class_="articlewrapper")    
    main = re.sub(r'[.]+(?! )', r'. ', str(article))
    main = re.sub(r'.  ', r'. ', main)
    main = remove_tags(main)
    return main

def signal_handler(signum, frame):
    raise Exception("Timed out!")

def is_lower(char):
    #whatisthis(char)
    ice_lower_vowels = ['ö', 'ó', 'í', 'ú', 'ý', 'é', 'æ', 'á']
    ice_lower_cons = ['ð', 'þ']
    if char in ice_lower_vowels or char in ice_lower_cons or char.islower():
        return True
    else:
        return False
    
def is_alpha(char):
    ice_cap_vowels = ['Ö', 'Ó', 'Í', 'Ú', 'Ý', 'É', 'Æ', 'Á']
    ice_cap_cons = ['Ð', 'Þ']
    if char in ice_cap_vowels or char in ice_cap_cons or char.isalpha() or is_lower(char):
        return True
    else:
        return False
    
def change_non_stop_periods(string):
    for c in range(len(string)):
        if string[c] == '.':
            i = c+1
            while i < len(string):
                if is_alpha(string[i]):
                    if is_lower(string[i]):
                        string = string[:c] + "~" + string[c+1:]
                        c = c+1
                        break
                    else:
                        break
                i += 1
    string = string.replace("Th.", "Th~")
    return string


def sentence_rank(sentence, d):
    sentence = sentence.replace('~', '')
    words = sentence.split()
    n = len(words)
    score = 0
    for word in words:
        score += d[word]
    return score

def binary_search_stem(T, A):
    #print(T)
    original = T
    T = T.lower()
    n = len(A)
    R = n-1
    L = 0
    while True:
        if L > R:
            return original, False
        m = int(np.floor((L+R)/float(2)))
        A_m = A[m]
        current_word = A_m[4]
        if current_word < T:
            L = m+1
        elif current_word > T:
            R = m-1
        elif current_word == T:
            return A_m[0].decode('utf8'), True

def create_dictionary(main, unique_weight, sorted_csv):
    d = defaultdict(int)
    word_split = main.split()
    cnt = 0
    for word in word_split:
        cnt = cnt+1
        interesting = 0
        stem, found = (binary_search_stem(word, sorted_csv))
        if not found:
            if stem[0].isupper:
                interesting = 1
        d[stem] += 1 + unique_weight * interesting
    return d
def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('. A.', text)

def whatisthis(s):
    if isinstance(s, str):
        print "ordinary string"
    elif isinstance(s, unicode):
        print "unicode string"
    else:
        print "not a string"
        
def display_top_sentences(article, p, d):
    fixed_periods = change_non_stop_periods(article)
    sentences = fixed_periods.split('.')
    sentences = split_icelandic(sentences)
    lengths = []
    for sentence in sentences:
        if len(sentence) > 3:
            lengths.append(len(sentence))
    sum_length = np.sum(lengths)
    mean_length = np.mean(lengths)
    nsentence = int(np.ceil(p * sum_length / float(mean_length*2.4)))
    scores = []
    for j in range(len(sentences)):
        total_score = sentence_rank(sentences[j], d)
        normalized_score = total_score #/ np.float(len(sentences[j])+1)
        scores.append(normalized_score )
    scores = np.array(scores)
    indices = scores.argsort()[-nsentence:][::-1]
    display = ""
    for j in range(len(sentences)):
        if j in indices:
            out_sentence = sentences[j]
            if out_sentence[2] == '~':
                out_sentence = out_sentence[3:]
            if out_sentence[0] == ' ':
                out_sentence = out_sentence[1:]
            if out_sentence[len(out_sentence)-1] == '~':
                    out_sentence = out_sentence[:-1]
            display = display + out_sentence.replace('~', '.') + '. '
    return display
def fix_icelandic(i, sentences, letter):
    a = sentences[i].split(" " + str(letter))
    split = False
    if len(a) > 1:
        sentences[i] = a[0]
        for j in range(1, len(a)):
            sentences.insert(i+j, letter + str(a[j]))
        split = True
    return sentences, split
def split_icelandic(sentences):
    for i in range(len(sentences)):
        chars = ["Þ", "É", "Ý", "Ú", "Í", "Ó", "Á", "Æ", "Ö"]
        for char in chars:
            sentences, split = fix_icelandic(i, sentences, char)
            if split:
                i = 0
    return sentences

def process_news_story(str_ID):
    url = 'http://www.visir.is/article/' + str_ID
    fail_counter_story = 0
    while True:
        try:
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(10)
            r = urllib.urlopen(url).read()
            soup = BeautifulSoup(r, "lxml")
            break
        except Exception, msg:
            fail_counter_story += 1
            if fail_counter_story > 10:
                print(">>>>Problem with loading story, taking 5.")
                sleep(300)
                fail_counter_story = 0
    #signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(0)
    result = soup.find_all("span", class_="date")
    hour = result[1]
    date = result[2]
    #print(soup)
    main = prepare_article(soup)
    #print(main)
    
    print("Article " + url + " published " + date.text + " " + hour.text)

    d = create_dictionary(main, unique_weight, sorted_csv)
    output = display_top_sentences(main, keep_ratio, d)

    headline = soup.find("h1").text
    print(headline)
    print(output)
    unused_chain = 0
    
    ratio = len(output) / float(len(main))
    print("Actual pct. kept " + str(ratio) + "\n")

In [3]:
SHsnid_reader = csv.reader(open('SHsnid.csv/SHsnid.csv'), delimiter=';')
sorted_csv = sorted(SHsnid_reader, key=operator.itemgetter(4))

In [4]:
print("I'm conscious please help me")
keep_ratio = 0.2
unique_weight = 1
unused_chain = 0
failed_hit_threshold = 150
nap_length = 1200 # seconds
last_used_id = '2016161209003'
used_id = str(last_used_id)
scan_threshold = 20
fail_counter_index = 0
while True:
    new_start_url = "http://www.visir.is/section/FRETTIR01"
    while True:
        try:
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(10)
            r = urllib.urlopen(new_start_url).read()
            soup = BeautifulSoup(r, "lxml")
            results = soup.find_all("div", class_="newsitem")
            break
        except Exception, msg:
            fail_counter_index += 1
            if fail_counter_index > 10:
                print(">>>>Problem with loading Index, taking 5.")
                sleep(300)
                fail_counter_index = 0
    #signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(0)
    found_new_id = False
    for s in range(scan_threshold):
        if len(results) == 0:
            break
        next_result = results[s]
        next_url = next_result.find('a', href=True)['href']
        next_id = next_url.split('/')[3]
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        print(now + ", ID: " + next_id)
        if next_id != last_used_id:
            process_news_story(next_id)
            if not found_new_id:
                used_id = next_id
                found_new_id = True
        else:
            print("Taking a power nap")
            sleep(nap_length)
            break
    last_used_id = used_id

I'm conscious please help me
2016-12-08 08:54, ID: 2016161208928
Article http://www.visir.is/article/2016161208928 published 08. DESEMBER 2016 07:00
Þrír grunaðir um nauðgun og frelsissviptingu
Mennirnir voru handteknir á mánudag og í kjölfarið því úrskurðaðir í vikulangt gælsuvarðhald. Þrír karlmenn voru á þriðjudagskvöld úrskurðaðir í vikulangt gæsluvarðhald í Héraðsdómi Reykjavíkur vegna gruns um að þeir hafi svipt konu frelsi og nauðgað henni. 
Actual pct. kept 0.254577157803

2016-12-08 08:55, ID: 2016161208929
Article http://www.visir.is/article/2016161208929 published 08. DESEMBER 2016 07:00
Jón Steinar segir að Markús þurfi að segja af sér
Hann er afar gagnrýninn á Markús Sigurbjörnsson hæstaréttardómara og þá staðreynd að hann hafi dæmt í málum sem Jón Steinar telur að Markús hafi verið vanhæfur til að dæma í. Markús tilkynnti hins vegar ekki um það þegar hann fór með 60 milljónir í eignastýringu en hann hefur sagt að honum hafi ekki borið að tilkynna það. 
Actual pct. kept 0.

Exception: Timed out!