In [5]:
import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString

In [8]:
def process_post_bodies(bodies):
    """
    Extract the visible text and the cited names from each post body.

    For every element of ``bodies`` the direct children are walked in order.
    Children whose tag name is ``div`` or ``p`` are skipped; the text of every
    other child is concatenated into a single string.

    :param bodies: iterable of BeautifulSoup tags (one per post body)
    :return: generator yielding one ``(text, cites)`` tuple per body, where
        ``cites`` is the list of ``name`` attributes of the ``div`` elements
        with class ``cite`` returned by ``body.findAll``
    :raises KeyError: if a cite ``div`` has no ``name`` attribute
    """
    for body in bodies:
        cites = [c['name'] for c in body.findAll('div', {'class': 'cite'})]

        collect_text = []
        for tag in body:
            if tag.name in ('div', 'p'):
                continue
            # Check for plain strings first and use exactly one branch per
            # child: on bs4 versions where NavigableString also exposes
            # ``.text``, two independent checks would append the same string
            # twice.
            if isinstance(tag, NavigableString):
                collect_text.append(str(tag))
            elif hasattr(tag, 'text'):
                collect_text.append(tag.text)

        yield ''.join(collect_text), cites


class NW(object):
    """
    Minimal scraping client for the netwars.pl forum.

    Anonymous use (no credentials) is enough for ``topic_to_json`` and
    ``home_page_status``; ``login``/``logout`` additionally keep a
    ``requests`` session in ``self.nwsession``.
    """
    BASE_URL = 'http://netwars.pl/'
    BASE_URL_TOPIC = 'http://netwars.pl/temat/{!s}'
    OT_FORUM_NUMBER = '4'

    def __init__(self, username=None, password=None):
        """
        :param username: forum nick used by ``login`` (optional)
        :param password: forum password used by ``login`` (optional)
        """
        self.username = username
        self.password = password
        self.logged_in = False
        # Created by login(); kept as None until then so logout() can tell
        # whether there is a session to use.
        self.nwsession = None

    def __repr__(self):
        # Use the bare class name so the repr reads NW('nick') rather than
        # <class '...NW'>('nick').
        return '%s(%r)' % (self.__class__.__name__, self.username)

    @staticmethod
    def _url_to_soup(url):
        """Download ``url`` and parse the response text with the lxml parser."""
        return BeautifulSoup(requests.get(url).text, 'lxml')

    @staticmethod
    def _topic_differences(old, new):
        """
        Return topics that changed between two scrapes.

        :param old: mapping of topic id -> post count from the earlier scrape
        :param new: mapping of topic id -> post count from the later scrape
        :return: keys view of the topics in ``new`` whose (id, count) pair is
            not present in ``old`` (new topics and topics with a new count)
        """
        changed_items = set(new.items()) - set(old.items())
        return dict(changed_items).keys()

    @staticmethod
    def _live_user_differences(old, new):
        """
        Compare two collections of active users, ignoring order and duplicates.

        NOTE: despite the name, this does not return the differing users; it
        returns a boolean equality check.

        :param old: users from the earlier scrape
        :param new: users from the later scrape
        :return: ``True`` when both contain the same set of users, else ``False``
        """
        return set(old) == set(new)

    @staticmethod
    def _topic_soup_to_json(soup):
        """
        Convert a parsed topic page into a list of post dicts plus topic meta.

        :param soup: BeautifulSoup document of a topic page
        :return: ``(posts_list, topic_meta)``; every post dict has the keys
            ``topic_id``, ``forum_id``, ``post_id``, ``post_date``,
            ``user_href``, ``user_name``, ``post_body`` and ``cites``;
            ``topic_meta`` has ``forum_id``, ``topic_name`` and ``topic_id``
        :raises ValueError: if the page text contains 'Nie znaleziono'
        :raises IndexError: if no ``div`` with a ``topic_`` id or no
            ``forum_navi`` list is present
        """
        if 'Nie znaleziono' in soup.text:
            raise ValueError('Topic does not exist')

        # The numeric topic id is the suffix of the first div id that
        # contains 'topic_'.
        topic_div_ids = [
            div['id'] for div in soup.findAll('div')
            if 'topic_' in div.get('id', 'not-relevant')
        ]
        topic_id = int(topic_div_ids[0].split('_')[-1])

        navi_list = soup.findAll('ul', {'class': 'forum_navi'})[0].findAll('li')
        # forum_id holds the href of the second navigation entry, not a bare number.
        forum_id = navi_list[1].a['href']
        topic_name = navi_list[2].text

        post_divs = soup.findAll('div', {'class': 'post'})
        date_divs = soup.findAll('div', {'class': 'p2_data'})
        nick_divs = soup.findAll('div', {'class': 'p2_nick'})
        body_divs = soup.findAll('div', {'class': 'post_body'})

        # Lazy generators: the columns are consumed in lock-step by zip()
        # below, which stops at the shortest one.
        post_ids = (post['id'].split('_')[-1] for post in post_divs)
        dates = (date.text for date in date_divs)
        user_hrefs = (nick.a['href'] for nick in nick_divs)
        user_names = (nick.a.text for nick in nick_divs)
        post_bodies = process_post_bodies(body_divs)

        posts_list = [
            {
                'topic_id': topic_id,
                'forum_id': forum_id,
                'post_id': pid,
                'post_date': pdate,
                'user_href': href,
                'user_name': uname,
                'post_body': body,
                'cites': cites
            } for pid, pdate, href, uname, (body, cites) in
            zip(post_ids, dates, user_hrefs, user_names, post_bodies)
        ]

        topic_meta = {
            'forum_id': forum_id,
            'topic_name': topic_name,
            'topic_id': topic_id
        }

        return posts_list, topic_meta

    def login(self):
        """
        Open a session and post the stored credentials to ``BASE_URL + 'login'``.

        The session is stored in ``self.nwsession`` and ``self.logged_in`` is
        set to ``True``. NOTE: the server response is not inspected, so the
        flag only records that the request was sent.

        :raises ValueError: if the username or the password is missing/empty
        """
        if not self.username or not self.password:
            raise ValueError('No cred')

        payload = {
            'tnick': self.username,
            'tpass': self.password
        }

        nwsession = requests.session()
        nwsession.post(urljoin(self.BASE_URL, 'login'), payload)
        self.nwsession = nwsession
        self.logged_in = True

    def logout(self):
        """
        Post to ``BASE_URL + 'logout'`` on the stored session.

        Does nothing except clearing ``logged_in`` when ``login`` was never
        called, instead of failing on the missing session.
        """
        session = getattr(self, 'nwsession', None)
        if session is not None:
            session.post(urljoin(self.BASE_URL, 'logout'))
        self.logged_in = False

    def topic_to_json(self, topic_number):
        """
        Download a topic page and convert it with ``_topic_soup_to_json``.

        :param topic_number: value formatted into ``BASE_URL_TOPIC``
        :return: ``(posts_list, topic_meta)``
        """
        soup = self._url_to_soup(self.BASE_URL_TOPIC.format(topic_number))
        return self._topic_soup_to_json(soup)

    @staticmethod
    def _list_of_active_users(base_soup):
        """
        Return the hrefs of the links in the page footer, minus the first two.

        :param base_soup: BeautifulSoup document of the home page
        :return: list of href strings
        """
        footer = base_soup.findAll('div', attrs={'id': 'footer'})[0]
        # The first two footer links are skipped -- presumably they are not
        # user links; confirm against the live page.
        return [a['href'] for a in footer.findAll('a')][2:]

    @staticmethod
    def _topics_and_post_number(base_soup):
        """
        Map every topic id listed on the page to its post count.

        :param base_soup: BeautifulSoup document of the home page
        :return: dict of ``int`` topic id -> ``int`` number of posts, pairing
            the ``td.topic`` and ``td.posts`` cells in page order
        """
        topic_ids = [
            int(cell.a['href'].split('/')[-1])
            for cell in base_soup.findAll('td', {'class': 'topic'})
        ]
        # Read the counts from the soup that was passed in, not from a global.
        number_of_posts = [
            int(cell.text)
            for cell in base_soup.findAll('td', {'class': 'posts'})
        ]
        return dict(zip(topic_ids, number_of_posts))

    def home_page_status(self):
        """
        Scrape the home page.

        :return: ``(topics, users)`` -- the topic id -> post count mapping and
            the list of footer user hrefs
        """
        soup = self._url_to_soup(self.BASE_URL)
        topics = self._topics_and_post_number(soup)
        users = self._list_of_active_users(soup)
        return topics, users

In [9]:
# from pytz import utc
# from apscheduler.schedulers.background import BackgroundScheduler
# from apscheduler.schedulers.blocking import BlockingScheduler
# from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
# from apscheduler.jobstores.redis import RedisJobStore
# from apscheduler.executors.pool import ProcessPoolExecutor

# jobstores = {
# #     'mongo': {'type': 'mongodb'},
# #     'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
#     'default': RedisJobStore()
# }
# executors = {
#     'default': {'type': 'threadpool', 'max_workers': 20},
#     'processpool': ProcessPoolExecutor(max_workers=5)
# }

# job_defaults = {
#     'coalesce': False,
#     'max_instances': 3
# }
# # scheduler = BlockingScheduler()
# scheduler = BackgroundScheduler(jobstores=jobstores, executors=executors, job_defaults=job_defaults, timezone=utc)
# import hashlib
# import requests
# L = []
# def get_nw_hash():
#     hash_string = hashlib.md5(requests.get('http://netwars.pl').text.encode('utf8')).hexdigest()
    
#     if hash_string in L:
#         L.append('oh now nw changed!')
#     else:
#         L.append(hash_string)    

# scheduler.add_job(get_nw_hash, 'interval', seconds=30)
# scheduler.start()

In [10]:
# Anonymous client: no credentials are passed, so login() is not usable here.
nw = NW()
# Download and parse one topic; returns (posts_list, topic_meta).
# 173399 is the topic number formatted into NW.BASE_URL_TOPIC.
posts, head = nw.topic_to_json(173399)
# Disabled experiment: index every scraped post into a local Elasticsearch
# instance (the elasticsearch import is not present in this notebook).
# 
# local_machine = {'host':'localhost','port':9200}
# es = elasticsearch.Elasticsearch(hosts=[local_machine])
# 
# for i,p in enumerate(posts):  
#     es.index(index='nw', doc_type='post', id=i, body=p)

In [1]:
import rq

In [8]:
import requests
from count_w import count_words_at_url
from bla import bla
from redis import Redis
from rq import Queue, job
# RQ queue named 'blabla2', backed by a Redis connection created with default arguments.
q = Queue(connection=Redis(),name='blabla2')

In [4]:
# Put the imported `bla` callable on the queue; `result` is the job handle returned by enqueue.
result = q.enqueue(bla)

In [5]:
# Inspect the job's `result` attribute (no output was captured for this cell).
result.result

In [13]:
# Show the id of the enqueued job.
result.get_id()

'50fb7127-4e0d-48cf-ac83-5ddd5f62bf09'

In [14]:
# Same call as the previous cell, re-run; the id shown is unchanged.
result.get_id()

'50fb7127-4e0d-48cf-ac83-5ddd5f62bf09'

In [20]:
# Re-fetch the job by the hard-coded id printed by result.get_id() above,
# using a fresh default Redis connection. The id only exists in the Redis
# instance that produced it, so this cell is not reproducible elsewhere.
j = job.Job.fetch('50fb7127-4e0d-48cf-ac83-5ddd5f62bf09', connection=Redis())
# Displayed as the cell output: whether the fetched job is marked as failed.
j.is_failed

False

In [16]:
# Show the `result` attribute of the fetched job.
j.result

'oooooooooook'