In [161]:
import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString


In [155]:
def process_post_bodies(bodies):
    """Yield ``(text, cites)`` for every post body element in ``bodies``.

    Parameters
    ----------
    bodies : iterable
        Parsed post-body elements (the ``div.post_body`` tags of a topic page).
        Each one must support ``findAll`` and iteration over its direct children.

    Yields
    ------
    tuple of (str, list)
        ``text`` is the concatenation of the body's direct children, skipping
        ``<div>`` and ``<p>`` children. ``cites`` holds the ``name`` attribute of
        every ``div.cite`` found anywhere inside the body (empty list if none).

    Raises
    ------
    KeyError
        If a ``div.cite`` element has no ``name`` attribute.
    """
    for body in bodies:
        cites = [cite['name'] for cite in body.findAll('div', {'class': 'cite'})]

        collect_text = []
        for node in body:
            if isinstance(node, NavigableString):
                # Handle plain strings first and exclusively: in BeautifulSoup
                # releases where NavigableString also exposes ``.text``, testing
                # ``hasattr(node, 'text')`` as well would add the string twice.
                collect_text.append(str(node))
            elif node.name not in ('div', 'p') and hasattr(node, 'text'):
                # Inline tags contribute their rendered text; div/p children
                # are left out of the post text.
                collect_text.append(node.text)

        yield ''.join(collect_text), cites

            
class NW(object):
    """Small scraper/client for the netwars.pl forum.

    Anonymous use (``NW()``) is enough for reading topics and the home page;
    ``login()`` additionally needs a username and password.
    """
    BASE_URL = 'http://netwars.pl/'
    BASE_URL_TOPIC = 'http://netwars.pl/temat/{!s}'
    OT_FORUM_NUMBER = '4'

    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password
        self.logged_in = False
        # Set by login(); None means no session has been opened yet.
        self.nwsession = None

    def __repr__(self):
        # Use the bare class name so the repr reads NW('user') rather than
        # "<class '...NW'>('user')".
        return '%s(%r)' % (self.__class__.__name__, self.username)

    @staticmethod
    def _url_to_soup(url):
        """Download ``url`` and parse the response text with the lxml parser."""
        return BeautifulSoup(requests.get(url).text, 'lxml')

    @staticmethod
    def _topic_differences(old, new):
        """
        Return topics that changed between two scrapes.

        ``old`` and ``new`` are dicts as returned by ``_topics_and_post_number``.
        The result is a keys view of every entry of ``new`` whose (key, value)
        pair is not present in ``old`` - i.e. new topics and topics whose value
        changed. Values must be hashable.
        """
        changed_items = set(new.items()) - set(old.items())
        return dict(changed_items).keys()

    @staticmethod
    def _live_user_differences(old, new):
        """Return True when both scrapes contain the same set of users.

        Note that despite the name this is an equality check: ``True`` means
        there is *no* difference between ``old`` and ``new``.
        """
        return set(old) == set(new)

    @staticmethod
    def _topic_soup_to_json(soup):
        """Convert a parsed topic page into ``(posts_list, topic_meta)``.

        ``posts_list`` is a list of dicts with the keys ``topic_id``,
        ``forum_id``, ``post_id``, ``post_date``, ``user_href``, ``user_name``,
        ``post_body`` and ``cites``; ``topic_meta`` is a dict with ``forum_id``,
        ``topic_name`` and ``topic_id``.

        Raises ``ValueError`` when the page text contains 'Nie znaleziono'
        (Polish for "not found"), and ``IndexError`` when the expected
        ``topic_*`` div or navigation list is missing.
        """
        if 'Nie znaleziono' in soup.text:
            raise ValueError('Topic does not exist')

        # The numeric topic id is the suffix of the first div id containing 'topic_'.
        topic_div_ids = [
            div.get('id')
            for div in soup.findAll('div')
            if 'topic_' in div.get('id', 'not-relevant')
        ]
        topic_id = int(topic_div_ids[0].split('_')[-1])

        navi_list = soup.findAll('ul', {'class': 'forum_navi'})[0].findAll('li')
        forum_id = navi_list[1].a['href']
        topic_name = navi_list[2].text

        post_divs = soup.findAll('div', {'class': 'post'})
        date_divs = soup.findAll('div', {'class': 'p2_data'})
        nick_divs = soup.findAll('div', {'class': 'p2_nick'})
        body_divs = soup.findAll('div', {'class': 'post_body'})

        # The four element lists are matched up positionally; zip() stops at
        # the shortest one.
        posts_list = [
            {
                'topic_id': topic_id,
                'forum_id': forum_id,
                'post_id': post_div['id'].split('_')[-1],
                'post_date': date_div.text,
                'user_href': nick_div.a['href'],
                'user_name': nick_div.a.text,
                'post_body': body,
                'cites': cites,
            }
            for post_div, date_div, nick_div, (body, cites) in zip(
                post_divs, date_divs, nick_divs, process_post_bodies(body_divs)
            )
        ]

        topic_meta = {
            'forum_id': forum_id,
            'topic_name': topic_name,
            'topic_id': topic_id,
        }

        return posts_list, topic_meta

    def login(self):
        """Open a ``requests`` session and post the credentials to ``login``.

        Raises ``ValueError`` when the username or the password is missing.
        The server response is not inspected, so ``logged_in`` only records
        that the login request was sent.
        """
        if not self.username or not self.password:
            raise ValueError('No cred')

        payload = {
            'tnick': self.username,
            'tpass': self.password,
        }

        nwsession = requests.session()
        nwsession.post(urljoin(self.BASE_URL, 'login'), payload)
        self.nwsession = nwsession
        self.logged_in = True

    def logout(self):
        """Post to ``logout`` on the open session, if any, and clear ``logged_in``."""
        session = getattr(self, 'nwsession', None)
        if session is not None:
            session.post(urljoin(self.BASE_URL, 'logout'))
        self.logged_in = False

    def topic_to_json(self, topic_number):
        """Fetch topic ``topic_number`` and return ``(posts_list, topic_meta)``."""
        soup = self._url_to_soup(self.BASE_URL_TOPIC.format(topic_number))
        return self._topic_soup_to_json(soup)

    @staticmethod
    def _list_of_active_users(base_soup):
        """Return the hrefs of the links in the ``#footer`` div, minus the first two.

        Raises ``IndexError`` when the page has no ``#footer`` div.
        """
        footer = base_soup.findAll('div', attrs={'id': 'footer'})[0]
        # The first two footer links are skipped (presumably not user links).
        return [a['href'] for a in footer.findAll('a')][2:]

    @staticmethod
    def _topics_and_post_number(base_soup):
        """Map each topic id listed on the page to the number in its posts cell.

        Topic ids come from the last path segment of the link in every
        ``td.topic`` cell, counts from the text of every ``td.posts`` cell;
        both lists are paired positionally.
        """
        topic_ids = [
            int(cell.a['href'].split('/')[-1])
            for cell in base_soup.findAll('td', {'class': 'topic'})
        ]
        # Read the counts from the soup that was passed in, not from a global.
        number_of_posts = [
            int(cell.text)
            for cell in base_soup.findAll('td', {'class': 'posts'})
        ]
        return dict(zip(topic_ids, number_of_posts))

    def home_page_status(self):
        """Scrape the home page and return ``(topics, users)``.

        ``topics`` is the dict from ``_topics_and_post_number`` and ``users``
        the list from ``_list_of_active_users``.
        """
        soup = self._url_to_soup(self.BASE_URL)
        topics = self._topics_and_post_number(soup)
        users = self._list_of_active_users(soup)
        return topics, users
    

        
        



In [120]:
nw = NW()
post_list, meta = nw.topic_to_json(173469)
# Scrape the home page once and unpack that same result, so `k` and `a, b`
# describe the same snapshot instead of two separate requests.
k = nw.home_page_status()
a, b = k



In [144]:
import redis 
import json
# Redis client constructed with no arguments (library default connection settings).
r  = redis.Redis()



TypeError: the JSON object must be str, not 'bytes'

In [99]:
# Scratch check of the set-difference trick used by NW._topic_differences:
# turn both dicts into sets of (key, value) pairs and keep the pairs that
# appear only in the newer dict.
a = dict(a=1, b=2, c=3)
b = dict(a=2, b=2, c=3, d=1, x=12)

set_a = set(a.items())
set_b = set(b.items())

dict(set_b.difference(set_a))

{'a': 2}

In [333]:
from pytz import utc
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore
from apscheduler.executors.pool import ProcessPoolExecutor

# Scheduler configuration.
# Jobs are kept in a Redis job store; a mongodb store and
# SQLAlchemyJobStore(url='sqlite:///jobs.sqlite') were earlier alternatives.
jobstores = dict(default=RedisJobStore())

# Two executors: a 20-worker thread pool as the default and a 5-worker
# process pool registered under the alias 'processpool'.
executors = dict(
    default=dict(type='threadpool', max_workers=20),
    processpool=ProcessPoolExecutor(max_workers=5),
)

job_defaults = dict(coalesce=False, max_instances=3)

# A BackgroundScheduler is used here; BlockingScheduler() was the earlier alternative.
scheduler = BackgroundScheduler(
    jobstores=jobstores,
    executors=executors,
    job_defaults=job_defaults,
    timezone=utc,
)
import hashlib
import requests
# Observation log filled by get_nw_hash(): page hashes, with a marker string
# inserted before every hash that differs from the previous one.
L = []
def get_nw_hash():
    """Hash the netwars.pl front page and log it when it differs from the last hash.

    The first observed hash is appended to ``L`` as-is. On later runs nothing is
    appended while the page is unchanged; when the hash differs from the most
    recently logged hash, the marker ``'oh now nw changed!'`` is appended
    followed by the new hash.
    """
    change_marker = 'oh now nw changed!'
    page_text = requests.get('http://netwars.pl').text
    hash_string = hashlib.md5(page_text.encode('utf8')).hexdigest()

    # Most recent real hash in the log (marker entries are skipped).
    last_hash = next((entry for entry in reversed(L) if entry != change_marker), None)

    if hash_string == last_hash:
        # Same content as last time: the page did not change.
        return
    if last_hash is not None:
        L.append(change_marker)
    L.append(hash_string)

# The default job store is persistent (Redis), so the job gets an explicit id
# and replace_existing=True; otherwise every re-run of this cell would add
# another copy of the same 30-second job to the store.
scheduler.add_job(
    get_nw_hash,
    'interval',
    seconds=30,
    id='get_nw_hash',
    replace_existing=True,
)
scheduler.start()



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


['/user/27684',
 '/user/16893',
 '/user/28070',
 '/user/29418',
 '/user/31530',
 '/user/31908',
 '/user/7743',
 '/user/29465',
 '/user/16019',
 '/user/29851',
 '/user/29787',
 '/user/6046',
 '/user/31207',
 '/user/32012',
 '/user/26840',
 '/user/9877',
 '/user/27917',
 '/user/31143',
 '/user/5923',
 '/user/10734',
 '/user/21998',
 '/user/32039',
 '/user/29358',
 '/user/18347',
 '/user/15662',
 '/user/32137',
 '/user/17841',
 '/user/808',
 '/user/4093',
 '/user/31924',
 '/user/15013',
 '/user/27459',
 '/user/13142',
 '/user/11224',
 '/user/9926',
 '/user/7201',
 '/user/398',
 '/user/4053',
 '/user/28771',
 '/user/12677',
 '/user/31646',
 '/user/27782',
 '/user/28071',
 '/user/2656',
 '/user/31392',
 '/user/27403',
 '/user/584',
 '/user/9649']

In [None]:
from pytz import utc
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.jobstores.redis import RedisJobStore
from apscheduler.executors.pool import ProcessPoolExecutor

# Scheduler configuration.
# Jobs are kept in a Redis job store; a mongodb store and
# SQLAlchemyJobStore(url='sqlite:///jobs.sqlite') were earlier alternatives.
jobstores = dict(default=RedisJobStore())

# Two executors: a 20-worker thread pool as the default and a 5-worker
# process pool registered under the alias 'processpool'.
executors = dict(
    default=dict(type='threadpool', max_workers=20),
    processpool=ProcessPoolExecutor(max_workers=5),
)

job_defaults = dict(coalesce=False, max_instances=3)

# A BackgroundScheduler is used here; BlockingScheduler() was the earlier alternative.
scheduler = BackgroundScheduler(
    jobstores=jobstores,
    executors=executors,
    job_defaults=job_defaults,
    timezone=utc,
)
import hashlib
import requests
# Observation log filled by get_nw_hash(): page hashes, with a marker string
# inserted before every hash that differs from the previous one.
L = []
def get_nw_hash():
    """Hash the netwars.pl front page and log it when it differs from the last hash.

    The first observed hash is appended to ``L`` as-is. On later runs nothing is
    appended while the page is unchanged; when the hash differs from the most
    recently logged hash, the marker ``'oh now nw changed!'`` is appended
    followed by the new hash.
    """
    change_marker = 'oh now nw changed!'
    page_text = requests.get('http://netwars.pl').text
    hash_string = hashlib.md5(page_text.encode('utf8')).hexdigest()

    # Most recent real hash in the log (marker entries are skipped).
    last_hash = next((entry for entry in reversed(L) if entry != change_marker), None)

    if hash_string == last_hash:
        # Same content as last time: the page did not change.
        return
    if last_hash is not None:
        L.append(change_marker)
    L.append(hash_string)

# The default job store is persistent (Redis), so the job gets an explicit id
# and replace_existing=True; otherwise every re-run of this cell would add
# another copy of the same 30-second job to the store.
scheduler.add_job(
    get_nw_hash,
    'interval',
    seconds=30,
    id='get_nw_hash',
    replace_existing=True,
)
scheduler.start()

In [336]:
# NOTE(review): the visible notebook only defines `process_post_bodies` (plural);
# `process_post_body` presumably survives from an earlier kernel state, so this
# line would raise NameError after Restart & Run All - confirm and rename or drop.
print(process_post_body)

<function process_post_body at 0x7ff3a6892d08>


NameError: name 'unicode' is not defined

In [145]:
import elasticsearch
from elasticsearch_dsl import Search, Q, MultiSearch
import json
import subprocess
import logstash
import requests

In [157]:
# Anonymous client (no credentials); fetch topic 173399 as (posts, topic metadata).
nw = NW()
posts, head = nw.topic_to_json(173399)


# topic_number = list(filter(
#                 lambda x: 'topic_' in x, [
#                     d.get('id','not-relevant') 
#                     for d in soup.findAll('div')
#                 ]))
# topic_number

In [159]:




# Elasticsearch client pointed at a single node on localhost:9200.
local_machine = dict(host='localhost', port=9200)
es = elasticsearch.Elasticsearch(hosts=[local_machine])


In [160]:
# Index every scraped post into the 'nw' index. The forum's own post id is used
# as the document id: a positional index (0, 1, 2, ...) would make posts from a
# second topic overwrite the documents of the first one.
for p in posts:
    es.index(index='nw', doc_type='post', id=p['post_id'], body=p)

    