In [44]:
import pandas as pd
import json
import urllib.request
import datetime
import requests
from lxml import html
from datetime import timezone,timedelta, timezone
import re
import boto3
import base64
import parse

NEWS_BASE_URL_API = 'https://dailyfeeds.co.th'
UTC_OFFSET = 7
FETCH_INTERVAL_MINUTES = 600

page_url = 'https://dailynews.co.th/entertainment'
root_url = 'https://dailynews.co.th'
firehose_name = ''
send_to_firehose = False

def thai_strp_time(text,form):
    thai_full_months = [
    "มกราคม",
    "กุมภาพันธ์",
    "มีนาคม",
    "เมษายน",
    "พฤษภาคม",
    "มิถุนายน",
    "กรกฎาคม",
    "สิงหาคม",
    "กันยายน",
    "ตุลาคม",
    "พฤศจิกายน",
    "ธันวาคม",]
    thai_full_weekdays = [
    "วันจันทร์",
    "วันอังคาร",
    "วันพุธ",
    "วันพฤหัสบดี",
    "วันศุกร์",
    "วันเสาร์",
    "วันอาทิตย์",]
    r = parse.parse(form,text)
    td = r.named
    if 'A' in td:
        weekd_id = thai_full_weekdays.index(td['A']) + 1
    if 'B' in td:
        month_id = thai_full_months.index(td['B']) + 1
    if 'BE' in td:
        year_ad  = td['BE'] - 543
    if 'd' in td:
        date = td['d']
    if 'h' in td:
        hours = td['h']
    if 'm' in td:
        minutes = td['m']
    return(datetime.datetime(year_ad,month_id,date,hour=hours,minute=minutes,tzinfo=timezone(timedelta(hours=7))))

def getFeedPage(feeds_url,root_url=None):
    if root_url is None:
        root_url = feeds_url
    feeds_html = urllib.request.urlopen(feeds_url).read()
    tree = html.fromstring(feeds_html)
    feeds_html = tree.cssselect('#top-section > div.left > section > div > article')
    feeds = []
    for n in feeds_html:
        try:
            feeds.append({
                'feeds_url'  : root_url + n.cssselect('a')[0].get('href'),
                'th_date'   : n.cssselect('a > div.media-body > span')[0].text,
                'title'     : n.cssselect('a > div.media-body > h3')[0].text,
                'image_url' : root_url + n.cssselect('a > div.media-left > img')[0].get('src'),
                'abstract'  : n.cssselect('a > div.media-body > p')[0].text,
                'source'    : 'Dailynews'
            })
        except:
            continue
    return pd.DataFrame(feeds)

def getContent(feeds_url):
    feeds_html = urllib.request.urlopen(feeds_url).read()
    tree = html.fromstring(feeds_html)
    tmp_str = tree.xpath('//*[@id="news-article"]/section[3]/div/text()')
    tmp_str = [t.strip() for t in tmp_str]
    t2 = []
    for t in tmp_str:
        if t != '':
            t2.append(t.strip())

    content = "\n\t".join(t2)
    return pd.Series({'content':content})


def main():
    feeds = getFeedPage(page_url,root_url=root_url)
    feeds.loc[:,'time'] = feeds['th_date'].apply(lambda a : thai_strp_time('วัน'+a, '{A}ที่ {d:d} {B} {BE:d} เวลา {h:d}.{m:d} น. '))
    feeds.loc[:,'lastModified'] = feeds['time'].apply(str)

    t_delta = timedelta(minutes=-FETCH_INTERVAL_MINUTES)
    t_now = datetime.datetime.now(tz=timezone(timedelta(hours=UTC_OFFSET)))
    t_old = t_now+t_delta
    data_feeds = feeds[feeds['time'] > t_old]

    data_feeds.loc[:,'content'] = data_feeds['feeds_url'].apply(getContent)

    if len(data_feeds) > 0 and send_to_firehose:
        data_json = data_feeds.to_json(orient='records').encode('utf-8')
        data_byte = base64.b64encode(data_json)
        firehose_client = boto3.client('firehose')
        response = data_json
        response = firehose_client.put_record(
            DeliveryStreamName=firehose_name,
            Record={'Data': data_byte})
    else:
        response = 'No feed founds'
    return data_feeds


In [45]:
main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,abstract,feeds_url,image_url,source,th_date,title,time,lastModified,content
0,"""เมฆ วินัย"" โพสต์ภาพสุดอบอุ่น ลูกชายมาหาและกอด...",https://dailynews.co.th/entertainment/718844,https://dailynews.co.th/admin/upload/20190706/...,Dailyfeeds,เสาร์ที่ 6 กรกฎาคม 2562 เวลา 12.04 น.,"ไม่เคยรังเกียจ ""เมฆ-วินัย"" สุดดีใจลูกมาหา-มากอด",2019-07-06 12:04:00+07:00,2019-07-06 12:04:00+07:00,
1,คนบันเทิงร่วมไว้อาลัยการจากไปของ คุณย่าภา คุณย...,https://dailynews.co.th/entertainment/718827,https://dailynews.co.th/admin/upload/20190706/...,Dailyfeeds,เสาร์ที่ 6 กรกฎาคม 2562 เวลา 10.50 น.,คนบันเทิงอาลัยคุณย่า'มดดำ' สัญญา'จะอยู่ต่อไปให...,2019-07-06 10:50:00+07:00,2019-07-06 10:50:00+07:00,


In [46]:
getContent('https://dailynews.co.th/entertainment/718844')

content    
dtype: object

In [48]:
feeds_url = 'https://dailynews.co.th/entertainment/718844'
feeds_html = urllib.request.urlopen(feeds_url).read()
tree = html.fromstring(feeds_html)
tmp_str = tree.xpath('//*[@id=news-article"]/section[3]/div/text()')
tmp_str = [t.strip() for t in tmp_str]
t2 = []
for t in tmp_str:
    if t != '':
        t2.append(t.strip())

content = "\n\t".join(t2)


In [51]:
tmp_str

[]