In [26]:
import requests
import bs4 as bs
import re
from typing import Generator

### Data Sources

- Verbatim archives, i.e. debates by time and day http://164.100.47.5/newsite/debatenew/newshow.aspx?arch=245
    - This includes the un edited debates noted down during the conduct by hours of the days
    - We can get what organisations are being talked about and acts/bills
    - People who are speaking
    - when interuptions are happening
    - The number of hours the rajya sabha worked
- Date Wise Full day edited debates http://164.100.47.5/newsite/floor_official_debate/floor_official_debate.aspx
    - Index of everything laid on the table with page indices
    - More concise

In [2]:
url = 'http://164.100.47.5/newsite/floor_official_debate/floor_official_debate.aspx'

In [3]:
def get_soup(url):
    get_resp = requests.get(url)
    soup = bs.BeautifulSoup(get_resp.text)
    return soup

def extract_aspx_variables_from_soup(soup, var):
    return soup.find('input', {'id':var}).attrs['value']

In [32]:
def get_session_id_resp(session_id:int) -> requests.models.Response:
    url = 'http://164.100.47.5/newsite/floor_official_debate/floor_official_debate.aspx'
    get_resp_soup = get_soup(url)
    data = {'ctl00$ContentPlaceHolder1$DropDownList1': session_id,
           '__VIEWSTATE': extract_aspx_variables_from_soup(get_resp_soup, '__VIEWSTATE'),
           '__VIEWSTATEGENERATOR': extract_aspx_variables_from_soup(get_resp_soup, '__VIEWSTATEGENERATOR'),
           '__EVENTVALIDATION': extract_aspx_variables_from_soup(get_resp_soup, '__EVENTVALIDATION'),
           '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$DropDownList1'}
    resp = requests.post(url, data=data)
    return resp


def get_pdf_links(resp: requests.models.Response) -> Generator[bs.element.Tag, None, None]:
    soup = bs.BeautifulSoup(resp.text)
    for link in soup.findAll('a', attrs={'href': re.compile("^http://.*pdf")}):
        if 'Debate' in link.get('href'):
            yield link

In [63]:
def filter_debate_pdf_links(pdf_links):
    for link in pdf_links:
        if 'Debate' in link.get('href'):
            yield link

In [77]:
def get_sessions():
    url = 'http://164.100.47.5/newsite/floor_official_debate/floor_official_debate.aspx'
    get_resp_soup = get_soup(url)
    session_options = get_resp_soup.find(
        'select',
        {'id': 'ctl00_ContentPlaceHolder1_DropDownList1'}
    ).findAll('option')
    return (opt.text for opt in session_options if 'Select' not in opt.text)

In [78]:
session_id = 246
resp = get_session_id_resp(session_id)
if resp.status_code != 200:
    raise Exception(resp.text)
pdf_links = filter_debate_pdf_links(get_pdf_links(resp))

In [79]:
pdf_links

<generator object filter_debate_pdf_links at 0x10e1cda50>

In [80]:
list(pdf_links)

http://rajyasabha.nic.in/rsnew/question/mini_nodal_officers.pdf
http://rajyasabha.nic.in/rsnew/member_site/newlob/party_position.pdf
http://rajyasabha.nic.in/rsnew/members/entitlement_salary_mp.pdf
http://rajyasabha.nic.in/rsnew/members/entitlement_salary_exmp.pdf
http://rajyasabha.nic.in/rsnew/Parliament_of_India.pdf
http://rajyasabha.nic.in/rsnew/two_house_parliament.pdf
http://164.100.47.7/Pricelist/PriceList/E_Souvenir_Item.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F10.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F09.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F08.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F07.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F06.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F03.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F02.08.2018.pdf
http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F01.08.2018.pdf
http://164.1

[<a href="http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F10.08.2018.pdf" id="ctl00_ContentPlaceHolder1_GridView4_ctl02_hp2" target="_blank"><img id="ctl00_ContentPlaceHolder1_GridView4_ctl02_img1" src="images/download.png" style="width :30px"/>(1729 Kb)</a>,
 <a href="http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F09.08.2018.pdf" id="ctl00_ContentPlaceHolder1_GridView4_ctl03_hp2" target="_blank"><img id="ctl00_ContentPlaceHolder1_GridView4_ctl03_img1" src="images/download.png" style="width :30px"/>(6164 Kb)</a>,
 <a href="http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F08.08.2018.pdf" id="ctl00_ContentPlaceHolder1_GridView4_ctl04_hp2" target="_blank"><img id="ctl00_ContentPlaceHolder1_GridView4_ctl04_img1" src="images/download.png" style="width :30px"/>(738 Kb)</a>,
 <a href="http://164.100.47.5/Official_Debate_Nhindi/Floor/246/F07.08.2018.pdf" id="ctl00_ContentPlaceHolder1_GridView4_ctl05_hp2" target="_blank"><img id="ctl00_ContentPlaceHolder1_GridView4_ctl05_img1"

In [94]:
import os
import pandas as pd
from datetime import datetime

In [91]:
base_dir = '../sessions'

sessions = pd.DataFrame()
for session_file in os.listdir(base_dir):
    sessions = pd.concat([sessions, pd.read_csv(f'{base_dir}/{session_file}')])

In [99]:
sessions['date'] = sessions.date.apply(lambda x: datetime.strptime(x.strip('F'), '%d.%m.%Y'))

In [100]:
sessions

Unnamed: 0,session_id,date,url,size
0,116,1980-12-24 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(5825 Kb)
1,116,1980-12-23 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(3738 Kb)
2,116,1980-12-22 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(10298 Kb)
3,116,1980-12-19 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(6683 Kb)
4,116,1980-12-18 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(6937 Kb)
...,...,...,...,...
9,133,1985-03-18 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(4252 Kb)
10,133,1985-03-16 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(12 Kb)
11,133,1985-03-15 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(4698 Kb)
12,133,1985-03-14 00:00:00,http://164.100.47.5/Official_Debate_Nhindi/Flo...,(6722 Kb)


In [101]:
sessions.to_csv('sessions.csv')

In [1]:
sessions.url.iloc

NameError: name 'sessions' is not defined