In [1]:
# systems
import os

from dateutil import parser

# scraping websites
import json
import requests
from bs4 import BeautifulSoup

# language processing
from sutime import SUTime

In [2]:
# Mailing list under analysis and the Mailman URL prefix associated with it.
list_id, base_url = "mitml", "http://mailman.mit.edu/mailman/private/"

# Local directory that holds the downloaded listings for this list.
local_dir = "../listings/"
local_list_dir = os.path.join(local_dir, list_id)
local_list_dir

'../listings/mitml'

In [3]:
# Import URLs and indices.
# urls.txt is read as one listing URL per line; the basename of each URL is
# used as the name of the locally stored copy of that listing.
with open(os.path.join(local_list_dir, 'urls.txt'), 'r', encoding='utf-8') as f:
    # Skip blank lines: an empty URL has an empty basename, which would make
    # the open() call below target the directory itself and fail.
    urls = [url for url in (line.rstrip() for line in f) if url]
indices = [os.path.basename(url) for url in urls]

# Import listing HTMLs, keyed by index so each parsed page stays paired with
# the URL it came from.
listings = {}
for url, index in zip(urls, indices):
    # The archived pages declare charset=utf-8 in their <meta> tag, so decode
    # explicitly instead of relying on the platform default encoding.
    with open(os.path.join(local_list_dir, index), 'r', encoding='utf-8') as f:
        listings[index] = {
            "url" : url,
            "html": BeautifulSoup(f.read(), 'html.parser')
        }

In [4]:
# Flat list of the parsed pages, in the same order the listings were loaded.
htmls = [entry['html'] for entry in listings.values()]

In [5]:
htmls[0]

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">

<html>
<head>
<title> [Mitml] Call for Papers: NIPS 2018 Workshop "All of Bayesian Nonparametrics (Especially the Useful Bits)"
   </title>
<link href="index.html" rel="Index"/>
<link href="mailto:mitml%40mit.edu?Subject=%5BMitml%5D%20Call%20for%20Papers%3A%20NIPS%202018%20Workshop%20%22All%20of%20Bayesian%0A%20Nonparametrics%20%28Especially%20the%20Useful%20Bits%29%22&amp;In-Reply-To=" rel="made"/>
<meta content="index,nofollow" name="robots"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="000479.html" rel="Previous"/>
</head>
<body bgcolor="#ffffff">
<h1>[Mitml] Call for Papers: NIPS 2018 Workshop "All of Bayesian Nonparametrics (Especially the Useful Bits)"</h1>
<b>Tamara Broderick</b>
<a href="mailto:mitml%40mit.edu?Subject=%5BMitml%5D%20Call%20for%20Papers%3A%20NIPS%202018%20Workshop%20%22All%20of%20Bayesian%0A%20Nonparametrics%20%28Especially%20the%20Useful%20Bits%29%22&amp;In-Reply-To=" title='

## Get title

In [10]:
# NOTE(review): `soup` is not assigned in any cell visible in this notebook;
# presumably it was one of the parsed listings in `htmls` -- confirm, and
# define it explicitly so this cell survives Restart & Run All.
soup.title.string

' [Mitml] (TODAY) Statistics Special Seminar - Prateek Jain (Microsoft Research) - Tuesday, Aug 21st@2:00pm, E18-304\n   '

In [15]:
soup.title

<title> [Mitml] (TODAY) Statistics Special Seminar - Prateek Jain (Microsoft Research) - Tuesday, Aug 21st@2:00pm, E18-304
   </title>

## Get time and date

In [51]:
# Location handed to SUTime as its `jars` argument.
jar_files = "../../packages/python-sutime/jars/"

# Keep the tagger options in one place so both construction paths below build
# an identically configured instance. Previously only the fallback path passed
# include_range=True, so the shape of DURATION values depended on whether this
# cell had already been run in the current kernel.
_sutime_kwargs = dict(jars=jar_files, mark_time_ranges=True, include_range=True)

try:
    sutime = SUTime(**_sutime_kwargs)
except OSError:
    # Presumably raised because the JVM is already running (e.g. the cell was
    # re-run) -- TODO confirm. Retry while telling SUTime not to start another.
    sutime = SUTime(jvm_started=True, **_sutime_kwargs)

    
def sutime_results(text, reference='', sutime=sutime):
    """Run SUTime over ``text`` and return whatever ``sutime.parse`` returns.

    Parameters
    ----------
    text : str
        Text to scan for temporal expressions.
    reference : str, date, datetime or None, optional
        Reference date forwarded to ``sutime.parse``. An empty string or
        ``None`` means "no reference": ``parse`` is then called with the text
        only. Objects exposing ``isoformat()`` (``date`` / ``datetime``) are
        converted to their ISO string first.
    sutime : optional
        Tagger to use. The default is the module-level ``sutime`` object as it
        existed when this function was defined, so re-run this cell after
        rebuilding the tagger.

    Returns
    -------
    The result of ``sutime.parse`` (a list of match dicts in this notebook).
    """
    # Accept date/datetime objects as well as pre-formatted strings.
    if hasattr(reference, 'isoformat'):
        reference = reference.isoformat()

    # `not reference` covers both '' and None; len(None) used to raise.
    if not reference:
        return sutime.parse(text)
    return sutime.parse(text, reference)

In [81]:
# Pick a single archived message to experiment on.
html = htmls[64]

# For this message the first <i> element carries the sent timestamp and the
# first <pre> element carries the message body (see the printed output).
sent_time, body_text = html.i.text, html.pre.text

print(sent_time, body_text, sep='\n')

Fri Dec  8 11:08:18 EST 2017
Join us for a talk with Benjamin Recht on Friday, 12/15
3:15 PM in classroom 32-155
Hosts: Leslie Kaelbling and Pablo Parrilo

Title: The statistical foundations of learning to control

Abstract: Given the dramatic successes in machine learning and reinforcement learning over the past half decade, there has been a resurgence of interest in applying these techniques to continuous control problems in robotics, self-driving cars, and unmanned aerial vehicles.  Though such control applications appear to be straightforward generalizations of standard reinforcement learning, few fundamental baselines have been established prescribing how well one must know a system in order to control it.  In this talk, I will discuss how one might merge techniques from statistical learning theory with robust control to derive such baselines for such continuous control.  I will explore several examples that balance parameter identification against controller design and demonstrat

In [82]:
# Get email timestamp.
# SUTime normalises the header string first; dateutil then parses that value
# so it can be re-serialised as an ISO 8601 string.
sent_time_match = sutime_results(sent_time)[0]
sent_datetime = parser.parse(sent_time_match['value'])
reference_time = sent_datetime.isoformat()
reference_time

'2017-12-08T11:08:18-05:00'

In [84]:
matches = sutime_results(body_text, reference_time)

# Partition the matches by SUTime type: DATE and DURATION get their own
# lists, every other type is collected in `other_matches`.
date_matches, duration_matches, other_matches = [], [], []
_bucket_by_type = {'DATE': date_matches, 'DURATION': duration_matches}

for match in matches:
    _bucket_by_type.get(match['type'], other_matches).append(match)

# TODO: convert DATE values with dateutil (parser.parse(match["value"],
# fuzzy=True)); symbolic values such as 'PAST_REF' will need handling first.

In [86]:
matches

[{'start': 42,
  'end': 48,
  'text': 'Friday',
  'type': 'DATE',
  'value': '2017-12-08'},
 {'start': 56,
  'end': 63,
  'text': '3:15 PM',
  'type': 'TIME',
  'value': '2017-12-08T15:15'},
 {'start': 277,
  'end': 285,
  'text': 'the past',
  'type': 'DATE',
  'value': 'PAST_REF'},
 {'start': 291,
  'end': 297,
  'text': 'decade',
  'type': 'DURATION',
  'value': 'P10Y'},
 {'start': 1915, 'end': 1919, 'text': '2012', 'type': 'DATE', 'value': '2012'},
 {'start': 1976, 'end': 1980, 'text': '2014', 'type': 'DATE', 'value': '2014'},
 {'start': 1998, 'end': 2002, 'text': '2015', 'type': 'DATE', 'value': '2015'}]

In [85]:
date_matches

[{'start': 42,
  'end': 48,
  'text': 'Friday',
  'type': 'DATE',
  'value': '2017-12-08'},
 {'start': 277,
  'end': 285,
  'text': 'the past',
  'type': 'DATE',
  'value': 'PAST_REF'},
 {'start': 1915, 'end': 1919, 'text': '2012', 'type': 'DATE', 'value': '2012'},
 {'start': 1976, 'end': 1980, 'text': '2014', 'type': 'DATE', 'value': '2014'},
 {'start': 1998, 'end': 2002, 'text': '2015', 'type': 'DATE', 'value': '2015'}]

In [80]:
duration_matches

[{'start': 315,
  'end': 333,
  'text': '3:00 PM to 4:00 PM',
  'type': 'DURATION',
  'value': {'end': 'T16:00', 'begin': 'T15:00'}},
 {'start': 412,
  'end': 424,
  'text': 'recent years',
  'type': 'DURATION',
  'value': 'PXY'}]

In [79]:
# NOTE(review): fuzzy-parsing the raw range text is lossy. The recorded output
# for '3:00 PM to 4:00 PM' is a single datetime at 04:00 (on what appears to be
# the day the cell was run), not a 15:00-16:00 range. The match's structured
# 'value' ({'begin': 'T15:00', 'end': 'T16:00'}) already carries both endpoints.
parser.parse(duration_matches[0]['text'],fuzzy=True)

datetime.datetime(2018, 8, 23, 4, 0)

In [67]:
parser.parse("2018").isoformat()

'2018-08-23T00:00:00'

In [50]:
sutime_results(body_text, reference_time)

[{'start': 109,
  'end': 126,
  'text': 'Monday, June 11th',
  'type': 'DATE',
  'value': '2018-06-11'},
 {'start': 132,
  'end': 142,
  'text': '9:30am',
  'type': 'TIME',
  'value': '2018-05-24T09:30'},
 {'start': 286,
  'end': 306,
  'text': 'the end of the month',
  'type': 'DATE',
  'value': '2018-05'},
 {'start': 1655,
  'end': 1676,
  'text': 'Monday, June 11, 2018',
  'type': 'DATE',
  'value': '2018-06-11'},
 {'start': 1677,
  'end': 1700,
  'text': 'from 9:30 AM to 7:00 PM',
  'type': 'DURATION',
  'value': {'end': 'T19:00', 'begin': 'T09:30'}},
 {'start': 1860,
  'end': 1865,
  'text': '02142',
  'type': 'DATE',
  'value': '2142'},
 {'start': 4225,
  'end': 4242,
  'text': 'Monday, June 11th',
  'type': 'DATE',
  'value': '2018-06-11'},
 {'start': 4247,
  'end': 4252,
  'text': 'a day',
  'type': 'DURATION',
  'value': 'P1D'},
 {'start': 4483,
  'end': 4499,
  'text': 'the month of May',
  'type': 'DATE',
  'value': '2018-05'},
 {'start': 4515,
  'end': 4522,
  'text': 'Curr

In [57]:
# NOTE(review): `tmp` is not assigned in any cell visible in this notebook --
# presumably another parsed listing; confirm and define it explicitly.
# The [-10:] slice keeps only the last 10 characters of the SUTime value; for
# the value displayed in the following cell that is the 'XXXX-04-25' tail,
# which dateutil then parses fuzzily.
parser.parse(sutime_results(tmp.pre.text)[0]['value'][-10:], fuzzy=True)

datetime.datetime(2018, 4, 25, 0, 0)

In [58]:
sutime_results(tmp.pre.text)[0]['value']

'THIS XXXX-WXX-3 INTERSECT XXXX-04-25'

In [65]:
# NOTE(review): this reaches into the wrapper's private `_sutime` object and
# passes the raw `sent_time` header string as the reference. The recorded
# result is the unresolved value 'XXXX', whereas cells that pass an ISO-format
# reference get a concrete date -- presumably the reference must be ISO
# formatted; confirm against the SUTime wrapper documentation.
json.loads(sutime._sutime.annotate("today", sent_time))

[{'start': 0, 'end': 5, 'text': 'today', 'type': 'DATE', 'value': 'XXXX'}]

In [62]:
sutime_results(sent_time)[0]['value']

'2018-04-21T16:59:00-0500'

In [9]:
sutime._sutime.annotate("today")

'[{"start":0,"end":5,"text":"today","type":"DATE","value":"2018-08-23"}]'

In [11]:
from datetime import date

date(2017, 1, 9)


datetime.date(2017, 1, 9)

In [17]:
def input_today():
    """Return a fixed sample sentence that mentions "today"."""
    sample_sentence = 'I have written a test today.'
    return sample_sentence

result = sutime.parse("today", date(2017, 1, 9).isoformat())

In [15]:
date(2017, 1, 9).isoformat()

'2017-01-09'

In [22]:
# `reference_time` is stored as an ISO 8601 string by the "Get email
# timestamp" cell, so calling .isoformat() on it raises AttributeError when the
# notebook is run top to bottom. Only convert when handed a date/datetime.
sutime.parse(
    "today",
    reference_time if isinstance(reference_time, str) else reference_time.isoformat(),
)

[{'start': 0,
  'end': 5,
  'text': 'today',
  'type': 'DATE',
  'value': '2018-08-21'}]