In [1]:
# systems
import os, sys

from dateutil import parser

# scraping websites
import json
import requests
from bs4 import BeautifulSoup

# language processing
from sutime import SUTime

# local project code: put ../src on the import path
sys.path.insert(0, '../src/')
from listing import *

In [2]:
# Identifier of the mailing list; used both as the sub-directory name below
# and as the first argument to Listing(...) in later cells.
list_id = "mitml"
# Common prefix of the archive URLs stored in urls.txt.
# NOTE(review): not referenced by any other cell visible in this notebook.
base_url = "http://mailman.mit.edu/mailman/private/"

# Local folder holding per-list data; each list gets its own sub-directory.
local_dir = '../listings/'
local_list_dir = os.path.join(local_dir, list_id)
# Display the resolved directory as the cell output.
local_list_dir

'../listings/mitml'

In [3]:
# Read the saved listing URLs (one per line, trailing whitespace stripped).
with open(os.path.join(local_list_dir, 'urls.txt'), 'r') as f:
    urls = [line.rstrip() for line in f]

# A listing's index is the last path component of its URL.
indices = [os.path.basename(url) for url in urls]

In [4]:
urls[:5]

['http://mailman.mit.edu/mailman/private/mitml/2018-August/000480.html',
 'http://mailman.mit.edu/mailman/private/mitml/2018-August/000479.html',
 'http://mailman.mit.edu/mailman/private/mitml/2018-July/000478.html',
 'http://mailman.mit.edu/mailman/private/mitml/2018-June/000477.html',
 'http://mailman.mit.edu/mailman/private/mitml/2018-June/000476.html']

# Rooms

In [5]:
listing = Listing(list_id, '000480.html')

In [6]:
listing.highlight_text(listing.title)

[Mitml] Call for Papers: NIPS [1;31m2018[0m Workshop "All of Bayesian Nonparametrics (Especially the Useful Bits)"


# Time/date

In [7]:
# Walk the first 20 indices and show the parsed metadata of every listing
# that is flagged as a talk.
for index in indices[:20]:
    listing = Listing(list_id, index)

    # Skip anything that is not a talk announcement.
    if not listing.is_talk:
        continue

    print(listing.url)
    print(listing.get_parsed_metadata())
    print()
        

http://mailman.mit.edu/mailman/private/mitml/2018-August/000479.html
{'summary': 'Statistics Special Seminar - Prateek Jain (Microsoft Research) - Tuesday, Aug 21st@2:00pm, E18-304', 'description': 'http://mailman.mit.edu/mailman/private/mitml/2018-August/000479.html', 'location': 'E18-304', 'start': {'dateTime': '2018-08-21T14:00:00', 'timeZone': 'America/New_York'}, 'end': {'dateTime': '2018-08-21T15:00:00', 'timeZone': 'America/New_York'}}

http://mailman.mit.edu/mailman/private/mitml/2018-June/000477.html
{'summary': 'Fwd: TALK: Tuesday 06-05-2018 Noisy Natural Gradient as Variational Inference', 'description': 'http://mailman.mit.edu/mailman/private/mitml/2018-June/000477.html', 'location': '32-G882', 'start': {'dateTime': '2018-06-05T14:00:00', 'timeZone': 'America/New_York'}, 'end': {'dateTime': '2018-06-05T15:00:00', 'timeZone': 'America/New_York'}}

http://mailman.mit.edu/mailman/private/mitml/2018-June/000476.html
{'summary': 'TALK: Tuesday 06-05-2018 Noisy Natural Gradient a

# Similarity metrics

In [5]:
from difflib import SequenceMatcher
from itertools import combinations


In [34]:
# NOTE(review): the `In [6]` cell below defines the same function; one of the
# two cells can be dropped.
def similar(a,b):
    """Return the difflib similarity ratio of two sequences.

    Parameters
    ----------
    a, b : sequences of hashable items (strings in this notebook)

    Returns
    -------
    float
        ``SequenceMatcher(None, a, b).ratio()``: 1.0 for identical
        sequences, 0.0 when nothing matches.
    """
    return SequenceMatcher(None, a, b).ratio()

In [6]:
def similar(a, b):
    """Score how alike two sequences are using difflib.

    Builds a ``SequenceMatcher`` with no junk function over ``a`` and ``b``
    and returns its ``ratio()``: 1.0 for identical sequences, 0.0 when
    nothing matches.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [32]:
# Gather the talk listings among the first 50 indices.
recent_listings = []

for index in indices[:50]:
    l = Listing(list_id, index)
    # Only talk announcements are kept for the similarity comparison.
    if not l.is_talk:
        continue
    recent_listings.append(l)

In [35]:
# Compare every unordered pair of recent talk listings by message similarity.
for a,b in combinations(recent_listings, 2):
    # Compute the ratio once per pair and reuse it for both the printout and
    # the threshold test; SequenceMatcher is costly on long message bodies.
    score = similar(a.message, b.message)
    print(score)
    if score > 0.2:
        # Show the URLs of the two listings whose messages look alike.
        print(a.url)
        print(b.url)

0.02384937238493724
0.023299161230195712
0.023299161230195712
0.01383185649448887
0.01236603462489695
0.03976311336717428
0.01383185649448887
0.03465445130452101
0.019664732430689877
0.01236603462489695
0.01236603462489695
0.03465445130452101
0.021906693711967545
0.016666666666666666
0.033748801534036435
0.0344686089454247
0.01715622563036132
0.01899050474762619
0.016666666666666666
0.025552108048914036
0.016666666666666666
0.01899050474762619
0.02379841343910406
0.03433327553320617
0.04148747786491273
0.011069182389937107
0.02213666987487969
0.037012557832121616
0.012258796821793417
0.01226158038147139
0.7731958762886598
http://mailman.mit.edu/mailman/private/mitml/2018-June/000477.html
http://mailman.mit.edu/mailman/private/mitml/2018-June/000476.html
0.7731958762886598
http://mailman.mit.edu/mailman/private/mitml/2018-June/000477.html
http://mailman.mit.edu/mailman/private/mitml/2018-May/000473.html
0.16260162601626016
0.1401831939466348
0.01347488770926909
0.16260162601626016
0.150

0.16492494774843244
0.10672853828306264
0.15156440471353108
0.7410665200659703
http://mailman.mit.edu/mailman/private/mitml/2018-April/000460.html
http://mailman.mit.edu/mailman/private/mitml/2018-April/000456.html
0.013682564503518374
0.025923208608461725
0.18190386427898209
0.15156440471353108
0.0590805803181262
0.15156440471353108
0.18190386427898209
0.05918727915194346
0.016308870028290897
0.022407628128724672
0.17785155323689827
0.03319690768531151
0.09537753608031793
0.017645792984721326
0.017649591046061126
0.09398988953379517
0.10056963268513062
0.03519374333451831
0.0174077578051088
0.028651949271958667
0.17221844550192614
0.10056963268513062
0.06112054329371817
0.10056963268513062
0.17221844550192614
0.05109644453906749
0.01877630301068307
0.018790100824931256
0.17099863201094392
0.026713378585504707
0.09333333333333334
0.018272425249169437
0.018276220145379024
0.092
0.016982836495031618
0.026954177897574125
0.027357811375089993
0.0962517353077279
0.092
0.08553198827384031
0.

In [17]:
l1 = Listing(list_id, '000446.html')
l2 = Listing(list_id, '000445.html')

In [19]:
similar(l1.message, l2.message)

0.37573385518590996

---

In [24]:
# NOTE: `all_metadata` is built by the `In [22]` cell further down; on a fresh
# kernel that cell has to run before this one.
all_metadata

[{'summary': 'TALK: Thursday 09-17-2015 Extreme Classification: A New Paradigm for Ranking & Recommendation',
  'description': 'http://mailman.mit.edu/mailman/private/mitml/2015-September/000000.html',
  'location': '32-D507',
  'start': {'dateTime': '2015-09-17T14:00:00', 'timeZone': 'America/New_York'},
  'end': {'dateTime': '2015-09-17T15:00:00', 'timeZone': 'America/New_York'},
  'message': 'Just a reminder about this MIT-ML Seminar talk. Notice: this week time is 2:00PM to 3:00PM.\n\nThanks,\nSuvrit.\n–\n\nBegin forwarded message:\n\n> From: calendar at csail.mit.edu\n> Subject: TALK: Thursday 09-17-2015 Extreme Classification: A New Paradigm for Ranking & Recommendation\n> Date: September 16, 2015 at 0:01:22 EDT\n> To: seminars at csail.mit.edu\n> \n> Extreme Classification: A New Paradigm for Ranking & Recommendation\n> \n> Speaker: Manik Varma\n> Speaker Affiliation: MSR Bangalore\n> Host: Suvrit Sra, Stefanie Jegelka\n> Host Affiliation: CSAIL/LIDS\n> \n> Date: Thursday, Septe

In [22]:
def get_listing_id(metadata):
    """Find the id of an already-recorded listing that matches ``metadata``.

    Scans the last 30 entries of the module-level ``all_metadata`` list in
    list order and compares each entry's ``'message'`` with
    ``metadata['message']`` using ``similar``. The first entry whose score
    exceeds the cutoff wins: 0.3 when ``metadata['is_correction']`` is
    truthy, 0.5 otherwise.

    Returns the matching entry's ``'id'``, or -1 when no entry qualifies.
    """
    for earlier in all_metadata[-30:]:
        score = similar(earlier['message'], metadata['message'])
        # Corrections are matched with a lower cutoff than other postings.
        cutoff = 0.3 if metadata['is_correction'] else 0.5
        if score > cutoff:
            return earlier['id']

    # Sentinel: nothing in the recent window was similar enough.
    return -1
    

# Build one metadata record per talk listing, walking the last 200 indices in
# reverse list order, and give records that match an earlier one the same id.
all_metadata = []
counter = 0
for index in indices[-200:][::-1]:

    l = Listing(list_id, index)

    # Non-talk postings are ignored.
    if not l.is_talk:
        continue

    metadata = l.get_parsed_metadata_dense()

    listing_id = get_listing_id(metadata)
    if listing_id == -1:
        # No similar earlier record: allocate the next fresh id.
        counter += 1
        metadata['id'] = counter
    else:
        # Reuse the id of the earlier record this one matches.
        metadata['id'] = listing_id

    # Add to all_metadata
    all_metadata.append(metadata)

In [30]:
# The third path component from the end of a listing URL
# (.../<list>/<month>/<file>.html) is the mailing-list name.
l.url.split('/')[-3]

'mitml'

In [23]:
pd.DataFrame(all_metadata)

Unnamed: 0,description,end,id,is_correction,is_talk,location,message,posted_date,start,summary
0,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-09-17T15:00:00', 'timeZone'...",1,False,True,32-D507,Just a reminder about this MIT-ML Seminar talk...,2015-09-16,"{'dateTime': '2015-09-17T14:00:00', 'timeZone'...",TALK: Thursday 09-17-2015 Extreme Classificati...
1,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-10-22T17:00:00', 'timeZone'...",2,False,True,,Begin forwarded message:\n\n> From: calendar a...,2015-10-21,"{'dateTime': '2015-10-22T16:00:00', 'timeZone'...",Fwd: TALK: Thursday 10-22-2015 Brian Kulis: Sm...
2,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-12T17:00:00', 'timeZone'...",3,False,True,32-D507,COEVOLVE: A Joint Point Process Model for Info...,2015-11-05,"{'dateTime': '2015-11-12T16:00:00', 'timeZone'...",TALK: Thursday 11-12-2015 COEVOLVE: A Joint Po...
3,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-10T14:00:00', 'timeZone'...",4,False,True,32-G449,Title: Speech Production Features for Deep Neu...,2015-11-09,"{'dateTime': '2015-11-10T13:00:00', 'timeZone'...",Tuesday Nov. 10 - Seminar: Leonardo Badino - S...
4,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-17T12:00:00', 'timeZone'...",5,False,True,,"Dear all,\nProf. Risi Kondor from University o...",2015-11-10,"{'dateTime': '2015-11-17T11:00:00', 'timeZone'...","Risi Kondor: MIT-ML Seminar (Tuesday, November..."
5,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-12T16:00:00', 'timeZone'...",3,False,True,32-D507,ML seminar: COEVOLVE: A Joint Point Process Mo...,2015-11-12,"{'dateTime': '2015-11-12T15:00:00', 'timeZone'...",TALK: Thursday 11-12-2015 COEVOLVE: A Joint Po...
6,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-12T16:00:00', 'timeZone'...",3,False,True,32-D507,Today! this talk will be of interest to those ...,2015-11-12,"{'dateTime': '2015-11-12T15:00:00', 'timeZone'...",Fwd: TALK: Thursday 11-12-2015 COEVOLVE: A Joi...
7,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-19T17:00:00', 'timeZone'...",6,False,True,32-D463,"Thursday at the MIT-MSR ML seminar, Rebecca St...",2015-11-15,"{'dateTime': '2015-11-19T16:00:00', 'timeZone'...",Rebecca Steorts Thursday 4pm at MIT-MSR ML sem...
8,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-17T12:00:00', 'timeZone'...",5,False,True,,Reminder - this talk will be happening tomorro...,2015-11-16,"{'dateTime': '2015-11-17T11:00:00', 'timeZone'...","Risi Kondor: MIT-ML Seminar (Tuesday, November..."
9,http://mailman.mit.edu/mailman/private/mitml/2...,"{'dateTime': '2015-11-17T12:00:00', 'timeZone'...",5,False,True,,"For those who are interested, Risi will be pre...",2015-11-17,"{'dateTime': '2015-11-17T11:00:00', 'timeZone'...","Risi Kondor: MIT-ML Seminar (Tuesday, November..."
