In [12]:
import os

import requests
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup as sp
import nltk

import matplotlib.pyplot as plt 
import plotly 
import plotly.plotly as py
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.tools as tls
import cufflinks as cf
from IPython.display import HTML

In [13]:
#Plotly credentials are read from the environment so the API key does not have
#to be stored in the notebook; the original literals remain as fallbacks when
#PLOTLY_USERNAME / PLOTLY_API_KEY are not set
py.sign_in(os.environ.get("PLOTLY_USERNAME", "kasstohr"),
           os.environ.get("PLOTLY_API_KEY", "TOKEN"))

In [14]:
#Data source for sample City Council transcript: https://apps.tampagov.net/cttv_cc_webapp/Agenda.aspx?pkey=1981
url = 'https://apps.tampagov.net/cttv_cc_webapp/Agenda.aspx?pkey=1981'
#a timeout keeps the cell from hanging indefinitely on an unresponsive server;
#raise_for_status() stops here on an HTTP error instead of parsing an error page
r = requests.get(url, timeout=30)
r.raise_for_status()

#get text content
soup = sp(r.content, 'html.parser')
transcript_label = soup.find(id="MainContent_Label1")
if transcript_label is None:
    #find() returns None when no element matches; fail with a readable message
    raise ValueError('element "MainContent_Label1" not found at ' + url)
main_content = transcript_label.text
#tokenize content
tokens = nltk.word_tokenize(main_content)

### Frequency counts of relevant words

To help identify the most relevant topics of a given city council meeting, we can use NLP to pick out words or phrases (in this case just words) that might provide insight into what was discussed at the meeting.

In [15]:
#frequency distribution over every token in the transcript
fdist1 = nltk.FreqDist(tokens)
V = set(tokens)

#"relevant" words: longer than 7 characters and occurring more than 7 times
relevant_words = sorted(word for word in V if len(word) > 7 and fdist1[word] > 7)

#pair each relevant word with its count
rel_freq = [[word, fdist1[word]] for word in relevant_words]

#build the table, least frequent word first
rel_freq_table = pd.DataFrame(
    rel_freq, columns=['word', 'frequency']
).sort_values(by='frequency', ascending=True)

#horizontal bar chart of the table
rel_freq_table.iplot(
    x='frequency',
    y='word',
    kind='bar',
    orientation='h',
    title='Frequency of Relevant Words <br> Sample City Council Meeting',
    filename='dsi/pc_freq_plot',
)

## Identify speakers and how often they speak 

Many transcripts offer clues as to who the speakers at a meeting are, whether they are council members or people attending the meeting to provide comment. In this case we analyse only the all-caps references to identify when and how frequently speakers spoke at the meeting (frequency distribution in 10-minute intervals). 

In [16]:
#split content by line to get an approximate timeseries
#(the transcript text is split on carriage returns, then each line is tokenized)
by_line = main_content.split('\r')
#the loop variable is deliberately not called `tokens`: that name holds the
#whole-transcript token list used by the word-frequency cell, and rebinding it
#here would make a re-run of that cell count only the last transcript line
tokens_by_line = [nltk.word_tokenize(line_text) for line_text in by_line]

#for every transcript line, keep the distinct all-caps tokens longer than
#3 characters, sorted alphabetically (an empty list when a line has none)
cap_names = [
    sorted(word for word in set(line_tokens) if len(word) > 3 and word.isupper())
    for line_tokens in tokens_by_line
]
     
#clean up misidentified names
#one row per transcript line; 'first'/'last'/'q' hold the 1st/2nd/3rd all-caps
#word found on that line (missing where the line had fewer)
#NOTE(review): three column labels assume no line yields more than three
#all-caps words - confirm against the transcript
cap_names = pd.DataFrame(cap_names, columns = ['first', 'last', 'q'])
#manual correction for row 4650 of this particular transcript (pkey=1981);
#presumably one of the rows with a non-null 'q' - verify the position if the
#source page or the line splitting ever changes
cap_names.iloc[4650,0] = 'LISA'
cap_names.iloc[4650,1] = 'MONTELIONE'
cap_names = cap_names.drop('q', axis = 1)
#.copy() gives an independent frame, so columns added later are not written
#into a filtered slice (avoids pandas' SettingWithCopyWarning)
cap_names = cap_names[cap_names['first'] != 'DISCLAIMER'].copy()

#Thursday, October 20, 2016
#9:00 a.m. session
#Add timeseries data (at approx 3s per line)
periods = len(cap_names)
#the step is given as a Timedelta rather than the '3S' string alias, which
#recent pandas releases deprecate
rng = pd.date_range('10/20/2016 09:00:00', periods=periods, freq=pd.Timedelta(seconds=3))
#assign() returns a new frame, so the column is not written into a filtered
#slice; the timestamps are matched to rows by position
cap_names = cap_names.assign(time_elapsed=rng)

#build a title-cased "First Last" speaker label from the two name columns
cap_names['Speaker'] = (
    cap_names['first']
    .str.cat(cap_names['last'], sep=' ')
    .str.title()
)
#drop the raw name parts and index the rows by their timestamp
cap_names = (
    cap_names
    .drop(['first', 'last'], axis=1)
    .set_index('time_elapsed')
)

#group by ten minute intervals, speaker name
#pd.Grouper(freq=...) replaces pd.TimeGrouper, which has been removed from
#pandas; with no key given it bins on the (datetime) index
#NOTE(review): 'Speaker' appears to be the only column at this point, so
#count() likely yields no count column and the plot shows one marker per
#(interval, speaker) pair - confirm this is the intended output
ts_count = cap_names.groupby([pd.Grouper(freq='10min'), 'Speaker']).count().reset_index()
ts_count.iplot(y = 'Speaker',
               x = 'time_elapsed',
               kind = 'scatter',
               mode = 'markers',
               title = 'Frequency Distribution of Speakers <br> Sample City Council Meeting',
               filename = 'dsi/speaker_freq_dist_plot')