In [1]:
from pri_data import bigram_weights, political_bigrams, risk_synonyms

In [2]:
# Get data from database for one file

import pandas as pd

import sqlalchemy as sa
import psycopg2 as pg
from pandas.io.sql import read_sql

from sqlalchemy import create_engine
engine = create_engine('postgresql://iangow.me/crsp')

file_name = "2663156_T"

sql = """
    SELECT *
    FROM streetevents.speaker_data
    WHERE file_name = '%s'
    """ % file_name

df = pd.read_sql(sa.text(sql), engine)

In [3]:
# A quick look at the data
df

Unnamed: 0,file_name,last_update,speaker_name,employer,role,speaker_number,context,speaker_text,language
0,2663156_T,2010-02-09 20:57:48,John Charman,AXIS Capital Holdings Limited,CEO & President,5,pres,"Thank you, David. I will begin my market comme...",en
1,2663156_T,2010-02-09 20:57:48,David Greenfield,AXIS Capital Holdings Limited,CFO,4,pres,"Thank you, John, and good morning, everyone. A...",en
2,2663156_T,2010-02-09 20:57:48,Operator,,,1,pres,Good morning and welcome to the Axis Capital H...,en
3,2663156_T,2010-02-09 20:57:48,David Greenfield,AXIS Capital Holdings Limited,CFO,23,qa,"Yes, it covers all of the lines in the casualt...",en
4,2663156_T,2010-02-09 20:57:48,Matthew Heimermann,JPMorgan,Analyst,20,qa,"Good morning, everybody. A couple questions. F...",en
5,2663156_T,2010-02-09 20:57:48,Sam Hoffman,Lincoln Square,Analyst,52,qa,"Okay, thank you.",en
6,2663156_T,2010-02-09 20:57:48,John Charman,AXIS Capital Holdings Limited,CEO & President,3,pres,"Thank you, Linda. Good morning, everyone, and ...",en
7,2663156_T,2010-02-09 20:57:48,Linda Ventresca,AXIS Capital Holdings Limited,EVP & Corporate Development Officer,2,pres,"Thanks, Andrea. Good morning, ladies and gentl...",en
8,2663156_T,2010-02-09 20:57:48,David Greenfield,AXIS Capital Holdings Limited,CFO,12,qa,"Sure, Vinay. I don't have the earned premium n...",en
9,2663156_T,2010-02-09 20:57:48,David Greenfield,AXIS Capital Holdings Limited,CFO,14,qa,Credit and bond the combined ratio moved a cou...,en


In [4]:
# I will use the first (long) passage for illustration
first_speech = df['speaker_text'][0]

In [5]:
from nltk import bigrams, word_tokenize, sent_tokenize

In [6]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def get_words(text):
    sents = sent_tokenize(text)
    words = flatten([word_tokenize(sent) for sent in sents])
    return words

def get_bigrams(text):
    return list(bigrams(get_words(text)))

In [7]:
def find_bigrams(the_text):
    bigrams_list = get_bigrams(the_text)
    indexes = [(x, i) 
               for (i,x) in enumerate(bigrams_list) 
                   if x in political_bigrams]
    return indexes

In [8]:
bigram_matches = find_bigrams(first_speech)
print(bigram_matches)

[(('global', 'financial'), 261), (('our', 'most'), 516), (('for', 'regional'), 525), (('global', 'financial'), 1398), (('and', 'political'), 1563), (('global', 'financial'), 1624), (('was', 'unique'), 1673), (('and', 'political'), 1685), (('global', 'financial'), 1799), (('nevertheless', 'be'), 1886)]


In [9]:
def find_synonyms(the_text):
    bigrams_list = get_words(the_text)
    indexes = [(x, i) for (i,x) in enumerate(bigrams_list) if x in risk_synonyms]
    return indexes

In [10]:
synonym_matches = find_synonyms(first_speech)
print(synonym_matches)

[('risk', 529), ('exposed', 846), ('risks', 1159), ('risk', 1565), ('risk', 1672), ('risk', 1687), ('risks', 1881)]


In [11]:
def find_matches(the_text):
    bgs = find_bigrams(the_text)
    syns = find_synonyms(the_text)
    matches = [{'synonym': s, 'bigram': b} for s in syns for b in bgs if abs(s[1]-b[1]) < 10]
    
    words = get_words(the_text)
    
    for match in matches:
        positions = (match['synonym'][1], match['bigram'][1])
        first = max(min(positions) - 5, 0)
        last = min(max(positions) + 5, len(words) - 1)
        match['bigram_score'] = bigram_weights[match['bigram'][0]]
        match['text'] =  ' '.join(words[first:last])
    return matches

In [12]:
find_matches(first_speech)

[{'bigram': (('for', 'regional'), 525),
  'bigram_score': 0.92,
  'synonym': ('risk', 529),
  'text': 'lines of business . Pricing for regional property per risk and excessive loss accounts'},
 {'bigram': (('and', 'political'), 1563),
  'bigram_score': 21.42,
  'synonym': ('risk', 1565),
  'text': 'has normalized in our credit and political risk business . We look'},
 {'bigram': (('was', 'unique'), 1673),
  'bigram_score': 0.25,
  'synonym': ('risk', 1672),
  'text': '. The peak Blue City risk was unique and exceptional in'},
 {'bigram': (('and', 'political'), 1685),
  'bigram_score': 21.42,
  'synonym': ('risk', 1687),
  'text': 'The balance of our credit and political risk portfolio is well diversified'},
 {'bigram': (('nevertheless', 'be'), 1886),
  'bigram_score': 0.25,
  'synonym': ('risks', 1881),
  'text': ', there are always unforeseen risks for which we must nevertheless be prepared . Over'}]

In [13]:
def score_bigrams(matches):
    if matches:
        return sum(b['bigram_score'] for b in matches)
    else:
        return 0
                    
def count_matches(matches):
    if matches:
        return len(matches)
    else:
        return 0
    
def count_bigrams(the_text):
    if the_text:
        return len(get_bigrams(the_text))
    else:
        return 0

df['matches'] = df['speaker_text'].map(find_matches) 
df['bigram_matches'] = df['matches'].map(count_matches)
df['bigram_scores'] = df['matches'].map(score_bigrams)
df['bigram_counts'] = df['speaker_text'].map(count_bigrams)

# Multiply the sum by 100,000 and divide the result by the average bigram score 
# of the training library (0.38533). This is the numerator of the transcript 
# score. Divide by the length of the transcript and standardize (divide) 
# the result by the sample standard deviation (236.42474). 
numer = sum(df['bigram_scores']) * 1e5/0.38533
denom = sum(df['bigram_counts']) * 236.42474
score = numer/denom
print(score)

48.6710943348
