In [None]:
from pri_data import bigram_weights, political_bigrams, risk_synonyms

In [None]:
# Get data from database for one file

import pandas as pd

import sqlalchemy as sa
import psycopg2 as pg
from pandas.io.sql import read_sql

from sqlalchemy import create_engine
# Engine for the `crsp` PostgreSQL database on iangow.me.
engine = create_engine('postgresql://iangow.me/crsp')

file_name = "2663156_T"

# The file name is passed as a bound parameter (`:file_name`) rather than
# being interpolated into the SQL string, so names containing quotes, `%`
# or `:` cannot break or alter the statement.
sql = """
    SELECT *
    FROM streetevents.speaker_data
    WHERE file_name = :file_name
    """

df = pd.read_sql(sa.text(sql), engine, params={'file_name': file_name})

In [None]:
# A quick look at the data: display the rows loaded for `file_name`
df

In [None]:
# The first (long) passage serves as the running example below
first_speech = df.loc[0, 'speaker_text']

In [None]:
from nltk import bigrams, word_tokenize, sent_tokenize

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list.

    Only one level of nesting is removed, and the original order of the
    items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def get_words(text):
    """Tokenize `text` into a single flat list of word tokens.

    The text is first split into sentences with `sent_tokenize`; each
    sentence is then tokenized with `word_tokenize`, and the per-sentence
    token lists are joined in document order.
    """
    tokens_per_sentence = map(word_tokenize, sent_tokenize(text))
    return flatten(tokens_per_sentence)

def get_bigrams(text):
    """Return the list of bigrams built from the word tokens of `text`.

    Tokens come from `get_words`; the pairs are produced by nltk's `bigrams`.
    """
    tokens = get_words(text)
    pairs = bigrams(tokens)
    return list(pairs)

In [None]:
def find_bigrams(the_text):
    """Locate the political bigrams occurring in `the_text`.

    Returns a list of `(bigram, position)` tuples, where `position` is the
    bigram's index in `get_bigrams(the_text)`. Only bigrams that are members
    of `political_bigrams` are kept.
    """
    hits = []
    for position, pair in enumerate(get_bigrams(the_text)):
        if pair in political_bigrams:
            hits.append((pair, position))
    return hits

In [None]:
# Political bigrams found in the example passage, as (bigram, position) pairs
bigram_matches = find_bigrams(first_speech)
print(bigram_matches)

In [None]:
def find_synonyms(the_text):
    """Locate the risk synonyms occurring in `the_text`.

    Returns a list of `(word, position)` tuples, where `position` is the
    word's index in `get_words(the_text)`. Only words that are members of
    `risk_synonyms` are kept.
    """
    hits = []
    for position, word in enumerate(get_words(the_text)):
        if word in risk_synonyms:
            hits.append((word, position))
    return hits

In [None]:
# Risk synonyms found in the example passage, as (word, position) pairs
synonym_matches = find_synonyms(first_speech)
print(synonym_matches)

In [None]:
def find_matches(the_text, max_distance=10, context=5):
    """Pair each risk synonym with every political bigram found close to it.

    Parameters
    ----------
    the_text : str or None
        Passage to scan. Empty or missing (`None`) text yields no matches.
    max_distance : int, optional
        A synonym and a bigram are paired when the absolute difference of
        their positions is strictly less than this value (default 10).
    context : int, optional
        Number of tokens of context kept on each side of the matched pair
        when building the `'text'` excerpt (default 5).

    Returns
    -------
    list of dict
        One dict per (synonym, bigram) pair with the keys:
        `'synonym'`      -- `(word, position)` from `find_synonyms`,
        `'bigram'`       -- `(bigram, position)` from `find_bigrams`,
        `'bigram_score'` -- the bigram's entry in `bigram_weights`,
        `'text'`         -- the surrounding tokens joined by single spaces.

    Raises
    ------
    KeyError
        If a matched bigram has no entry in `bigram_weights`.
    """
    # Missing text (e.g. a NULL column value) has nothing to match; this
    # mirrors the falsy-text guard in `count_bigrams`.
    if not the_text:
        return []

    bgs = find_bigrams(the_text)
    syns = find_synonyms(the_text)
    matches = [{'synonym': s, 'bigram': b}
               for s in syns
               for b in bgs
               if abs(s[1] - b[1]) < max_distance]

    # Nothing to annotate, so skip tokenizing the text again.
    if not matches:
        return matches

    words = get_words(the_text)

    for match in matches:
        positions = (match['synonym'][1], match['bigram'][1])
        first = max(min(positions) - context, 0)
        # `last` is an exclusive slice bound, so it is clamped to
        # len(words), not len(words) - 1; otherwise the final token of the
        # text would be dropped whenever the window reaches the end.
        last = min(max(positions) + context, len(words))
        match['bigram_score'] = bigram_weights[match['bigram'][0]]
        match['text'] = ' '.join(words[first:last])
    return matches

In [None]:
# Synonym/bigram pairs found close together in the example passage
find_matches(first_speech)

In [None]:
def score_bigrams(matches):
    """Sum the `'bigram_score'` values over `matches`.

    Returns 0 when `matches` is empty or otherwise falsy (e.g. `None`).
    """
    if not matches:
        return 0
    total = 0
    for match in matches:
        total += match['bigram_score']
    return total
                    
def count_matches(matches):
    """Return the number of entries in `matches`, or 0 if it is empty/falsy."""
    return len(matches) if matches else 0
    
def count_bigrams(the_text):
    """Return how many bigrams `get_bigrams` produces for `the_text`.

    Empty or otherwise falsy text (e.g. `None`) counts as 0 bigrams and is
    never passed to the tokenizer.
    """
    if not the_text:
        return 0
    return len(get_bigrams(the_text))

# Per-passage features, one value per row of `df`:
#   matches        -- synonym/bigram pairs returned by find_matches
#   bigram_matches -- number of such pairs
#   bigram_scores  -- sum of the bigram weights of those pairs
#   bigram_counts  -- number of bigrams in the passage
df['matches'] = df['speaker_text'].map(find_matches) 
df['bigram_matches'] = df['matches'].map(count_matches)
df['bigram_scores'] = df['matches'].map(score_bigrams)
df['bigram_counts'] = df['speaker_text'].map(count_bigrams)

# Transcript score.
# Numerator: the sum of the bigram scores, multiplied by 100,000 and divided
# by the average bigram score of the training library (0.38533).
# Denominator: the length of the transcript (measured here as the total
# number of bigrams) multiplied by the sample standard deviation (236.42474),
# so that the final division both normalizes by length and standardizes.
# NOTE(review): `denom` is 0 when no passage contains any bigrams, in which
# case the division below raises ZeroDivisionError.
numer = sum(df['bigram_scores']) * 1e5/0.38533
denom = sum(df['bigram_counts']) * 236.42474
score = numer/denom
print(score)