In [1]:
import json
import os

import pandas as pd
from sqlalchemy import create_engine

from ling_features import tone_count

# Connection URL for the PostgreSQL database that holds the `streetevents`
# and `se_features` schemas queried in this notebook. Setting the
# CRSP_CONN_STRING environment variable overrides the built-in default, so the
# notebook can be pointed at another host without editing this cell.
conn_string = os.environ.get('CRSP_CONN_STRING', 'postgresql://10.101.13.99/crsp')

def expand_json(df, col):
    """Replace column `col` of `df` with one column per field of its values.

    Each value in `df[col]` is round-tripped through `json.dumps` /
    `json.loads` (so it must be JSON-serializable) and then expanded with
    `pd.Series`, which turns a dict into one column per key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to expand.
    col : str
        Name of the column holding the dict-like values.

    Returns
    -------
    pandas.DataFrame
        `df` without `col`, followed by the expanded columns.
    """
    remaining = df.drop(columns=[col])
    normalised = df[col].map(lambda cell: json.loads(json.dumps(cell)))
    expanded = normalised.apply(pd.Series)
    return pd.concat([remaining, expanded], axis=1)

# Open a SQLAlchemy engine on the connection string defined above.
engine = create_engine(conn_string)

# Select every speaker turn of the call '1000000_T', including the raw
# `speaker_text` that the tone counts are computed from.
sql =  """
SELECT file_name, last_update, speaker_number, context, section,
   speaker_text
FROM streetevents.speaker_data
WHERE file_name = '1000000_T'
ORDER BY section, context, speaker_number"""

# Load the query result into a DataFrame.
speaker_data = pd.read_sql(sql, engine)

In [2]:
def _utc_str(ts):
    """Return `ts` converted to UTC and rendered with `str`."""
    return str(ts.astimezone('UTC'))

# Store `last_update` as a UTC string instead of a timestamp object.
speaker_data['last_update'] = speaker_data['last_update'].map(_utc_str)

# Score every speaker turn, then spread the resulting dict of counts over
# one column per tone category.
speaker_data['tone_data'] = speaker_data['speaker_text'].apply(tone_count)
speaker_data = expand_json(speaker_data, 'tone_data')

# The raw text is no longer needed once the counts have been computed.
speaker_data = speaker_data.drop(columns=['speaker_text'])

In [3]:
# Display the tone counts computed with `ling_features.tone_count`.
speaker_data

Unnamed: 0,file_name,last_update,speaker_number,context,section,positive,negative,uncertainty,litigious,modal_strong,modal_weak
0,1000000_T,2005-07-29 19:37:34+00:00,1,pres,1,0,2,1,0,2,1
1,1000000_T,2005-07-29 19:37:34+00:00,2,pres,1,2,4,3,1,6,2
2,1000000_T,2005-07-29 19:37:34+00:00,3,pres,1,26,11,9,1,7,1
3,1000000_T,2005-07-29 19:37:34+00:00,4,pres,1,11,7,8,2,5,4
4,1000000_T,2005-07-29 19:37:34+00:00,5,pres,1,26,3,6,1,8,3
...,...,...,...,...,...,...,...,...,...,...,...
78,1000000_T,2005-07-29 19:37:34+00:00,74,qa,1,0,0,0,0,0,0
79,1000000_T,2005-07-29 19:37:34+00:00,75,qa,1,0,0,0,0,0,0
80,1000000_T,2005-07-29 19:37:34+00:00,76,qa,1,0,1,0,0,0,0
81,1000000_T,2005-07-29 19:37:34+00:00,77,qa,1,0,1,1,0,0,1


In [4]:
# Fetch the tone measures already stored in `se_features.tone_measure` for the
# same call, using the same ORDER BY as the speaker_data query so the two
# frames can be compared row by row.
sql =  """
SELECT *
FROM se_features.tone_measure
WHERE file_name = '1000000_T'
ORDER BY section, context, speaker_number"""

orig_data = pd.read_sql(sql, engine)

In [5]:
# Display the stored tone measures read from `se_features.tone_measure`.
orig_data

Unnamed: 0,file_name,last_update,speaker_number,context,section,positive,negative,uncertainty,litigious,modal_strong,modal_weak
0,1000000_T,2005-07-29 19:37:34+00:00,1,pres,1,0,2,1,0,2,1
1,1000000_T,2005-07-29 19:37:34+00:00,2,pres,1,0,4,3,1,6,2
2,1000000_T,2005-07-29 19:37:34+00:00,3,pres,1,24,10,9,1,6,1
3,1000000_T,2005-07-29 19:37:34+00:00,4,pres,1,9,7,7,2,5,3
4,1000000_T,2005-07-29 19:37:34+00:00,5,pres,1,26,3,5,1,8,2
...,...,...,...,...,...,...,...,...,...,...,...
78,1000000_T,2005-07-29 19:37:34+00:00,74,qa,1,0,0,0,0,0,0
79,1000000_T,2005-07-29 19:37:34+00:00,75,qa,1,0,0,0,0,0,0
80,1000000_T,2005-07-29 19:37:34+00:00,76,qa,1,0,1,0,0,0,0
81,1000000_T,2005-07-29 19:37:34+00:00,77,qa,1,0,1,1,0,0,1


In [6]:
# Element-wise comparison of the newly computed frame with the stored one.
#
# Please ignore the `False` values in the `last_update` column: the whole
# column is `False` because `last_update` is a datetime in `orig_data` but a
# str in the new `speaker_data`. The two should be the same once the new data
# is uploaded to the server with the dtype specification.
#
# Focus instead on the `False` values in the se_features columns such as `positive`.
speaker_data == orig_data

Unnamed: 0,file_name,last_update,speaker_number,context,section,positive,negative,uncertainty,litigious,modal_strong,modal_weak
0,True,False,True,True,True,True,True,True,True,True,True
1,True,False,True,True,True,False,True,True,True,True,True
2,True,False,True,True,True,False,False,True,True,False,True
3,True,False,True,True,True,False,True,False,True,True,False
4,True,False,True,True,True,True,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...
78,True,False,True,True,True,True,True,True,True,True,True
79,True,False,True,True,True,True,True,True,True,True,True
80,True,False,True,True,True,True,True,True,True,True,True
81,True,False,True,True,True,True,True,True,True,True,True


In [7]:
# Now focus on the differences for `speaker_number=2` and `context=pres`:
# fetch the raw text of that speaker turn.
sql =  """
SELECT speaker_text
FROM streetevents.speaker_data
WHERE file_name = '1000000_T' AND speaker_number = 2 AND context = 'pres'
ORDER BY section, context, speaker_number"""
some_call_text = pd.read_sql(sql, engine)

# Take the row labelled 0 of the result (presumably the only row returned — TODO confirm).
test_text = some_call_text['speaker_text'][0]

In [8]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Split the speaker turn into sentences so each one can be scored separately.
sents = sent_tokenize(test_text)
sents

['Good morning, everyone, and thank you for joining us for our second quarter earnings conference call.',
 "Participating in today's call will be Terry Growcock, our Chairman and Chief Executive Officer; Carl Laurino, Senior Vice President and Chief Financial Officer; and Tim Kraus, President of Manitowoc Foodservice Group.",
 'Glen Tellock, President of Manitowoc Crane Group and Bob Herre, President of Manitowoc Marine Group are also on the line to participate in our question-and-answer session.',
 'Carl will open the call with an overview of our financial results for the quarter, including a brief report on each operating segment.',
 'Tim will follow with an update of our with Foodservice operation and Terry will conclude with a strategic commentary.',
 'Following these remarks we will open the call for your questions.',
 'For any of you who are not able to stay on the line for the entire conference call, you can hear a replay beginning at 1 PM Eastern time today until 1 AM Eastern t

In [9]:
# Narrow the differences down to the sentence level:
# score the first sentence with the new implementation (`ling_features`).
from ling_features import tone_count
tone_count(sents[0])

{'litigious': 0,
 'modal_strong': 0,
 'modal_weak': 0,
 'negative': 0,
 'positive': 1,
 'uncertainty': 0}

In [10]:
# Score the same sentence with the original implementation for comparison.
# The function is imported under an alias so that the global `tone_count`
# (the `ling_features` version used to build `speaker_data`) is not rebound
# to the original function; otherwise re-running the earlier cell would
# silently compute the old values.
from tone_measure_functions import tone_count as tone_count_orig
tone_count_orig(sents[0])

{'litigious': 0,
 'modal_strong': 0,
 'modal_weak': 0,
 'negative': 0,
 'positive': 0,
 'uncertainty': 0}

In [11]:
# Now that we know the two functions generate different results for the first sentence,
# narrow the differences down to the word level by tokenizing that sentence.
words = word_tokenize(sents[0])

In [12]:
# Score the first token with the new implementation (`ling_features`).
from ling_features import tone_count
tone_count(words[0])

{'litigious': 0,
 'modal_strong': 0,
 'modal_weak': 0,
 'negative': 0,
 'positive': 1,
 'uncertainty': 0}

In [13]:
# Score the same token with the original implementation for comparison.
# The function is imported under an alias so that the global `tone_count`
# (the `ling_features` version used to build `speaker_data`) is not rebound
# to the original function; otherwise re-running the earlier cell would
# silently compute the old values.
from tone_measure_functions import tone_count as tone_count_orig
tone_count_orig(words[0])

{'litigious': 0,
 'modal_strong': 0,
 'modal_weak': 0,
 'negative': 0,
 'positive': 0,
 'uncertainty': 0}

In [19]:
# Regex_dict in original tone_measure functions 
import re
import pandas as pd
import json

def make_regex_orig(words):
    """Compile a regex matching any entry of a comma-separated word string.

    `words` is lower-cased and split on "," only. Nothing is stripped, so any
    whitespace next to a comma stays part of the alternative: "a, b" yields
    the alternatives "a" and " b". Entries are not regex-escaped.

    Parameters
    ----------
    words : str
        Comma-separated list of words.

    Returns
    -------
    re.Pattern
        The alternatives joined with "|" in a non-capturing group that is
        wrapped in word-boundary anchors.
    """
    alternatives = '|'.join(words.lower().split(","))
    return re.compile('\\b(?:' + alternatives + ')\\b')

# Build one compiled regex per category from the word-list CSV. For each
# category, the `words` value of the first matching row is handed to
# `make_regex_orig` (presumably one comma-separated string per category).
df = pd.read_csv("lm_words.csv")
categories = list(df['category'])
regex_dict = {
    cat: make_regex_orig(df.loc[df['category'] == cat, 'words'].iloc[0])
    for cat in categories
}

regex_dict

{'litigious': re.compile(r'\b(?:abrogate| abrogated| abrogates| abrogating| abrogation| abrogations| absolve| absolved| absolves| absolving| accession| accessions| acquirees| acquirors| acquit| acquits| acquittal| acquittals| acquittance| acquittances| acquitted| acquitting| addendums| adjourn| adjourned| adjourning| adjournment| adjournments| adjourns| adjudge| adjudged| adjudges| adjudging| adjudicate| adjudicated| adjudicates| adjudicating| adjudication| adjudications| adjudicative| adjudicator| adjudicators| adjudicatory| admissibility| admissible| admissibly| admission| admissions| affidavit| affidavits| affirmance| affreightment| aforedescribed| aforementioned| aforesaid| aforestated| aggrieved| allegation| allegations| allege| alleged| allegedly| alleges| alleging| amend| amendable| amendatory| amended| amending| amendment| amendments| amends| antecedent| antecedents| antitrust| anywise| appeal| appealable| appealed| appealing| appeals| appellant| appellants| appellate| appellee

In [20]:
# Regex_dict in tone functions in the new_package

import ling_features
from ling_features import tone_count
from tone import word_lists

from tone.word_lists import word_lists
# Category names are the keys of the packaged word lists.
categories = list(word_lists.keys())

def make_regex(word_list):
    """Compile a regex matching any of the given words as a whole word.

    Parameters
    ----------
    word_list : iterable of str
        Words to match; they are used as-is (not lower-cased, stripped or
        regex-escaped).

    Returns
    -------
    re.Pattern
        The words joined with "|" in a non-capturing group that is wrapped
        in word-boundary anchors.
    """
    alternatives = '|'.join(word_list)
    return re.compile('\\b(?:' + alternatives + ')\\b')
    
# One compiled regex per category, built from the packaged word lists.
regex_dict = {
    cat: make_regex(word_lists[cat])
    for cat in categories
}

regex_dict

{'litigious': re.compile(r'\b(?:abovementioned|abrogate|abrogated|abrogates|abrogating|abrogation|abrogations|absolve|absolved|absolves|absolving|accession|accessions|acquirees|acquirors|acquit|acquits|acquittal|acquittals|acquittance|acquittances|acquitted|acquitting|addendums|adjourn|adjourned|adjourning|adjournment|adjournments|adjourns|adjudge|adjudged|adjudges|adjudging|adjudicate|adjudicated|adjudicates|adjudicating|adjudication|adjudications|adjudicative|adjudicator|adjudicators|adjudicatory|admissibility|admissible|admissibly|admission|admissions|affidavit|affidavits|affirmance|affreightment|aforedescribed|aforementioned|aforesaid|aforestated|aggrieved|allegation|allegations|allege|alleged|allegedly|alleges|alleging|amend|amendable|amendatory|amended|amending|amendment|amendments|amends|antecedent|antecedents|antitrust|anywise|appeal|appealable|appealed|appealing|appeals|appellant|appellants|appellate|appellees|appointor|appurtenance|appurtenances|appurtenant|arbitrability|arbi