In [3]:
import os
import math
from lxml import html

In [4]:
def get_text(location):
    text_html = html.parse(location)
    test_letter_text = "\n".join([p.text_content() for p in text_html.xpath('//p')])
    return test_letter_text

In [5]:
def get_units(text):
    units = math.ceil(len(text)/1000)
    return units

In [6]:
units = 0
for f in os.listdir('letters'):
    filename = 'letter_txt/'+f.replace('html','txt')
    with open(filename,"w") as fp:
        text = get_text('letters/'+f)
        units += get_units(text)
        fp.write(text)

In [7]:
units

1347

# Google NLP

In [8]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

import json

do_api_calls = False

In [9]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/jguillette/keys/NLP_experiments_owner.json'

In [10]:
# Instatniate a client
client = language.LanguageServiceClient()

In [11]:
with open('letter_txt/wikisource_vol1_ch1_letter1.txt','r') as fp:
    text = fp.read()

In [12]:
document = types.Document(
    content=text,
    type=enums.Document.Type.PLAIN_TEXT
)

In [13]:
if do_api_calls:
    entity_thing = client.analyze_entities(document=document,encoding_type='UTF8')

In [14]:
dictable_types = [
    "<class 'google.protobuf.internal.containers.ScalarMap'>"
]
list_unpack_types = [
    "<class 'google.protobuf.internal.containers.RepeatedCompositeFieldContainer'>"
]
unpack_types = [
    "<class 'google.cloud.language_v1.types.TextSpan'>",
    "<class 'google.cloud.language_v1.types.Sentiment'>"
]

In [15]:
def unpack_google_dict(google):
    fields = google.ListFields()
    field_info = {}
    for f in fields:
        name = f[0].name
        value = f[1]
        val_type = str(type(value))
        if val_type in dictable_types:
            value = dict(value)
        elif val_type in list_unpack_types:
            value = [unpack_google_dict(v) for v in value]
        elif val_type in unpack_types:
            value = unpack_google_dict(value)
        else:
            value = value
        field_info[name] = value
    return field_info

In [1]:
[x*2 for x in range(1,21)]

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40]

In [2]:
def recursive_dict_print(d):
    """This is my helper text"""
    for k, v in d.items():
        if isinstance(v, dict):
            recursive_dict_print(v)
        elif isinstance(v, list):
            for x in v:
                recursive_dict_print(x)
        else:
            print("{0: >20} : {1: >40} ({2})".format(k, str(v).replace("\n",""), type(v)))

In [None]:
recursive_dict_print()

# Entity Analysis

In [17]:
def analyze_entities(location):
    filename = location.split('/')[-1].replace('txt','json')
    output_location = 'nlp/google-entities/'+filename
    if os.path.exists(output_location):
        with open(output_location,'r') as fp:
            entities = json.load(fp)
        return entities
    else:
        with open(location,'r') as fp:
            text = fp.read()
        document = types.Document(
            content=text,
            type=enums.Document.Type.PLAIN_TEXT)
        response = client.analyze_entities(document=document,encoding_type='UTF8')
        try:
            entities = [unpack_google_dict(e) for e in response.entities]
            with open(output_location,'w') as fp:
                json.dump(entities,fp)
            return entities
        except:
            return response

In [20]:
response = analyze_entities('letter_txt/wikisource_vol1_ch1_letter10.txt')

In [22]:
type(response)

list

In [23]:
filelist = os.listdir('letter_txt/')
filelist.pop(0)

'Icon\r'

In [24]:
for f in filelist:
    location = os.path.join('letter_txt',f)
    entities = analyze_entities(location)
    if type(entities) == list:
        print("{} done!".format(location))
    else:
        print("problem with {}".format(location))

letter_txt/wikisource_vol1_ch1_letter1.txt done!
letter_txt/wikisource_vol1_ch1_letter10.txt done!
letter_txt/wikisource_vol1_ch1_letter11.txt done!
letter_txt/wikisource_vol1_ch1_letter12.txt done!
letter_txt/wikisource_vol1_ch1_letter13.txt done!
letter_txt/wikisource_vol1_ch1_letter14.txt done!
letter_txt/wikisource_vol1_ch1_letter15.txt done!
letter_txt/wikisource_vol1_ch1_letter16.txt done!
letter_txt/wikisource_vol1_ch1_letter2.txt done!
letter_txt/wikisource_vol1_ch1_letter3.txt done!
letter_txt/wikisource_vol1_ch1_letter4.txt done!
letter_txt/wikisource_vol1_ch1_letter5.txt done!
letter_txt/wikisource_vol1_ch1_letter6.txt done!
letter_txt/wikisource_vol1_ch1_letter7.txt done!
letter_txt/wikisource_vol1_ch1_letter8.txt done!
letter_txt/wikisource_vol1_ch1_letter9.txt done!
letter_txt/wikisource_vol1_ch2_letter1.txt done!
letter_txt/wikisource_vol1_ch2_letter10.txt done!
letter_txt/wikisource_vol1_ch2_letter11.txt done!
letter_txt/wikisource_vol1_ch2_letter12.txt done!
letter_txt

letter_txt/wikisource_vol1_ch7_letter6.txt done!
letter_txt/wikisource_vol1_ch7_letter7.txt done!
letter_txt/wikisource_vol1_ch7_letter8.txt done!
letter_txt/wikisource_vol1_ch7_letter9.txt done!
letter_txt/wikisource_vol2_ch10_letter1.txt done!
letter_txt/wikisource_vol2_ch10_letter10.txt done!
letter_txt/wikisource_vol2_ch10_letter11.txt done!
letter_txt/wikisource_vol2_ch10_letter12.txt done!
letter_txt/wikisource_vol2_ch10_letter13.txt done!
letter_txt/wikisource_vol2_ch10_letter14.txt done!
letter_txt/wikisource_vol2_ch10_letter15.txt done!
letter_txt/wikisource_vol2_ch10_letter16.txt done!
letter_txt/wikisource_vol2_ch10_letter17.txt done!
letter_txt/wikisource_vol2_ch10_letter18.txt done!
letter_txt/wikisource_vol2_ch10_letter19.txt done!
letter_txt/wikisource_vol2_ch10_letter2.txt done!
letter_txt/wikisource_vol2_ch10_letter20.txt done!
letter_txt/wikisource_vol2_ch10_letter21.txt done!
letter_txt/wikisource_vol2_ch10_letter22.txt done!
letter_txt/wikisource_vol2_ch10_letter23.

letter_txt/wikisource_vol2_ch9_letter42.txt done!
letter_txt/wikisource_vol2_ch9_letter43.txt done!
letter_txt/wikisource_vol2_ch9_letter44.txt done!
letter_txt/wikisource_vol2_ch9_letter45.txt done!
letter_txt/wikisource_vol2_ch9_letter46.txt done!
letter_txt/wikisource_vol2_ch9_letter5.txt done!
letter_txt/wikisource_vol2_ch9_letter6.txt done!
letter_txt/wikisource_vol2_ch9_letter7.txt done!
letter_txt/wikisource_vol2_ch9_letter8.txt done!
letter_txt/wikisource_vol2_ch9_letter9.txt done!


# Sentiment Analysis

In [25]:
if do_api_calls:
    sentiment = client.analyze_sentiment(document=document,encoding_type="UTF8")

In [26]:
if do_api_calls:
    json.dumps(unpack_google_dict(sentiment))

In [27]:
def analyze_sentiment(location):
    filename = location.split('/')[-1].replace('txt','json')
    output_location = 'nlp/google-sentiment/'+filename
    if os.path.exists(output_location):
        with open(output_location,'r') as fp:
            sentiment = json.load(fp)
        return sentiment
    else:
        with open(location,'r') as fp:
            text = fp.read()
        document = types.Document(
            content=text,
            type=enums.Document.Type.PLAIN_TEXT)
        response = client.analyze_sentiment(document=document,encoding_type='UTF8')
#         try:
        sentiment = unpack_google_dict(response)
        with open(output_location,'w') as fp:
            json.dump(sentiment,fp)
        return sentiment
#         except:
#             return response

In [28]:
test = analyze_sentiment('letter_txt/wikisource_vol1_ch1_letter1.txt')
type(test)

dict

In [29]:
for f in filelist:
    location = os.path.join('letter_txt',f)
    sentiment = analyze_sentiment(location)
    if type(sentiment) == dict:
        print("{} done!".format(location))
    else:
        print("problem with {}".format(location))

letter_txt/wikisource_vol1_ch1_letter1.txt done!
letter_txt/wikisource_vol1_ch1_letter10.txt done!
letter_txt/wikisource_vol1_ch1_letter11.txt done!
letter_txt/wikisource_vol1_ch1_letter12.txt done!
letter_txt/wikisource_vol1_ch1_letter13.txt done!
letter_txt/wikisource_vol1_ch1_letter14.txt done!
letter_txt/wikisource_vol1_ch1_letter15.txt done!
letter_txt/wikisource_vol1_ch1_letter16.txt done!
letter_txt/wikisource_vol1_ch1_letter2.txt done!
letter_txt/wikisource_vol1_ch1_letter3.txt done!
letter_txt/wikisource_vol1_ch1_letter4.txt done!
letter_txt/wikisource_vol1_ch1_letter5.txt done!
letter_txt/wikisource_vol1_ch1_letter6.txt done!
letter_txt/wikisource_vol1_ch1_letter7.txt done!
letter_txt/wikisource_vol1_ch1_letter8.txt done!
letter_txt/wikisource_vol1_ch1_letter9.txt done!
letter_txt/wikisource_vol1_ch2_letter1.txt done!
letter_txt/wikisource_vol1_ch2_letter10.txt done!
letter_txt/wikisource_vol1_ch2_letter11.txt done!
letter_txt/wikisource_vol1_ch2_letter12.txt done!
letter_txt

letter_txt/wikisource_vol2_ch12_letter20.txt done!
letter_txt/wikisource_vol2_ch12_letter21.txt done!
letter_txt/wikisource_vol2_ch12_letter22.txt done!
letter_txt/wikisource_vol2_ch12_letter23.txt done!
letter_txt/wikisource_vol2_ch12_letter24.txt done!
letter_txt/wikisource_vol2_ch12_letter25.txt done!
letter_txt/wikisource_vol2_ch12_letter26.txt done!
letter_txt/wikisource_vol2_ch12_letter27.txt done!
letter_txt/wikisource_vol2_ch12_letter28.txt done!
letter_txt/wikisource_vol2_ch12_letter29.txt done!
letter_txt/wikisource_vol2_ch12_letter3.txt done!
letter_txt/wikisource_vol2_ch12_letter30.txt done!
letter_txt/wikisource_vol2_ch12_letter31.txt done!
letter_txt/wikisource_vol2_ch12_letter32.txt done!
letter_txt/wikisource_vol2_ch12_letter33.txt done!
letter_txt/wikisource_vol2_ch12_letter34.txt done!
letter_txt/wikisource_vol2_ch12_letter35.txt done!
letter_txt/wikisource_vol2_ch12_letter36.txt done!
letter_txt/wikisource_vol2_ch12_letter37.txt done!
letter_txt/wikisource_vol2_ch12_

# Entity Sentiment Analysis

In [30]:
with open('letter_txt/wikisource_vol1_ch1_letter7.txt','r') as fp:
    text = fp.read()

In [31]:
document = types.Document(
    content=text,
    type=enums.Document.Type.PLAIN_TEXT,
    language='EN')

In [33]:
if do_api_calls:
    ent_sent = client.analyze_entity_sentiment(document=document,encoding_type='UTF8')

In [34]:
if do_api_calls:
    ent_sent

In [35]:
if do_api_calls:
    for e in unpack_google_dict(ent_sent)['entities']:
        if len(e['sentiment']) != 0:
            print(e)
            print()