In [8]:
import json
import os

import nltk
import pandas as pd
from lxml import html

In [9]:
# Load the cleaned per-letter metadata; the preview shows that `location`
# holds the relative path of each letter's HTML file.
df = pd.read_csv("letters_data_cleaned.csv")
df.head()

Unnamed: 0,addressee,date,Month,DayOfMonth,Year,location
0,SPRING GROVE SCHOOL,NOVEMBER 12 1863,NOVEMBER,12.0,1863.0,letters/wikisource_vol1_ch1_letter1.html
1,CHARLES BAXTER,APRIL 9 1872,APRIL,9.0,1872.0,letters/wikisource_vol1_ch1_letter10.html
2,MRS. THOMAS STEVENSON,JULY 25 1872,JULY,25.0,1872.0,letters/wikisource_vol1_ch1_letter11.html
3,MRS. THOMAS STEVENSON,JULY 29 1872,JULY,29.0,1872.0,letters/wikisource_vol1_ch1_letter12.html
4,MRS. THOMAS STEVENSON,AUGUST 2 1872,AUGUST,2.0,1872.0,letters/wikisource_vol1_ch1_letter13.html


In [10]:
# Read one saved sentiment response so its structure can be inspected below.
sample_sentiment_path = 'nlp/google-sentiment/wikisource_vol1_ch1_letter1.json'
with open(sample_sentiment_path, 'r') as sentiment_file:
    sentiment = json.load(sentiment_file)

In [11]:
sentiment['document_sentiment']

{'magnitude': 3.799999952316284}

In [12]:
def aggregate_sentiment(sentiment):
    """Sum the sentence-level sentiment scores and magnitudes of one document.

    Parameters
    ----------
    sentiment : dict
        Parsed sentiment JSON. Its 'sentences' entry, when present, is a list
        of dicts whose 'sentiment' entry may hold 'score' and/or 'magnitude'.

    Returns
    -------
    tuple
        ``(score, magnitude)`` totals over all sentences; ``(0, 0)`` when the
        document has no sentences.
    """
    score = 0
    magnitude = 0
    for sentence in sentiment.get('sentences', []):
        # The saved JSON can leave fields out (the document_sentiment of
        # letter1 carries a 'magnitude' but no 'score'), so a missing key is
        # counted as 0 instead of raising KeyError.
        sentence_sentiment = sentence.get('sentiment') or {}
        score += sentence_sentiment.get('score', 0)
        magnitude += sentence_sentiment.get('magnitude', 0)
    return score, magnitude

In [13]:
aggregate_sentiment(sentiment)

(-1.2000000327825546, 3.4000000208616257)

In [14]:
def get_saved_sentiment(location):
    """Return the aggregated saved sentiment for the letter at ``location``.

    The matching JSON file is found by naming convention: the base name of
    ``location`` with its extension replaced by ``.json``, looked up inside
    the ``nlp/google-sentiment`` directory.

    Parameters
    ----------
    location : str
        Path of a letter file, e.g. ``letters/wikisource_vol1_ch1_letter1.html``.

    Returns
    -------
    The value returned by ``aggregate_sentiment`` for the loaded JSON.

    Raises
    ------
    OSError
        If the matching JSON file cannot be opened.
    """
    # Swap only the extension; a plain str.replace('html', 'json') would also
    # rewrite any "html" that happens to appear inside the file name itself.
    stem, _ = os.path.splitext(os.path.basename(location))
    json_location = os.path.join('nlp', 'google-sentiment', stem + '.json')
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(json_location, 'r', encoding='utf-8') as fp:
        sentiment = json.load(fp)
    return aggregate_sentiment(sentiment)

In [15]:
get_saved_sentiment(df.location[2])

(0.9000000283122063, 8.500000186264515)

In [16]:
# Each location yields a (score, magnitude) pair; transpose the pairs into
# two parallel sequences and store them as separate columns.
letter_scores, letter_magnitudes = zip(*df.location.apply(get_saved_sentiment))
df['score'] = letter_scores
df['magnitude'] = letter_magnitudes

In [17]:
df.head()

Unnamed: 0,addressee,date,Month,DayOfMonth,Year,location,score,magnitude
0,SPRING GROVE SCHOOL,NOVEMBER 12 1863,NOVEMBER,12.0,1863.0,letters/wikisource_vol1_ch1_letter1.html,-1.2,3.4
1,CHARLES BAXTER,APRIL 9 1872,APRIL,9.0,1872.0,letters/wikisource_vol1_ch1_letter10.html,-0.9,3.9
2,MRS. THOMAS STEVENSON,JULY 25 1872,JULY,25.0,1872.0,letters/wikisource_vol1_ch1_letter11.html,0.9,8.5
3,MRS. THOMAS STEVENSON,JULY 29 1872,JULY,29.0,1872.0,letters/wikisource_vol1_ch1_letter12.html,0.1,5.1
4,MRS. THOMAS STEVENSON,AUGUST 2 1872,AUGUST,2.0,1872.0,letters/wikisource_vol1_ch1_letter13.html,-2.6,12.0


In [28]:
# Load the letters table; its `filename` column is used below to build the
# `location` join key, and it supplies the `letter_text` column.
df2 = pd.read_csv("all_letters.csv")

In [29]:
# Rebuild the `location` key ("letters/<filename>.html") so df2 can be joined
# to df on it.
df2['location'] = df2.filename.map("letters/{}.html".format)

In [32]:
# Left join: keep every row of df and attach the df2 columns that share its
# `location`.
df = df.merge(df2, how="left", on="location")

In [33]:
df.head()

Unnamed: 0,addressee,date,Month,DayOfMonth,Year,location,score,magnitude,filename,letter_text
0,SPRING GROVE SCHOOL,NOVEMBER 12 1863,NOVEMBER,12.0,1863.0,letters/wikisource_vol1_ch1_letter1.html,-1.2,3.4,wikisource_vol1_ch1_letter1,"<div class=""prose"">\n<p>Letter: SPRING GROVE S..."
1,CHARLES BAXTER,APRIL 9 1872,APRIL,9.0,1872.0,letters/wikisource_vol1_ch1_letter10.html,-0.9,3.9,wikisource_vol1_ch1_letter10,<p>Letter: TO CHARLES BAXTER</p>\n<p><br></p>\...
2,MRS. THOMAS STEVENSON,JULY 25 1872,JULY,25.0,1872.0,letters/wikisource_vol1_ch1_letter11.html,0.9,8.5,wikisource_vol1_ch1_letter11,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...
3,MRS. THOMAS STEVENSON,JULY 29 1872,JULY,29.0,1872.0,letters/wikisource_vol1_ch1_letter12.html,0.1,5.1,wikisource_vol1_ch1_letter12,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...
4,MRS. THOMAS STEVENSON,AUGUST 2 1872,AUGUST,2.0,1872.0,letters/wikisource_vol1_ch1_letter13.html,-2.6,12.0,wikisource_vol1_ch1_letter13,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...


In [38]:
# Parse the first letter's HTML and keep only its text content.
first_letter_html = df.letter_text[0]
plaintext = html.fromstring(first_letter_html).text_content()

In [40]:
len(nltk.tokenize.word_tokenize(plaintext))

189

In [45]:
def get_word_length(text):
    """Return how many tokens ``nltk.tokenize.word_tokenize`` finds in ``text``.

    ``text`` is an HTML string; it is parsed first and only its text content
    is tokenized.
    """
    document = html.fromstring(text)
    tokens = nltk.tokenize.word_tokenize(document.text_content())
    return len(tokens)

In [42]:
def get_character_length(text):
    """Return the number of characters in the text content of an HTML string.

    ``text`` is parsed as HTML and the length of its extracted text content
    (markup excluded) is returned.
    """
    document = html.fromstring(text)
    return len(document.text_content())

In [46]:
# Measure every letter twice: token count and character count of its text.
letter_html = df.letter_text
df['word_length'] = letter_html.map(get_word_length)
df['character_length'] = letter_html.map(get_character_length)

In [47]:
df.head()

Unnamed: 0,addressee,date,Month,DayOfMonth,Year,location,score,magnitude,filename,letter_text,word_length,character_length
0,SPRING GROVE SCHOOL,NOVEMBER 12 1863,NOVEMBER,12.0,1863.0,letters/wikisource_vol1_ch1_letter1.html,-1.2,3.4,wikisource_vol1_ch1_letter1,"<div class=""prose"">\n<p>Letter: SPRING GROVE S...",189,893
1,CHARLES BAXTER,APRIL 9 1872,APRIL,9.0,1872.0,letters/wikisource_vol1_ch1_letter10.html,-0.9,3.9,wikisource_vol1_ch1_letter10,<p>Letter: TO CHARLES BAXTER</p>\n<p><br></p>\...,445,2050
2,MRS. THOMAS STEVENSON,JULY 25 1872,JULY,25.0,1872.0,letters/wikisource_vol1_ch1_letter11.html,0.9,8.5,wikisource_vol1_ch1_letter11,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,730,3379
3,MRS. THOMAS STEVENSON,JULY 29 1872,JULY,29.0,1872.0,letters/wikisource_vol1_ch1_letter12.html,0.1,5.1,wikisource_vol1_ch1_letter12,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,430,2056
4,MRS. THOMAS STEVENSON,AUGUST 2 1872,AUGUST,2.0,1872.0,letters/wikisource_vol1_ch1_letter13.html,-2.6,12.0,wikisource_vol1_ch1_letter13,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,1064,5012


In [48]:
# Characters per token: total character count divided by the token count.
df['avg_word_length'] = df.character_length.div(df.word_length)

In [49]:
df.head()

Unnamed: 0,addressee,date,Month,DayOfMonth,Year,location,score,magnitude,filename,letter_text,word_length,character_length,avg_word_length
0,SPRING GROVE SCHOOL,NOVEMBER 12 1863,NOVEMBER,12.0,1863.0,letters/wikisource_vol1_ch1_letter1.html,-1.2,3.4,wikisource_vol1_ch1_letter1,"<div class=""prose"">\n<p>Letter: SPRING GROVE S...",189,893,4.724868
1,CHARLES BAXTER,APRIL 9 1872,APRIL,9.0,1872.0,letters/wikisource_vol1_ch1_letter10.html,-0.9,3.9,wikisource_vol1_ch1_letter10,<p>Letter: TO CHARLES BAXTER</p>\n<p><br></p>\...,445,2050,4.606742
2,MRS. THOMAS STEVENSON,JULY 25 1872,JULY,25.0,1872.0,letters/wikisource_vol1_ch1_letter11.html,0.9,8.5,wikisource_vol1_ch1_letter11,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,730,3379,4.628767
3,MRS. THOMAS STEVENSON,JULY 29 1872,JULY,29.0,1872.0,letters/wikisource_vol1_ch1_letter12.html,0.1,5.1,wikisource_vol1_ch1_letter12,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,430,2056,4.781395
4,MRS. THOMAS STEVENSON,AUGUST 2 1872,AUGUST,2.0,1872.0,letters/wikisource_vol1_ch1_letter13.html,-2.6,12.0,wikisource_vol1_ch1_letter13,<p>Letter: TO MRS. THOMAS STEVENSON</p>\n<p><b...,1064,5012,4.710526


In [50]:
df.columns

Index(['addressee', 'date', 'Month', 'DayOfMonth', 'Year', 'location', 'score',
       'magnitude', 'filename', 'letter_text', 'word_length',
       'character_length', 'avg_word_length'],
      dtype='object')

In [51]:
# Persist the enriched table. `index` is documented as a boolean, so pass
# False explicitly (rather than None) to leave the row index out of the file.
df.to_csv('letters_data_clean_with_sentiment.csv', index=False)