### Import Libraries

In [9]:
import re  
import nltk 
import matplotlib.pyplot as plt  
import pandas as pd  
from nltk.tokenize import word_tokenize
from nltk.corpus import opinion_lexicon

### Prepare Data

In [10]:
# Load the tab-separated 'LittleWomen.txt' file, discard rows containing
# missing values, and remove the 'gutenberg_id' column.
# `columns=` is used instead of the positional axis (`.drop('gutenberg_id', 1)`):
# passing `axis` positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
df = (
    pd.read_csv('LittleWomen.txt', sep='\t')
    .dropna()
    .drop(columns='gutenberg_id')
)

df.head(10)

Unnamed: 0,text
0,LITTLE WOMEN
3,by
5,Louisa May Alcott
10,CONTENTS
13,PART 1
15,ONE PLAYING PILGRIMS
16,TWO A MERRY CHRISTMAS
17,THREE THE LAURENCE BOY
18,FOUR BURDENS
19,FIVE BEING NEIGHBORLY


In [11]:
def clean_text(text):
    """Normalise one piece of text ahead of tokenisation.

    The input is lower-cased and apostrophes are deleted outright (so
    "don't" becomes "dont" instead of being split in two). Every remaining
    character that is not a word character is turned into a space, runs of
    whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned, single-space-separated text.
    """
    lowered = text.lower().replace("'", '')
    # Characters outside \w (letters, digits, underscore) act as separators.
    separated = re.sub(r'[^\w]', ' ', lowered)
    return re.sub(r'\s+', ' ', separated).strip()

# Clean every line, split it into word tokens, then reshape so that each
# token sits on its own row (the original line index is repeated per token).
tokens_per_line = df['text'].map(clean_text).map(word_tokenize)

df = tokens_per_line.explode().to_frame('token')
df.head()

Unnamed: 0,token
0,little
0,women
3,by
5,louisa
5,may


### Summarise the Sentiment Words

In [12]:
# Word -> polarity lookup built from the NLTK opinion lexicon. The negative
# entries are applied second, so any word present in both lists ends up
# labelled 'negative'.
sentiment_lexicon = dict.fromkeys(opinion_lexicon.positive(), 'positive')
sentiment_lexicon.update(dict.fromkeys(opinion_lexicon.negative(), 'negative'))

# Label each token, then keep only the tokens found in the lexicon.
df['sentiment'] = df['token'].map(sentiment_lexicon)
df = df[df['sentiment'].notna()]
df.head(10)

Unnamed: 0,token,sentiment
16,merry,positive
20,beautiful,positive
21,humiliation,negative
23,vanity,negative
23,fair,positive
31,faithful,positive
32,dark,negative
35,mischief,negative
35,peace,positive
36,pleasant,positive


In [13]:
# Count tokens per sentiment label ('n') and express each count as a share
# of all labelled tokens ('prop').
summary_df = (
    df['sentiment']
    .value_counts()
    .to_frame('n')
    .assign(prop=lambda counts: counts['n'] / counts['n'].sum())
)

summary_df.round(3)

Unnamed: 0,n,prop
positive,9535,0.603
negative,6265,0.397
