# Diary analysis using Textblob

### Initial dataframe cleaning

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [15]:
# main df
df = pd.read_csv("data/person_details.csv")
df.head(20)

Unnamed: 0.1,Unnamed: 0,id,diary_ref,person,description,sex,sex_code,year,entry_count
0,0,1,diary01.csv,alta: a detailed dreamer,Alta is an adult woman who wrote down her drea...,female,1,1985-1997,422
1,1,2,diary02.csv,angie: age 18 & 20,Angie is a college student whose dreams are of...,female,1,1996,48
2,2,3,diary03.csv,arlie: a middle-aged woman,Arlie is an older adult woman who wrote down h...,female,1,1992-1998,212
3,3,4,diary04.csv,barb sanders,Barb Sanders (not her real name) is a middle-a...,female,1,1960-1997,3116
4,4,5,diary05.csv,barb sanders #2,"1138 more dreams from Barb Sanders, written do...",female,1,1997-2001,1138
5,5,6,diary06.csv,barb sanders: baseline,These 250 dreams are a representative sample o...,female,1,1960-1997,250
6,6,7,diary07.csv,bay area girls: grades 4-6,The 388 dreams in this set were collected in N...,female,1,1996-1997,234
7,7,8,diary08.csv,bay area girls: grades 7-9,The 388 dreams in this set were collected in N...,female,1,1996-1997,154
8,8,9,diary09.csv,bea 1: a high school student,Bea's vivid dream reports from age 14 to 16 pr...,female,1,2003-2005,223
9,9,10,diary10.csv,bea 2: a college student,See Bea 1.,female,1,2007-2010,63


In [5]:
# df for this analysis
diary1_df = pd.read_csv("data/diaries/diary01.csv", sep = "|")
sample_entries = diary1_df.head(4)
sample_entries

Unnamed: 0,raw_number,content
0,#1 (1957),"The one at the Meads's house, where it's bigge..."
1,#2 (8/11/67),I'm at a family reunion in a large fine house ...
2,#3 (8/1/85),I watch a plane fly past and shortly realize i...
3,#4 (1985?),Me pulling the green leaves and berries off so...


#### Sample diary entries cleaning 

In [6]:
## begin cleaning sample entries
# creating function to separate 'raw_number' column into just a date column
def dates(date_string):
    date_uncleaned = date_string.split()[1]
    date = date_uncleaned[1:-1]
    return date

sample_entries = sample_entries.assign(dates = sample_entries['raw_number'].apply(dates))
# remove raw number column
sample_entries = sample_entries.drop('raw_number', axis=1)
sample_entries

Unnamed: 0,content,dates
0,"The one at the Meads's house, where it's bigge...",1957
1,I'm at a family reunion in a large fine house ...,8/11/67
2,I watch a plane fly past and shortly realize i...,8/1/85
3,Me pulling the green leaves and berries off so...,1985?


### sentiment analysis using TextBlob

In [10]:
import pandas as pd
import textblob
from textblob import Word
import nltk
nltk.download('punkt') # to use blob.words
nltk.download('brown') # to use blob.sentences

[nltk_data] Downloading package punkt to /Users/juliama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/juliama/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

> TextBlob basic methods/objects

You can read up on more documentation [here](https://textblob.readthedocs.io/en/dev/quickstart.html).

Tokenization
- `blob.words`: creates a list-like structure of words 
- `blob.sentences`: creates a list of sentences 
- `blob.noun_phrases`: creates list of nouns + (qualifiers, 's)
    - can be used to find most common nouns!

Lemmetization
- def: finding the root word 
- can only lemmatize word objects

In [11]:
# runs thru each diary entry in the sample 
for entry in sample_entries['content']:
    # creating a blob object
    blob = textblob.TextBlob(entry)

    # tokenization
    # print(blob.words)  
    print(blob.sentences)
    # print(blob.noun_phrases)

    # lemmatization: 
    # word = Word('octopi')
    # print(blob.lemmatize)

    # sentiment
    # from [-1, 1], -1 being the most sad, 1 being the most happy, assigns a polarity number
    # pro: assigns a 

[Sentence("The one at the Meads's house, where it's bigger inside than out; there's a European village just inside, with a cobblestone street and a Pied-Piper sort of man with curly hair, he can do things like juggle - I go up the back stairs [there aren't any in the real house] and then down the other side [since there's a second set, immediately] then down a short empty hallway that turns a corner, where I find a tiny room...a young woman with shoulder-length blonde hair in a pageboy is there, cooking at a stove that almost fills the room...she's nice to me."), Sentence("Now outside, I'm waiting for my aunt to pick me up - she arrives in a little round convertible and we go for a drive, not very far - we cross a little bridge over a creek, then double back and she drops me off at the house again."), Sentence("Inside (?)"), Sentence("I sit with a couple of people, playing with a string of blue balloons.")]
[Sentence("I'm at a family reunion in a large fine house with grounds (or a may

> sentiment analysis

using `blob.sentiment`
- results in polarity and subjectivity score
    - polarity score: between [-1, 1], where -1 means the most sad/negative & 1 means the most happy/positive
    - subjectivicty score: between [0, 1], where 0 means the text is purely factual & 1 means the text is extremely subjective
- pro: assigns a numeric polarity score, which makes it easier to train machine learning with 
- pro: subjectivity score is pretty unique (and another aspect we can use to predict text)
- con: difficult to infer what the polarity means; seems like theres only two emotions to work with (happy/sad vs. negative/positive)
    - after testing the polarity below with different text emotions (sad, neutral, happy, mad), the polarity score for __sad__ and __mad__ text were both off

In [12]:
# runs thru each diary entry in the sample 
for entry in sample_entries['content']:
    # creating a blob object
    blob = textblob.TextBlob(entry)

    # sentiment 
    print(blob.sentiment)
    
## polarity example 
sad_text = "My grandma died today and I don't know how to react...the pain in my chest is growing and I can't think anything but empty thoughts" # sad
neutral_text = "Hello world." # neutral
happy_text = "Today is the best day of my life! I just got a new job and a new pet, I'm so excited. Life is looking great, I'm proud of myself"
mad_text = "Max is the most infuriating, annoying, condensing piece of shit that should go back up his mother's fucking vagina to find some self-respect. \
            And maybe while he's in there, he should find some inches to grow. I don't even know how he made it into UC San Diego. He's a bitch"

sad_blob = TextBlob(sad_text)
neutral_blob = TextBlob(neutral_text)
happy_blob = TextBlob(happy_text)
mad_blob = TextBlob(mad_text)

polarity_score1 = sad_blob.subjectivity
polarity_score2 = neutral_blob.subjectivity
polarity_score3 = happy_blob.subjectivity
polarity_score4 = mad_blob.subjectivity

print(f"Polarity of sad_text: {polarity_score1}")
print(f"Polarity of neutral_text: {polarity_score2}")
print(f"Polarity of happy_text: {polarity_score3}")
print(f"Polarity of mad_text: {polarity_score4}")

## subjectivity example
text1 = "I love the beautiful scenery in this place." # subjective
text2 = "The sky is blue." # objective

blob1 = TextBlob(text1)
blob2 = TextBlob(text2)

# Accessing subjectivity
subjectivity_score1 = blob1.subjectivity
subjectivity_score2 = blob2.subjectivity

print(f"Subjectivity of text1: {subjectivity_score1}")
print(f"Subjectivity of text2: {subjectivity_score2}")
    

Sentiment(polarity=-0.010850984764028246, subjectivity=0.2944351542177629)
Sentiment(polarity=-0.022186791383219964, subjectivity=0.3764172335600909)
Sentiment(polarity=0.023200000000000016, subjectivity=0.40666666666666673)
Sentiment(polarity=-0.005132575757575764, subjectivity=0.48201388888888896)


NameError: name 'TextBlob' is not defined