In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.book import *
from nltk.corpus import stopwords
from nltk import pos_tag


from collections import Counter

import contractions

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from scipy.interpolate import UnivariateSpline

# <span style='color:Orange'>I. Clean text</span> 

In [None]:
def clean_text(text):
    """Return ``text`` with contractions expanded and all underscores removed.

    Args:
        text: Raw text as a single string.

    Returns:
        The output of ``contractions.fix(text)`` with every ``"_"`` deleted.
    """
    expanded = contractions.fix(text)
    return expanded.replace("_", "")

In [None]:
def text_stats(tokens, text):
    """Print token count, type count, type-token ratio and average sentence length.

    Args:
        tokens: Sequence of word tokens of the text.
        text: The text as one string; it is split into sentences with
            ``sent_tokenize`` to compute the average sentence length.

    Returns:
        None. The statistics are printed.
    """
    num_tokens = len(tokens)
    if num_tokens == 0:
        # Every ratio below would divide by zero; report the situation instead.
        print("The text contains no tokens, so no statistics can be computed.")
        return

    num_types = len(set(tokens))
    num_sentences = len(sent_tokenize(text))
    # Guard against a text in which no sentence was detected.
    sent_av = num_tokens / num_sentences if num_sentences else 0.0

    print(f"The number of tokens in this text is {num_tokens}.",
         f"\nThe number of types is {num_types}.",
         f"\nThe TTR (type-token ratio) of this text is {num_types/num_tokens}.",
         f"\nThe average sentence length is {sent_av}.")

In [None]:
def custom_tokenize(text):
    """Tokenize ``text`` and return the lower-cased alphanumeric tokens.

    Tokens produced by ``word_tokenize`` for which ``str.isalnum()`` is false
    (punctuation, tokens containing an apostrophe or underscore, ...) are dropped.

    Args:
        text: Text to tokenize.

    Returns:
        List of lower-cased tokens in text order.
    """
    kept = []
    for piece in word_tokenize(text):
        if piece.isalnum():
            kept.append(piece.lower())
    return kept

In [None]:
def custom_dispersion(tokens, words):
    """Draw a lexical dispersion plot on the current matplotlib axes.

    Each occurrence of a target word is drawn as a small blue dot at
    x = index of the token in ``tokens``. Every target word gets its own row
    (the first word is the bottom row), including words that never occur, so
    the y-axis always matches ``words``.

    Args:
        tokens: Iterable of tokens in text order; compared to the targets by equality.
        words: Iterable of target words, one row per entry.
    """
    targets = list(words)

    # Collect all positions in a single pass over the text instead of
    # rescanning the tokens (and calling plt.plot) once per occurrence.
    positions = {word: [] for word in targets}
    for index, token in enumerate(tokens):
        if token in positions:
            positions[token].append(index)

    # One plot call per word; numeric rows keep the order of `words`.
    for row, word in enumerate(targets):
        hits = positions[word]
        plt.plot(hits, [row] * len(hits), 'o', color='b', markersize=3)

    plt.yticks(range(len(targets)), targets)
    plt.title('Dispersion Plot of Words')
    plt.xlabel("Length of Text in Words")
    plt.ylabel("Key Words")
    plt.ylim(-1, len(targets))

In [None]:
def open_file(path):
    """Read the file at ``path`` as UTF-8 text and return its whole content.

    Args:
        path: Location of the text file.

    Returns:
        The file content as one string.
    """
    with open(path, mode="r", encoding="utf-8") as source:
        content = source.read()
    return content

In [None]:
# Load the raw text of the novel.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run on
# another machine as is; consider a path relative to the notebook / a DATA_DIR constant.
orange = open_file("C:/Users/Hien Bach/text_analysis_jupyter/data/my_sweet_orange_tree.txt")

In [None]:
# Expand contractions and strip underscores from the raw text.
clean_orange = clean_text(orange)

In [None]:
# Lower-cased alphanumeric tokens of the novel; every later analysis uses this list.
# NOTE(review): this tokenizes the RAW text, not `clean_orange`, so the cleaning step
# above has no effect on the tokens. Switching to `clean_orange` would shift the
# hard-coded token slices used further down — confirm which input is intended.
tokens_orange = custom_tokenize(orange)

In [None]:
# Basic corpus statistics: token count, type (distinct token) count and
# average number of tokens per sentence (sentences are split on the raw text).
num_tokens = len(tokens_orange)
num_types =  len(set(tokens_orange))
sent_av = num_tokens/len(sent_tokenize(orange))
print(num_tokens)
print(num_types)
print(sent_av)

In [None]:
# Wrap the token list in an nltk.Text to use NLTK's text-exploration helpers.
novel =nltk.Text(tokens_orange)

# <span style='color:Orange'>II. Dispersion Plot</span>

In [None]:
# NLTK's built-in dispersion plot for a few key words.
novel.dispersion_plot(["portuguese", "love", "pain", "hurt", "pinkie"])

In [None]:
# Same idea with the custom plotting helper, adding "humiliation".
custom_dispersion(novel,["portuguese", "love", "pain", "hurt", "pinkie", "humiliation"] ) 

In [None]:
# Dispersion of character names; the figure is also written to dispersion_plot.png.
custom_dispersion(novel,["father", "portuga", "portuguese", "totoca"]) 
plt.savefig('dispersion_plot.png', dpi=300, bbox_inches='tight')

# <span style='color:Orange'>III. Word Frequency</span>

In [None]:
# The 20 most frequent tokens, stop words included.
FreqDist(novel).most_common(20)

In [None]:
# The 100 most frequent tokens as (word, count) pairs; display the 16th entry.
unfiltered = FreqDist(novel).most_common(100)
unfiltered[15]

In [None]:
# Split the (word, count) pairs into two parallel tuples for plotting.
words, counts = zip(*unfiltered)

# Line plot of the 100 most frequent tokens, in descending frequency order.
plt.figure(figsize=(12, 6))
plt.plot(words, counts, marker='o', linestyle='-', color='b', label='Word Counts')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Word Frequencies')
plt.xticks(rotation=45, ha='right')
plt.legend()

plt.show()

In [None]:
# Top 50 tokens after removing English stop words.
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); the list would be rescanned for every token.
stop_word_set = set(stop_words)
filtered_words = [w for w in novel if w not in stop_word_set]
top_fifty = FreqDist(filtered_words).most_common(50)

# Split the (word, count) pairs into two parallel tuples for plotting.
words, counts = zip(*top_fifty)

plt.figure(figsize=(12, 6))
plt.bar(words, counts, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top Words and Their Frequencies')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Character names and the honorific "seu" to be excluded from the frequency counts.
lst = ["pinkie", "glória", "seu", "totoca", "zezé"]
# The tokens analysed are English, so the English stop words must be in the custom
# list; the Portuguese list is kept as well because the text contains Portuguese
# names and honorifics. stopwords.words() returns a fresh list, so this cell can be
# re-run without accumulating duplicates.
custom_sw = stopwords.words("english") + stopwords.words("portuguese")

custom_sw.extend(lst)
print(custom_sw)

In [None]:
# Drop every token that appears in the custom stop-word list.
custom_filtered_words = [w for w in novel if w not in custom_sw]

# 50 most frequent remaining tokens as (word, count) pairs.
custom_top_fifty = FreqDist(custom_filtered_words).most_common(50)

words, counts = zip(*custom_top_fifty)

plt.figure(figsize=(12, 6))
plt.bar(words, counts, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top Words and Their Frequencies')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Same as the previous cell, but keep only tokens longer than 10 characters.
# Note: this rebinds `custom_filtered_words` and `custom_top_fifty` from the cell above.
custom_filtered_words = [w for w in novel if w not in custom_sw and len(w) > 10]

custom_top_fifty = FreqDist(custom_filtered_words).most_common(50)

words, counts = zip(*custom_top_fifty)

plt.figure(figsize=(12, 6))
plt.bar(words, counts, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top Words and Their Frequencies')
plt.xticks(rotation=45, ha='right')
plt.show()

# <span style='color:Orange'>IV. TTR and MATTR</span>

In [None]:
def ttr(seg_words):
    """Return the type-token ratio of a sequence of words.

    Args:
        seg_words: Sequence of hashable tokens.

    Returns:
        Number of distinct tokens divided by the number of tokens, or ``0.0``
        for an empty sequence (instead of raising ``ZeroDivisionError``).
    """
    total = len(seg_words)
    if total == 0:
        return 0.0
    return len(set(seg_words)) / total

In [None]:
def averge_word_length(segment):
    """Return the mean number of characters per word in ``segment``.

    Args:
        segment: Sequence of words (anything supporting ``len``).

    Returns:
        Total characters divided by the number of words, or ``0.0`` for an
        empty segment (instead of raising ``ZeroDivisionError``).
    """
    word_count = len(segment)
    if word_count == 0:
        return 0.0
    return sum(len(word) for word in segment) / word_count

In [None]:
def pos_counts(tagged_segment):
    """Count the NOUN, VERB, ADJ and ADV tags in a tagged segment.

    Args:
        tagged_segment: Iterable of ``(token, tag)`` pairs.

    Returns:
        Tuple ``(nouns, verbs, adjs, advs)`` with the number of pairs carrying
        each of those four tags; a tag that never occurs counts as 0.
    """
    tally = Counter()
    for _token, tag in tagged_segment:
        tally[tag] += 1
    return tally["NOUN"], tally["VERB"], tally["ADJ"], tally["ADV"]

In [None]:
def mattr(tokens_tags, window=3000, step=1000):
    """Compute per-window statistics over a sliding window of tagged tokens.

    Args:
        tokens_tags: Sequence of ``(token, tag)`` pairs in text order.
        window: Number of tokens per window (default 3000).
        step: Number of tokens the window advances each time (default 1000).

    Returns:
        List with one tuple per complete window:
        ``(start, end, percent, ttr, average_word_length, nouns, verbs, adjs,
        advs, lex_share)`` where ``end = start + window``, ``percent`` is the
        position of the window end as a percentage of the whole text, and
        ``lex_share`` is (nouns + verbs + adjs + advs) / window length.
        The list is empty when the text is shorter than one window.

    Raises:
        ValueError: If ``window`` or ``step`` is smaller than 1.
    """
    if window < 1 or step < 1:
        raise ValueError("window and step must both be at least 1")

    total = len(tokens_tags)
    rows = []

    # `total - window + 1` keeps the last complete window, which starts exactly
    # at `total - window`.
    for start in range(0, total - window + 1, step):
        stop = start + window
        tagged_seg = tokens_tags[start:stop]
        seg_words = [token for token, _tag in tagged_seg]
        sttr = ttr(seg_words)
        sawl = averge_word_length(seg_words)
        # Relative to the whole text; dividing by the window length would give
        # values of 100 and above for every window.
        percent = 100 * stop / total
        nouns, verbs, adjs, advs = pos_counts(tagged_seg)
        seg_lex_div = (nouns + verbs + adjs + advs) / len(seg_words)
        rows.append((start, stop, percent, sttr, sawl, nouns, verbs, adjs, advs, seg_lex_div))
    return rows

In [None]:
# POS-tag every token with the universal tagset (NOUN, VERB, ADJ, ADV, ...).
tags_novel = nltk.pos_tag(tokens_orange, tagset = "universal")

# Print overall statistics; sentences are counted on the cleaned text here.
text_stats(tokens_orange, clean_orange)

In [None]:
# Sliding-window statistics for the whole novel.
mattr_novel = mattr(tags_novel)

# Number of windows produced.
len(mattr_novel)

In [None]:
# One row per window; the column order matches the tuples appended in mattr().
df_novel = pd.DataFrame(mattr_novel, columns = ("window_start", "window_end", "percent", "MATTR", "average_word_length", "nouns", "verbs", "adjectives", "adverbs", "lexical_diversity"))

df_novel

# <span style='color:Orange'>V. MATTR & Lexical diversity in graph</span>

In [None]:
# Mean of the per-window type-token ratios; drawn as a reference line in later plots.
average = sum(df_novel["MATTR"])/len(df_novel)

## <span style='color:green'>With labels</span>

In [None]:
# MATTR and the content-word share per window, with the mean MATTR as a dashed line.
# NOTE(review): the 0.43 factor rescales lexical_diversity to overlay it on MATTR;
# it is a hand-picked constant, so the two curves do not share a real y-scale.
plt.plot(df_novel["percent"], df_novel["MATTR"], label="MATTR")
plt.plot(df_novel["percent"], df_novel["lexical_diversity"]*0.43, label="lexical diversity")
plt.axhline(y = average, color = 'r', linestyle = '--') 
plt.legend()

## <span style='color:green'>Without labels</span>

In [None]:
# Two curves in one call: MATTR (rescaled by a hand-picked 14.5) and average word length.
plt.plot(df_novel["percent"], df_novel["MATTR"]*14.5, df_novel["percent"], df_novel["average_word_length"])

In [None]:
# Noun, verb and adjective counts per window; adjectives are rescaled by a
# hand-picked 3.2 so the curve overlays the other two.
plt.plot(df_novel["percent"], df_novel["nouns"], label="nouns")
plt.plot(df_novel["percent"], df_novel["verbs"], label="verbs")
plt.plot(df_novel["percent"], df_novel["adjectives"]*3.2, label="adjectives" )
plt.legend()

In [None]:
# Nouns (dashed red) against adjectives rescaled by a hand-picked 2.7.
plt.plot(df_novel["percent"], df_novel["nouns"], color = "red", linestyle = "--", label = "nouns")
plt.plot(df_novel["percent"], df_novel["adjectives"]*2.7, label = "adjectives")
plt.legend()

In [None]:
# Raw noun and verb counts per window.
plt.plot(df_novel["percent"], df_novel["nouns"], label = "nouns")
plt.plot(df_novel["percent"], df_novel["verbs"], label = "verbs")
plt.legend()

In [None]:
# Noun and verb counts with MATTR rescaled by a hand-picked 2300 to share the axis.
plt.plot(df_novel["percent"], df_novel["nouns"], label="nouns")
plt.plot(df_novel["percent"], df_novel["verbs"], label="verbs")
plt.plot(df_novel["percent"], df_novel["MATTR"]*2300, label="MATTR")
plt.legend()

## <span style='color:green'>Smallest and Largest</span>

In [None]:
# The 10 windows with the lowest MATTR.
smallest = df_novel.nsmallest(10, "MATTR")
smallest

In [None]:
# Same rows in text order (display only; `smallest` itself is not modified).
smallest.sort_values("window_start")

In [None]:
# The 10 windows with the highest MATTR.
largest = df_novel.nlargest(10, "MATTR")
largest

In [None]:
# Same rows in text order (display only; `largest` itself is not modified).
largest.sort_values("window_start")

In [None]:
# Print the 3000 tokens at positions 21000-23999 for manual inspection.
# NOTE(review): presumably one of the windows listed above — the indices are hard-coded.
print(tokens_orange[21000:24000])

## <span style='color:green'>Smallest and Largest in graphs</span>

In [None]:
# MATTR curve with its mean (dashed red); the 10 highest windows are marked in
# green and the 10 lowest in orange.
plt.plot (df_novel["percent"], df_novel["MATTR"])
plt.axhline(y = average, color = 'r', linestyle = '--') 
plt.plot(largest["percent"], largest["MATTR"], 'o', color='green')
plt.plot(smallest["percent"], smallest["MATTR"], 'o', color='orange')

In [None]:
# The 10 windows with the most nouns.
nouns_largest = df_novel.nlargest(10, "nouns")
nouns_largest

In [None]:
# The 10 windows with the fewest nouns.
nouns_smallest = df_novel.nsmallest(10, "nouns")
nouns_smallest

In [None]:
# The 10 windows with the fewest and with the most adjectives.
adjectives_smallest = df_novel.nsmallest(10, "adjectives")
adjectives_largest = df_novel.nlargest(10, "adjectives")

In [None]:
# MATTR curve with its mean, the MATTR extremes, and the windows that have the
# fewest / most adjectives.
plt.plot(df_novel["percent"], df_novel["MATTR"])
plt.axhline(y = average, color = 'r', linestyle = '--')
plt.plot(largest["percent"], largest["MATTR"], 'o', color='green')
plt.plot(smallest["percent"], smallest["MATTR"], 'o', color='orange')
#plt.plot(nouns_largest["percent"], nouns_largest["MATTR"], 'o', color='lightgreen')
#plt.plot(nouns_smallest["percent"], nouns_smallest["MATTR"], 'o', color='yellow')
# x and y must come from the SAME frame: plt.plot pairs the values by position,
# so mixing `adjectives_*["percent"]` with `smallest`/`largest` MATTR values
# would place the markers off the curve.
plt.plot(adjectives_smallest["percent"], adjectives_smallest["MATTR"], 'o', color='purple')
plt.plot(adjectives_largest["percent"], adjectives_largest["MATTR"], 'o', color='red')

# <span style='color:Orange'>VI. Sentiment Analysis</span>

In [None]:
def bing_sent(words):
    """Score a list of words against the Bing lexicon.

    Reads the module-level ``bing_result_list`` of ``(word, sentiment)`` pairs.
    When a word is listed more than once, its first entry is used.

    Args:
        words: Iterable of tokens.

    Returns:
        Tuple ``(pos, neg, score)``: ``pos`` is the number of words labelled
        "positive", ``neg`` is MINUS the number of words labelled "negative"
        (it is decremented, so it is zero or negative), and ``score`` is
        ``pos + neg``.
    """
    # Build a word -> label lookup once instead of rescanning the whole
    # lexicon list for every word; setdefault keeps the first entry per word.
    polarity = {}
    for entry in bing_result_list:
        polarity.setdefault(entry[0], entry[1])

    score = 0
    pos = 0
    neg = 0

    for word in words:
        label = polarity.get(word)
        if label == "negative":
            score -= 1
            neg -= 1
        elif label == "positive":
            score += 1
            pos += 1
    return (pos, neg, score)

In [None]:
def afinn_sent(words):
    """Sum the Afinn values of all words that appear in the Afinn lexicon.

    Reads the module-level ``afinn_result_list`` of ``(word, value)`` pairs.
    When a word is listed more than once, its first entry is used.

    Args:
        words: Iterable of tokens.

    Returns:
        The sum of the lexicon values of the matching words (0 if none match).
    """
    # Build a word -> value lookup once instead of rescanning the whole
    # lexicon list for every word; setdefault keeps the first entry per word.
    valence = {}
    for entry in afinn_result_list:
        valence.setdefault(entry[0], entry[1])

    afinn_score = 0
    for word in words:
        if word in valence:
            afinn_score = afinn_score + valence[word]
    return afinn_score

In [None]:
def nrc_sent(words):
    """Count, per NRC category, how many words belong to that category.

    Reads the module-level ``nrc_df`` DataFrame (columns "word" and "sentiment").

    Args:
        words: Iterable of tokens.

    Returns:
        List of 10 counts in this fixed order: fear, anger, trust, sadness,
        disgust, anticipation, joy, surprise, negative, positive. Callers rely
        on the positions (index 8 = negative, index 9 = positive).
    """
    categories = ["fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise", "negative", "positive"]

    nrc_scores = []

    for category in categories:
        # A set of the category's words replaces the per-row iterrows() loop
        # and the list scan that was repeated for every word.
        category_words = set(nrc_df.loc[nrc_df["sentiment"] == category, "word"])
        nrc_scores.append(sum(1 for word in words if word in category_words))

    return nrc_scores

In [None]:
def segment_sentiments(words, num_segments, lexicon):
    """Split ``words`` into consecutive segments and score each with one lexicon.

    The words are divided into ``num_segments`` contiguous segments of nearly
    equal length; when the length is not evenly divisible, the first segments
    each receive one extra word.

    Args:
        words: Sequence of tokens (must support ``len`` and slicing).
        num_segments: Number of segments to create (at least 1).
        lexicon: ``"bing"``, ``"afinn"`` or ``"nrc"``.

    Returns:
        List with one tuple per segment:
        * bing:  ``(start, end, pos, neg, score, seg_words)``
        * afinn: ``(start, end, afinn_score, seg_words)``
        * nrc:   ``(start, end, <10 category counts>, positive - negative)``
        where ``start``/``end`` are the inclusive token indices of the segment.

    Raises:
        ValueError: If ``lexicon`` is not one of the three supported names or
            ``num_segments`` is smaller than 1.
    """
    if lexicon not in ("bing", "afinn", "nrc"):
        # Previously an unknown name silently produced an empty result.
        raise ValueError(f"unknown lexicon {lexicon!r}; expected 'bing', 'afinn' or 'nrc'")
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    # Base segment length and the number of segments that get one extra word.
    base_segment_length, extra_items = divmod(len(words), num_segments)

    start_index = 0
    segment_score = []

    for segment_number in range(num_segments):
        # The first `extra_items` segments are one word longer.
        end_index = start_index + base_segment_length + (1 if segment_number < extra_items else 0)
        seg_words = words[start_index:end_index]

        if lexicon == "bing":
            pos, neg, score = bing_sent(seg_words)
            segment_score.append((start_index, end_index-1, pos, neg, score, seg_words))

        elif lexicon == "afinn":
            afinn_score = afinn_sent(seg_words)
            segment_score.append((start_index, end_index-1, afinn_score, seg_words))

        else:  # "nrc"
            nrc_list = list(nrc_sent(seg_words))
            # nrc_sent returns "negative" at index 8 and "positive" at index 9;
            # the overall score is positive minus negative.
            nrc_list.append((nrc_list[9]-nrc_list[8]))
            segment_score.append((start_index, end_index-1, *nrc_list))

        start_index = end_index

    return segment_score

In [None]:
# Afinn lexicon; later cells read its "word" and "value" columns.
afinn_df = pd.read_csv("Afinn.csv", encoding = "latin-1")

In [None]:
# Bing lexicon; later cells read its "word" and "sentiment" columns.
bing_df = pd.read_csv("Bing.csv", encoding = "latin-1")

In [None]:
# NRC lexicon; nrc_sent() reads its "word" and "sentiment" columns.
nrc_df = pd.read_csv("NRC.csv", encoding = "latin-1")

## <span style='color:green'>a) Afinn Lexicon</span>

In [None]:
def segment_sentiments_afinn(words, num_segments):
    """Split ``words`` into segments and compute detailed Afinn scores for each.

    The words are divided into ``num_segments`` contiguous segments of nearly
    equal length; when the length is not evenly divisible, the first segments
    each receive one extra word.

    Args:
        words: Sequence of tokens (must support ``len`` and slicing).
        num_segments: Number of segments to create (at least 1).

    Returns:
        List with one tuple per segment:
        ``(start, end, afinn_score, <11 per-level counts>, seg_words)`` where
        ``start``/``end`` are the inclusive token indices of the segment and the
        11 counts are the values returned by ``afinn_sent_more`` in its order.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")

    # Base segment length and the number of segments that get one extra word.
    base_segment_length, extra_items = divmod(len(words), num_segments)

    start_index = 0
    # Afinn results for each segment.
    segment_score = []

    for segment_number in range(num_segments):
        # The first `extra_items` segments are one word longer.
        end_index = start_index + base_segment_length + (1 if segment_number < extra_items else 0)
        seg_words = words[start_index:end_index]

        afinn_score = afinn_sent(seg_words)
        afinn_score_more = afinn_sent_more(seg_words)
        # Only the first 11 values are used, as before.
        segment_score.append((start_index, end_index-1, afinn_score, *afinn_score_more[:11], seg_words))

        start_index = end_index

    return segment_score

In [None]:
def afinn_sent(words):
    """Sum the Afinn values of all words that appear in the Afinn lexicon.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition is the one in effect after Run All.

    Reads the module-level ``afinn_result_list`` of ``(word, value)`` pairs.
    When a word is listed more than once, its first entry is used.

    Args:
        words: Iterable of tokens.

    Returns:
        The sum of the lexicon values of the matching words (0 if none match).
    """
    # Build a word -> value lookup once instead of rescanning the whole
    # lexicon list for every word; setdefault keeps the first entry per word.
    valence = {}
    for entry in afinn_result_list:
        valence.setdefault(entry[0], entry[1])

    afinn_score = 0
    for word in words:
        if word in valence:
            afinn_score = afinn_score + valence[word]
    return afinn_score

In [None]:
def afinn_sent_more(words):
    """Count how many words fall on each Afinn level from -5 to 5.

    Reads the module-level ``afinn_result_list`` of ``(word, value)`` pairs.
    When a word is listed more than once, its first entry is used. Words that
    are not in the lexicon, or whose value is not an integer level between
    -5 and 5, are not counted.

    Args:
        words: Iterable of tokens.

    Returns:
        Tuple of 11 counts in this order: levels 1, 2, 3, 4, 5, then 0
        (neutral), then -1, -2, -3, -4, -5.
    """
    # Build a word -> value lookup once instead of rescanning the whole
    # lexicon list for every word; setdefault keeps the first entry per word.
    valence = {}
    for entry in afinn_result_list:
        valence.setdefault(entry[0], entry[1])

    # Tally the value of every word found in the lexicon.
    level_counts = Counter(valence[word] for word in words if word in valence)

    # Output order expected by the callers: positives, neutral, negatives.
    levels = (1, 2, 3, 4, 5, 0, -1, -2, -3, -4, -5)
    return tuple(level_counts[level] for level in levels)

### <span style='color:violet'>Iterate through the Afinn.csv</span>

In [None]:
# Flatten the Afinn DataFrame into a list of (word, value) tuples; the afinn_*
# scoring functions read this global.
afinn_result_list = [(row['word'], row['value']) for index, row in afinn_df.iterrows()]

### <span style='color:violet'>Create dataframe</span>

In [None]:
# Number of segments the novel is divided into; also used to scale plot x-axes.
number = 100

afinn_sent_results_orange = segment_sentiments_afinn(tokens_orange, number)

# Columns follow the tuple order built in segment_sentiments_afinn().
# NOTE(review): "seg_eng" looks like a typo for "seg_end"; left unchanged because it
# is a runtime column name.
afinn_sent_orange_df = pd.DataFrame(afinn_sent_results_orange, columns = ["seg_start", "seg_eng", "afinn_score", "pos=1", "pos=2", "pos=3", "pos=4", "pos=5", "neutral", "neg=-1", "neg=-2", "neg=-3", "neg=-4", "neg=-5", "word_list"])

afinn_sent_orange_df

### <span style='color:violet'>Plot</span>

In [None]:
# Bar chart per segment; x is the segment number expressed as a percentage of `number`.
# NOTE(review): this plots the count of level -1 words ("neg=-1"), not the overall
# "afinn_score" that the title and the parallel Bing/NRC bar charts suggest — confirm.
plt.bar(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-1"])
plt.title("Afinn Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")

In [None]:
# Number of words at each negative Afinn level (-1 to -5) per segment.
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-1"], label = "-1")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-2"], label = "-2")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-3"], label = "-3")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-4"], label = "-4")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["neg=-5"], label = "-5")
plt.title("Sentiment Analysis of My Sweet Orange Tree \n with levels of negative words using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Save the figure to line.png.
plt.savefig('line.png', dpi=300, bbox_inches='tight')

In [None]:
# Number of words at positive Afinn levels 1 and 2 per segment (levels 3-5 are disabled).
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["pos=1"], label = "1")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["pos=2"], label = "2")
#plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["pos=3"], label = "3")
#plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["pos=4"], label = "4")
#plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["pos=5"], label = "5")
plt.title("Sentiment Analysis of My Sweet Orange Tree \n with levels of positive words using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

## <span style='color:green'>b) Bing Lexicon</span>

### <span style='color:violet'>Iterate through the Bing.csv</span>

In [None]:
# Flatten the Bing DataFrame into a list of (word, sentiment) tuples; bing_sent()
# reads this global.
bing_result_list = [(row['word'], row['sentiment']) for index, row in bing_df.iterrows()]

### <span style='color:violet'>Create dataframe</span>

In [None]:
# Bing scores for the whole novel, split into `number` segments.
bing_sent_results_orange = segment_sentiments(tokens_orange, number, lexicon="bing")

# Columns follow the tuple order built in segment_sentiments() for "bing".
# "negative_score" holds zero or negative values because bing_sent() decrements it.
bing_sent_orange_df = pd.DataFrame(bing_sent_results_orange, columns = ["seg_start", "seg_eng", "positive_score", "negative_score", "overall_seg_score", "word_list"])

bing_sent_orange_df

### <span style='color:violet'>Plot</span>

In [None]:
# Overall Bing score (positive minus negative words) per segment; saved to bar.png.
plt.bar(((bing_sent_orange_df.index+1)/number)*100, bing_sent_orange_df["overall_seg_score"])
plt.title("Bing Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.savefig('bar.png', dpi=300, bbox_inches='tight')

### <span style='color:violet'>Smallest and Largest</span>

In [None]:
# The 10 segments with the highest overall Bing score.
# NOTE(review): this rebinds `largest`, which earlier held the top-MATTR windows of
# df_novel. Re-running the MATTR plotting cells after this one will fail, because
# this frame has no "percent" or "MATTR" column; a distinct name would avoid that.
largest = bing_sent_orange_df.nlargest(10, "overall_seg_score")
largest

In [None]:
# Print the words of segment 18 for manual inspection (hard-coded row label).
print(bing_sent_orange_df.loc[18].word_list)

In [None]:
# Print the words of segment 47 for manual inspection (hard-coded row label).
print(bing_sent_orange_df.loc[47].word_list)

## <span style='color:green'>c) NRC Lexicon</span>

### <span style='color:violet'>Iterate through the NRC.csv</span>

In [None]:
# NRC category counts for the whole novel, split into `number` segments.
nrc_sent_results_orange = segment_sentiments(tokens_orange, number, lexicon="nrc")

### <span style='color:violet'>Create dataframe</span>

In [None]:
# Columns follow the category order in nrc_sent(); "overall_seg_score" is
# positive minus negative, as appended in segment_sentiments().
nrc_sent_orange_df = pd.DataFrame(nrc_sent_results_orange, columns = ["seg_start", "seg_eng", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise", "negative", "positive", "overall_seg_score"])

nrc_sent_orange_df

### <span style='color:violet'>Plot</span>

In [None]:
# Overall NRC score (positive minus negative word counts) per segment.
plt.bar(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["overall_seg_score"])
plt.title("NRC Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")

In [None]:
# Per-segment counts of fear, disgust and joy words (anger is disabled).
#plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["anger"], label = "anger")
plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["fear"], label = "fear")
plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["disgust"], label = "disgust")
plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["joy"], label = "joy")
plt.title("NRC Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

In [None]:
# Per-segment count of trust words.
plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["trust"], label = "trust")
plt.title("NRC Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

## <span style='color:green'>d) Three Lexicons</span>

In [None]:
# Overall per-segment score of each of the three lexicons on one axis.
plt.plot(((bing_sent_orange_df.index+1)/number)*100, bing_sent_orange_df["overall_seg_score"], label = "bing")
plt.plot(((afinn_sent_orange_df.index+1)/number)*100, afinn_sent_orange_df["afinn_score"], label = "afinn")
plt.plot(((nrc_sent_orange_df.index+1)/number)*100, nrc_sent_orange_df["overall_seg_score"], label = "nrc")
# Three curves are drawn (Bing, Afinn, NRC), so the title says "Three".
plt.title("Sentiment Analysis of \n My Sweet Orange Tree \n Using Three Different Sentiment Lexicons")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

## <span style='color:green'>e) Smooth Lines</span>

### <span style='color:violet'>Using three lexicons</span>

In [None]:
# Draw the three lexicon scores as spline curves over the original data points.

# Number of x positions at which each spline is evaluated
num_points = 1000

# Spline through the Bing overall segment scores
bing_spline = UnivariateSpline(
    ((bing_sent_orange_df.index + 1) / number) * 100,
    bing_sent_orange_df["overall_seg_score"],
    s=0  # s=0 makes the spline pass through every data point (no smoothing); raise s to smooth
)

# Spline through the Afinn segment scores
afinn_spline = UnivariateSpline(
    ((afinn_sent_orange_df.index + 1) / number) * 100,
    afinn_sent_orange_df["afinn_score"],
    s=0  # s=0 makes the spline pass through every data point (no smoothing); raise s to smooth
)

# Spline through the NRC overall segment scores
nrc_spline = UnivariateSpline(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["overall_seg_score"],
    s=0  # s=0 makes the spline pass through every data point (no smoothing); raise s to smooth
)

# Evenly spaced x positions between the first and last segment position
x_smooth = np.linspace(
    (((bing_sent_orange_df.index + 1) / number) * 100).min(),
    (((bing_sent_orange_df.index + 1) / number) * 100).max(),
    num=num_points
)

# Wide figure so the x-axis is visually stretched
plt.figure(figsize=(12, 6))  # Adjust width and height as needed

# Plot the spline curves
plt.plot(x_smooth, bing_spline(x_smooth), label="Bing Sentiment", color="blue")
plt.plot(x_smooth, afinn_spline(x_smooth), label="Afinn Sentiment", color="orange")
plt.plot(x_smooth, nrc_spline(x_smooth), label="NRC Sentiment", color="red")

# Original data points, in the colour of the matching curve
plt.scatter(
    ((bing_sent_orange_df.index + 1) / number) * 100,
    bing_sent_orange_df["overall_seg_score"],
    marker="o",
    color="blue",
    alpha=0.5
)
plt.scatter(
    ((afinn_sent_orange_df.index + 1) / number) * 100,
    afinn_sent_orange_df["afinn_score"],
    marker="o",
    color="orange",
    alpha=0.5
)
plt.scatter(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["overall_seg_score"],
    marker="o",
    color="red",
    alpha=0.5
)


# Set title and labels
plt.title("Sentiment Analysis of \n My Sweet Orange Tree \n Using Three Different Sentiment Lexicons")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()


# Show the full 0-100 % range on the x-axis
plt.xlim(0, 100)

# Show the plot
plt.show()

### <span style='color:violet'>For specific emotions using NRC Lexicon</span>

In [None]:
# Draw the NRC sadness and disgust counts as spline curves over the data points.

# Number of x positions at which each spline is evaluated
num_points = 1000

# Spline through the per-segment "sadness" counts.
# NOTE(review): the variable is named *_anger but holds the sadness curve.
nrc_spline_anger = UnivariateSpline(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["sadness"],
    s=0,  # s=0 makes the spline pass through every data point (no smoothing); raise s to smooth
)

# Spline through the per-segment "disgust" counts.
# NOTE(review): the variable is named *_fear but holds the disgust curve.
nrc_spline_fear = UnivariateSpline(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["disgust"],
    s=0,  # s=0 makes the spline pass through every data point (no smoothing); raise s to smooth
)

# Evenly spaced x positions between the first and last segment position
x_smooth = np.linspace(
    (((nrc_sent_orange_df.index + 1) / number) * 100).min(),
    (((nrc_sent_orange_df.index + 1) / number) * 100).max(),
    num=num_points
)

# Wide figure so the x-axis is visually stretched
plt.figure(figsize=(12, 6))  # Adjust width and height as needed

# Plot the spline curves
plt.plot(x_smooth, nrc_spline_anger(x_smooth), label="sadness", color="red")
plt.plot(x_smooth, nrc_spline_fear(x_smooth), label="disgust", color="orange")

# Original data points, in the colour of the matching curve
plt.scatter(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["sadness"],
    marker="o",
    color="red",
    alpha=0.5,
)

plt.scatter(
    ((nrc_sent_orange_df.index + 1) / number) * 100,
    nrc_sent_orange_df["disgust"],
    marker="o",
    color="orange",
    alpha=0.5,
)

# Set title and labels
plt.title("Sentiment Analysis of \n My Sweet Orange Tree \n Using the NRC Sentiment Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Show the plot
plt.show()

### *<span style='color:red'>Father and occurrences of negative words</span>*

In [None]:
# Where the father, the Portuguese ("portuga") and Totoca are mentioned in the text.
custom_dispersion(novel,["father","portuga","portuguese", "totoca"]) 

In [None]:
# NRC counts for tokens 8000-22999 only, split into `number` segments.
# NOTE(review): the slice bounds are hard-coded — presumably read off the dispersion plot.
nrc_sent_results_orange_1 = segment_sentiments(tokens_orange[8000:23000], number, lexicon="nrc")
nrc_sent_orange_df_1 = pd.DataFrame(nrc_sent_results_orange_1, columns = ["seg_start", "seg_eng", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise", "negative", "positive", "overall_seg_score"])
nrc_sent_orange_df_1

In [None]:
# Sadness and trust counts per segment of the slice; the other emotions are disabled.
# NOTE(review): the x values are scaled by 50, so they run from 0.5 to 50 although the
# axis is labelled "% of Text" and the data covers only the token slice — confirm the
# intended scaling.
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["fear"], label = "fear")
plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["sadness"], label = "sadness")
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["disgust"], label = "disgust")
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["anger"], label = "anger")
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["joy"], label = "joy")
plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["trust"], label = "trust")
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["anticipation"], label = "anticipation")
#plt.plot(((nrc_sent_orange_df_1.index+1)/number)*50, nrc_sent_orange_df_1["surprise"], label = "surprise")
plt.title("Analysis of Zeze's emotions with his father using NRC Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

plt.savefig('line_NRC_father.png', dpi=300, bbox_inches='tight')
# Interpretation: despite his negative attachment to his father, Zezé's vocabulary in
# this part tends to skew towards positive words.

In [None]:
# NOTE(review): this recomputes exactly the same slice (8000-22999) as
# nrc_sent_results_orange_1, and both names are rebound for tokens 23000-37999 in the
# Portuga section below — this cell looks redundant.
nrc_sent_results_orange_2 = segment_sentiments(tokens_orange[8000:23000], number, lexicon="nrc")
nrc_sent_orange_df_2 = pd.DataFrame(nrc_sent_results_orange_2, columns = ["seg_start", "seg_eng", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise", "negative", "positive", "overall_seg_score"])
nrc_sent_orange_df_2

In [None]:
# Sadness and trust counts per segment of the slice (x values scaled by 50, as above).
#plt.plot(((nrc_sent_orange_df_2.index+1)/number)*50, nrc_sent_orange_df_2["fear"], label = "fear")
plt.plot(((nrc_sent_orange_df_2.index+1)/number)*50, nrc_sent_orange_df_2["sadness"], label = "sadness")
#plt.plot(((nrc_sent_orange_df_2.index+1)/number)*50, nrc_sent_orange_df_2["joy"], label = "joy")
plt.plot(((nrc_sent_orange_df_2.index+1)/number)*50, nrc_sent_orange_df_2["trust"], label = "trust")
plt.title("Analysis of Zeze's emotions with his father using NRC Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

#plt.savefig('plot_NRC_sadness_trust_2.png', dpi=300, bbox_inches='tight')

In [None]:
# Bing scores for tokens 8000-22999, split into `number` segments.
bing_sent_results_orange_1 = segment_sentiments(tokens_orange[8000:23000], number, lexicon="bing")

bing_sent_orange_df_1 = pd.DataFrame(bing_sent_results_orange_1, columns = ["seg_start", "seg_eng", "positive_score", "negative_score", "overall_seg_score", "word_list"])

bing_sent_orange_df_1

In [None]:
# Overall Bing score per segment of the slice; x runs from 1 to 100 over the slice,
# not over the whole novel. Saved to plot_bing_general.png.
plt.bar(((bing_sent_orange_df_1.index+1)/number)*100, bing_sent_orange_df_1["overall_seg_score"])
plt.title("Bing Sentiment Analysis of \n Zezé's words usage in My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")

plt.savefig('plot_bing_general.png', dpi=300, bbox_inches='tight')

In [None]:
# Re-states the segment count (same value as before).
number = 100

# Detailed Afinn scores for tokens 8000-22999.
afinn_sent_results_orange_1 = segment_sentiments_afinn(tokens_orange[8000:23000], number)

afinn_sent_orange_df_1 = pd.DataFrame(afinn_sent_results_orange_1, columns = ["seg_start", "seg_eng", "afinn_score", "pos=1", "pos=2", "pos=3", "pos=4", "pos=5", "neutral", "neg=-1", "neg=-2", "neg=-3", "neg=-4", "neg=-5", "word_list"])

afinn_sent_orange_df_1

In [None]:
# Number of words at negative Afinn levels -1 to -3 per segment of the slice;
# saved to plot_afinn_father.png.
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["neg=-1"], label = "-1")
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["neg=-2"], label = "-2")
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["neg=-3"], label = "-3")
#plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["pos=2"], label = "2")
#plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["neg=-5"], label = "-5")
plt.title("Sentiment Analysis of My Sweet Orange Tree \n with levels of negative words using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

plt.savefig('plot_afinn_father.png', dpi=300, bbox_inches='tight')

In [None]:
# Number of words at positive Afinn levels 1 to 3 per segment of the slice.
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["pos=1"], label = "1")
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["pos=2"], label = "2")
plt.plot(((afinn_sent_orange_df_1.index+1)/number)*100, afinn_sent_orange_df_1["pos=3"], label = "3")
#plt.plot(((afinn_sent_orange_df_2.index+1)/number)*100, afinn_sent_orange_df_2["pos=4"], label = "4")
#plt.plot(((afinn_sent_orange_df_2.index+1)/number)*100, afinn_sent_orange_df_2["pos=5"], label = "5")
plt.title("Sentiment Analysis of My Sweet Orange Tree \n with levels of positive words using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

### *<span style='color:red'>Portuga and occurrences of negative words</span>*

In [None]:
# Where the Portuguese ("portuga"), Pinkie and Glória are mentioned in the text.
custom_dispersion(novel,["portuga", "portuguese", "pinkie", "glória"])

In [None]:
# NRC counts for tokens 23000-37999 (hard-coded slice), split into `number` segments.
# Rebinds the *_2 names that an earlier cell assigned for the 8000-22999 slice.
nrc_sent_results_orange_2 = segment_sentiments(tokens_orange[23000:38000], number, lexicon="nrc")
nrc_sent_orange_df_2 = pd.DataFrame(nrc_sent_results_orange_2, columns = ["seg_start", "seg_eng", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise", "negative", "positive", "overall_seg_score"])
nrc_sent_orange_df_2

In [None]:
# Sadness vs. trust per segment for the NRC frame built from tokens 23000-38000.
#
# x positions: 1-based row number of each segment, divided by `number`, times 100.
# The factor used to be 50, which contradicted the "% of Text" axis label and
# the Afinn/Bing plots of the same slice, which all scale by 100.
# NOTE(review): assumes `number` is the segment count used to build the frame - confirm.
nrc_2_segment_pct = ((nrc_sent_orange_df_2.index + 1) / number) * 100

# "fear" and "joy" are also columns of the frame but are left out of this figure.
for emotion in ("sadness", "trust"):
    plt.plot(nrc_2_segment_pct, nrc_sent_orange_df_2[emotion], label=emotion)

plt.title("Analysis of Zeze's emotions \n with the Portuguese using NRC Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Write the figure to disk next to the notebook.
plt.savefig('plot_NRC_portuga_1.png', dpi=300, bbox_inches='tight')

In [None]:
# Bing lexicon scores for tokens 23000-38000 of the novel, segmented using `number`.
bing_sent_results_orange_2 = segment_sentiments(
    tokens_orange[23000:38000],
    number,
    lexicon="bing",
)

# Tabulate the raw results; one named column per value returned for each segment.
bing_sent_orange_df_2 = pd.DataFrame(
    data=bing_sent_results_orange_2,
    columns=[
        "seg_start", "seg_eng",
        "positive_score", "negative_score",
        "overall_seg_score",
        "word_list",
    ],
)

bing_sent_orange_df_2

In [None]:
# x positions: 1-based row number of each segment, divided by `number`, times 100.
bing_2_segment_pct = ((bing_sent_orange_df_2.index + 1) / number) * 100

# One bar per segment, with the segment's overall Bing score as its height.
plt.bar(bing_2_segment_pct, bing_sent_orange_df_2["overall_seg_score"])
plt.title("Bing Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")

In [None]:
# Value handed to the Afinn segmenter below and reused by the plots for x scaling.
number = 100

# Afinn lexicon scores for tokens 23000-38000 of the novel.
afinn_sent_results_orange_2 = segment_sentiments_afinn(
    tokens_orange[23000:38000],
    number,
)

# Tabulate the raw results; one named column per value returned for each segment.
afinn_sent_orange_df_2 = pd.DataFrame(
    data=afinn_sent_results_orange_2,
    columns=[
        "seg_start", "seg_eng", "afinn_score",
        "pos=1", "pos=2", "pos=3", "pos=4", "pos=5",
        "neutral",
        "neg=-1", "neg=-2", "neg=-3", "neg=-4", "neg=-5",
        "word_list",
    ],
)

afinn_sent_orange_df_2

In [None]:
# x positions: 1-based row number of each segment, divided by `number`, times 100.
afinn_2_neg_pct = ((afinn_sent_orange_df_2.index + 1) / number) * 100

# Only the two mildest negative levels are drawn; "neg=-3", "neg=-4", "neg=-5"
# and "pos=2" are left out of this figure.
for neg_level in ("-1", "-2"):
    plt.plot(afinn_2_neg_pct, afinn_sent_orange_df_2[f"neg={neg_level}"], label=neg_level)

plt.title("Analysis of Zeze's emotions \n with The Portuguese using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Write the figure to disk next to the notebook.
plt.savefig('plot_Afinn_portuga_1.png', dpi=300, bbox_inches='tight')

In [None]:
# x positions: 1-based row number of each segment, divided by `number`, times 100.
afinn_2_pos_pct = ((afinn_sent_orange_df_2.index + 1) / number) * 100

# One line per plotted positive Afinn level; levels 4 and 5 are left out here.
for pos_level in ("1", "2", "3"):
    plt.plot(afinn_2_pos_pct, afinn_sent_orange_df_2[f"pos={pos_level}"], label=pos_level)

plt.title("Analysis of Zeze's positive emotions \n with The Portuguese using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Write the figure to disk next to the notebook.
plt.savefig('plot_Afinn_portuga.png', dpi=300, bbox_inches='tight')

### *<span style='color:red'>Totoca and occurrences of negative words</span>*

In [None]:
# NRC lexicon scores for the first 8000 tokens of the novel, segmented using `number`.
nrc_sent_results_orange_3 = segment_sentiments(
    tokens_orange[0:8000],
    number,
    lexicon="nrc",
)

# Tabulate the raw results; one named column per value returned for each segment.
nrc_sent_orange_df_3 = pd.DataFrame(
    data=nrc_sent_results_orange_3,
    columns=[
        "seg_start", "seg_eng",
        "fear", "anger", "trust", "sadness",
        "disgust", "anticipation", "joy", "surprise",
        "negative", "positive",
        "overall_seg_score",
    ],
)
nrc_sent_orange_df_3

In [None]:
# Sadness vs. joy per segment for the NRC frame built from tokens 0-8000.
#
# x positions: 1-based row number of each segment, divided by `number`, times 100.
# The factor used to be 50, which contradicted the "% of Text" axis label and
# the Afinn/Bing plots of the same slice, which all scale by 100.
# NOTE(review): assumes `number` is the segment count used to build the frame - confirm.
nrc_3_segment_pct = ((nrc_sent_orange_df_3.index + 1) / number) * 100

# "fear" and "trust" are also columns of the frame but are left out of this figure.
for emotion in ("sadness", "joy"):
    plt.plot(nrc_3_segment_pct, nrc_sent_orange_df_3[emotion], label=emotion)

plt.title("Analysis of Zeze's emotions with Totoca using NRC Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Write the figure to disk next to the notebook.
plt.savefig('plot_NRC_totoca.png', dpi=300, bbox_inches='tight')

In [None]:
# Bing lexicon scores for the first 8000 tokens of the novel, segmented using `number`.
bing_sent_results_orange_3 = segment_sentiments(
    tokens_orange[0:8000],
    number,
    lexicon="bing",
)

# Tabulate the raw results; one named column per value returned for each segment.
bing_sent_orange_df_3 = pd.DataFrame(
    data=bing_sent_results_orange_3,
    columns=[
        "seg_start", "seg_eng",
        "positive_score", "negative_score",
        "overall_seg_score",
        "word_list",
    ],
)

bing_sent_orange_df_3

In [None]:
# x positions: 1-based row number of each segment, divided by `number`, times 100.
bing_3_segment_pct = ((bing_sent_orange_df_3.index + 1) / number) * 100

# One bar per segment, with the segment's overall Bing score as its height.
plt.bar(bing_3_segment_pct, bing_sent_orange_df_3["overall_seg_score"])
plt.title("Bing Sentiment Analysis of \n My Sweet Orange Tree")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")

In [None]:
# Value handed to the Afinn segmenter below and reused by the plots for x scaling.
number = 100

# Afinn lexicon scores for the first 8000 tokens of the novel.
afinn_sent_results_orange_3 = segment_sentiments_afinn(
    tokens_orange[0:8000],
    number,
)

# Tabulate the raw results; one named column per value returned for each segment.
afinn_sent_orange_df_3 = pd.DataFrame(
    data=afinn_sent_results_orange_3,
    columns=[
        "seg_start", "seg_eng", "afinn_score",
        "pos=1", "pos=2", "pos=3", "pos=4", "pos=5",
        "neutral",
        "neg=-1", "neg=-2", "neg=-3", "neg=-4", "neg=-5",
        "word_list",
    ],
)

afinn_sent_orange_df_3

In [None]:
# x positions: 1-based row number of each segment, divided by `number`, times 100.
afinn_3_neg_pct = ((afinn_sent_orange_df_3.index + 1) / number) * 100

# One line per plotted negative Afinn level; "neg=-4", "neg=-5", "pos=1"
# and "pos=2" are left out of this figure.
for neg_level in ("-1", "-2", "-3"):
    plt.plot(afinn_3_neg_pct, afinn_sent_orange_df_3[f"neg={neg_level}"], label=neg_level)

plt.title("Analysis of Zeze's emotions with Totoca using Afinn Lexicon")
plt.xlabel("Segment Position in Novel as % of Text")
plt.ylabel("Segment Sentiment")
plt.legend()

# Write the figure to disk next to the notebook.
plt.savefig('plot_Afinn_totoca.png', dpi=300, bbox_inches='tight')