In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Sample input for the notebook: the opening of Apple's Q3 FY2020 earnings call
# (operator introduction, investor-relations disclaimer, start of the CEO's remarks).
# Continuation lines are indented with spaces; the sentence-splitting cell relies on
# whitespace normalisation to remove that indentation.
transcript = """
Operator: Good day, everyone. Welcome to the Apple Incorporated Third Quarter Fiscal Year 2020 Earnings Conference Call. 
    Today's call is being recorded. At this time, for opening remarks and introductions, I would like to turn things over to Mr. Tejas Gala,
    Senior Manager, Corporate Finance and Investor Relations. Please go ahead, sir.
    Tejas Gala: Thank you. Good afternoon and thank you for joining us. Speaking first today is Apple's CEO, Tim Cook; and he'll be followed 
    by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear 
    during our discussion today will consist of forward-looking statements including without limitation those regarding revenue, gross margin, 
    operating expenses, other income and expense, taxes, capitalallocation, and future business outlook, including the potential impact of 
    COVID-19 on the company's business and results of operations. Actual results or trends could differ materially from our forecast. For 
    more information, please refer to the risk factors discussed in Apple's most recently filed periodic reports Form 10-K and Form 10-Q 
    and the Form 8-K filed with the SEC today along with the associated press release. Apple assumes no obligation to update any forward-looking 
    statements or information, which speak as of their respective dates. I'd now like to turn the call over to Tim for introductory remarks.
    Tim Cook: Thanks, Tejas. Good afternoon, everyone. Thanks for joining the call today. Before we begin, I joined the many millions across 
    this country in mourning and memorialize Congressman John Lewis, who was laid to rest earlier today. We've lost a hero who walked among 
    us, a leader in the truest sense who urged this country to aim higher and be better until the very end.

"""

In [None]:
# Split the transcript into sentences on ". ".
# All whitespace (newlines, indentation, repeated spaces) is collapsed to single spaces
# *before* splitting. Simply deleting '\n' would glue the last word of a line to the first
# word of the next one whenever the continuation line is not indented, and would hide the
# ". " boundary of a sentence that ends exactly at a line break.
# Empty fragments are skipped so no blank row reaches the DataFrame.
# NOTE(review): splitting on ". " also cuts after abbreviations such as "Mr." and removes the
# trailing period from every sentence except the last one; a real sentence tokenizer would
# avoid both.
sentences = [sent for sent in ' '.join(transcript.split()).split('. ') if sent]

In [None]:
# One row per sentence, held in a single 'content' column
df = pd.DataFrame(
    data=sentences,
    columns=['content'],
)

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
import nltk

In [None]:
# Download every NLTK resource this notebook reads, so that it runs on a fresh environment.
# "stopwords" is needed by stopwords.words('english') in clean_text and "vader_lexicon" by
# SentimentIntensityAnalyzer; without them those calls raise LookupError.
# NOTE(review): the exact resource names expected by the POS tagger can differ between NLTK
# releases; confirm against the installed version.
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
    "stopwords",
    "vader_lexicon",
):
    nltk.download(dependency)

In [None]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the WordNet part-of-speech constant used for lemmatizing.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``, ``wordnet.VERB``,
    ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any other tag falls back to
    ``wordnet.NOUN``.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # unknown tag: treat the word as a noun
    return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text, digits=False, stop_words=False, lemmatize=False, only_noun=False):
    """Normalise a piece of text into a space-separated string of cleaned tokens.

    The text is lower-cased, split on whitespace, and each token has leading/trailing
    punctuation stripped. The optional filters then run in this order: digits, stop words,
    lemmatisation, noun selection. Empty tokens and single-character tokens are always
    removed.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        digits: If True, drop every token that contains a digit.
        stop_words: If True, drop tokens found in NLTK's English stop-word list.
        lemmatize: If True, POS-tag the tokens and lemmatize each with WordNet.
        only_noun: If True, keep only tokens whose POS tag starts with 'NN'.

    Returns:
        str: The remaining tokens joined by single spaces ('' when nothing is left).
    """
    # lower-case, then tokenize and remove punctuation around each token.
    # split() without an argument breaks on any whitespace run (tabs, newlines, repeated
    # spaces); split(" ") would leave e.g. "good\tmorning" as a single token.
    tokens = [word.strip(string.punctuation) for word in str(text).lower().split()]

    # remove words that contain numbers
    if digits:
        tokens = [word for word in tokens if not any(c.isdigit() for c in word)]

    # remove stop words (set membership instead of scanning the list for every token)
    if stop_words:
        stop = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop]

    # remove empty tokens (left over from tokens that were pure punctuation)
    tokens = [word for word in tokens if word]

    # POS-tag and lemmatize; a single lemmatizer instance is reused for all tokens
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag(tokens)]

    # select only nouns (tags NN, NNS, NNP, NNPS all start with 'NN')
    if only_noun:
        tokens = [word for word, tag in pos_tag(tokens) if tag[:2] == 'NN']

    # remove words with only one letter
    tokens = [word for word in tokens if len(word) > 1]

    return " ".join(tokens)

In [None]:
%%time

# Derive the cleaned-text column: digit-bearing tokens and stop words are dropped and the
# remaining tokens lemmatized (keyword arguments are forwarded to clean_text by apply)
df['content_clean'] = df['content'].apply(
    clean_text,
    digits=True,
    stop_words=True,
    lemmatize=True,
)

In [None]:
df.head()

In [None]:
# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score every cleaned sentence with VADER and append one column per score.
# The score frame is built in one go from the list of result dicts instead of expanding a
# temporary dict column with .apply(pd.Series), which constructs a Series per row.
# Naming the columns explicitly keeps them present even when df has no rows.
# NOTE(review): the scored text has had stop words and punctuation removed; if the stop-word
# list contains negations (e.g. "not"), polarity may be distorted. Confirm whether the raw
# 'content' column should be scored instead.
sid = SentimentIntensityAnalyzer()
vader_columns = ['neg', 'neu', 'pos', 'compound']
scores = pd.DataFrame(
    [sid.polarity_scores(sentence) for sentence in df['content_clean']],
    index=df.index,
    columns=vader_columns,
)
# Drop stale copies of the output columns first so that re-running this cell does not
# leave duplicated column names (e.g. two 'compound' columns) behind.
df = pd.concat(
    [df.drop(columns=['sentiment'] + vader_columns, errors='ignore'), scores],
    axis=1,
)

In [None]:
# Spell out the abbreviated VADER column names
df = df.rename(
    columns=dict(neu='neutral', neg='negative', pos='positive'),
)

In [None]:
df.head()

In [None]:
# Label each sentence with whichever of the three proportions is largest, and keep that
# largest proportion as the 'confidence' value.
# NOTE(review): these three columns are proportions of the text, so 'neutral' tends to be
# the largest; the usual VADER classification thresholds the 'compound' score instead.
# Confirm this labelling is intended.
df = df.assign(
    confidence=lambda frame: frame[["negative", "neutral", "positive"]].max(axis=1),
    sentiment=lambda frame: frame[["negative", "neutral", "positive"]].idxmax(axis=1),
)

In [None]:
df.head()

In [None]:
df['sentiment'].value_counts()

In [None]:
# visualization

In [None]:
# Count sentences per sentiment label as a two-column frame ('sentiment', 'count')
grouped = (
    df['sentiment']
    .value_counts()
    .rename_axis('sentiment')
    .reset_index(name='count')
)
print(grouped)

In [None]:
# Pie chart: share of sentences carrying each sentiment label
fig = px.pie(
    data_frame=grouped,
    names='sentiment',
    values='count',
    title='Sentiments',
)
fig.show()

In [None]:
# Fraction of sentences per sentiment label, as a plain dict {label: share}.
# Labels that never occur are absent from the dict at this point.
sentiment_ratio = df['sentiment'].value_counts(normalize=True).to_dict()

In [None]:
# Make sure all three labels have an entry; a label that never occurred gets a share of 0.0
for label in ('negative', 'neutral', 'positive'):
    sentiment_ratio.setdefault(label, 0.0)

In [None]:
sentiment_ratio

In [None]:
## Display sentiment score

# Combined share of neutral and positive sentences minus the share of negative ones
sentiment_score = sentiment_ratio['neutral'] + sentiment_ratio['positive'] - sentiment_ratio['negative']

# Single-number indicator; the delta is drawn relative to a reference value of 0.5
fig = go.Figure(
    data=go.Indicator(
        mode="number+delta",
        value=sentiment_score,
        delta=dict(reference=0.5),
        title=dict(text="Sentiment Score"),
    )
)

fig.show()

In [None]:
## Display where each sentiment occurs across the sentences
# One marker per sentence: the y position and colour encode the label, the marker size the
# confidence, and the sentence text appears on hover.
fig = px.scatter(
    df,
    y='sentiment',
    color='sentiment',
    size='confidence',
    hover_data=['content'],
    color_discrete_map={
        "negative": "firebrick",
        "neutral": "navajowhite",
        "positive": "darkgreen",
    },
)

# Keep this call last: its return value is what the cell displays
fig.update_layout(width=800, height=300)

In [None]:
# Display annotated transcript
# `display` and `HTML` come from the public IPython.display module; importing `display`
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

In [None]:
def annotate(record):
    """Wrap one sentence in an HTML ``<span>`` whose CSS classes carry its sentiment label.

    Args:
        record: Mapping with the keys 'sentiment' (emitted as a CSS class next to
            "highlight") and 'content' (the sentence text).

    Returns:
        str: ``<span class="highlight {sentiment}">{content} </span>`` with HTML-special
        characters in both values escaped, so text containing ``<``, ``>`` or ``&`` cannot
        break the markup.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    # The label sits inside a quoted attribute, so quotes are escaped there as well
    css_class = escape(str(record['sentiment']))
    # Element content only needs &, < and > escaped; apostrophes are left untouched
    body = escape(str(record['content']), quote=False)
    return f"""<span class="highlight {css_class}">{body} </span>"""

In [None]:
# Concatenate the annotated span of every sentence, in row order, into one HTML string
text = ''.join(
    annotate(row) for row in df[['content', 'sentiment']].to_dict('records')
)

In [None]:
# Render the concatenated spans as HTML in the cell output.
# NOTE(review): no CSS rules for the "highlight" / sentiment classes are visible in this
# notebook. Confirm a stylesheet is supplied elsewhere; otherwise the spans render as
# unstyled text.
display(HTML(text))