In [150]:
import pandas as pd
import warnings
import re
import nltk
import plotly.express as px
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources this notebook relies on: the VADER lexicon (used by
# SentimentIntensityAnalyzer), the punkt tokenizer models (presumably needed by
# nltk.word_tokenize) and the English stop-word list.
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [151]:
# Load the raw tweet table from the Excel workbook next to the notebook.
amz = pd.read_excel('Amazon.xlsx')

In [152]:
# Preview the dtypes obtained when the text columns are cast to the 'string' dtype.
# The converted frame is not assigned, so `amz` itself is left unchanged.
amz.astype({'Tweet content': 'string', 'Tweet language': 'string', 'Company Name': 'string'}).dtypes

Date              datetime64[ns]
Tweet content             string
Tweet language            string
Is a RT                     bool
Company Name              string
dtype: object

In [153]:
# Compiled once when the cell runs instead of on every call.
# The former range U+24C2..U+1F251 has been narrowed to the single code point
# U+24C2: the wide range covered most of the Basic Multilingual Plane and
# therefore deleted ordinary CJK, Hangul and kana text as if it were emoji.
# Sub-ranges that were already contained in a wider one are folded into it.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes (emoticons, pictographs, transport, flags, ...)
    "\u2500-\u2BEF"          # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\u24C2"                 # circled latin capital letter M
    "\u200d"                 # zero-width joiner used in emoji sequences
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "]+"
)


def remove_emojis(data):
    """Return `data` with emoji and pictographic symbols removed.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        `data` without any character matched by `_EMOJI_PATTERN`. Letters of
        non-Latin scripts in the Basic Multilingual Plane are preserved.
    """
    return _EMOJI_PATTERN.sub('', data)

In [154]:
# Lazily filled cache: the stop-word list is read once and reused for every tweet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_tweet(tweet):
    """Clean one tweet for sentiment analysis.

    Steps, in order: strip URLs, @mentions, digits and every character that is
    not an ASCII letter or whitespace; run `remove_emojis`; lowercase; drop
    English stop words.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Remove URLs, user mentions, special characters, punctuation and numbers.
    tweet = re.sub(r'http\S+|www\S+|https\S+|@[^\s]+|[^a-zA-Z\s]|\d+','', tweet)
    # The substitution above already leaves only ASCII letters and whitespace,
    # so this call is a safety net that should rarely change anything.
    tweet = remove_emojis(tweet)

    tweet = tweet.lower()

    # Set membership against a cached stop-word set; the text is already
    # lowercase, so no per-word lower() is needed.
    stop_words = _english_stopwords()
    clean_words = [word for word in tweet.split() if word not in stop_words]

    return ' '.join(clean_words)


# Word tokenization
def word_tokenization(tweet):
    """Split `tweet` into word tokens with NLTK's `word_tokenize` and return them."""
    return nltk.word_tokenize(tweet)


# Start from the metadata columns of the raw frame; .copy() keeps `amz` untouched.
# (The former `amz_1 = pd.DataFrame()` was overwritten immediately and is dropped.)
amz_1 = amz[['Date', 'Tweet language', 'Is a RT', 'Company Name']].copy()

# Clean every tweet, then split the cleaned text into a list of word tokens.
amz_1['Tweet content'] = amz['Tweet content'].apply(preprocess_tweet).apply(word_tokenization)


In [155]:
# Make sure the VADER lexicon is available. The first cell of the notebook
# already requests the same package, so this is a repeat of that download call.
nltk.download('vader_lexicon')

# Shared VADER analyzer instance, used by get_sentiment_score below.
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(words):
    """Return VADER's compound polarity score for a sequence of tokens.

    The tokens are joined with single spaces into one sentence, which is then
    scored with the module-level analyzer `sid`; only the 'compound' entry of
    the resulting score dictionary is returned.
    """
    return sid.polarity_scores(' '.join(words))['compound']

# Score every tweet: 'Tweet content' holds the token list produced by
# word_tokenization, which get_sentiment_score joins and passes to VADER.
amz_1['Sentiment Score'] = amz_1['Tweet content'].apply(get_sentiment_score)

def categorize_sentiment(score, threshold=0.05):
    """Map a compound sentiment score to a label.

    Parameters
    ----------
    score : float
        Compound sentiment score.
    threshold : float, optional
        Half-width of the neutral band. Defaults to 0.05, the cut-off that was
        previously hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    str
        'Positive' if score >= threshold, 'Negative' if score <= -threshold,
        otherwise 'Neutral' (this includes NaN, for which both comparisons
        are false).
    """
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'

# Turn each numeric score into a 'Positive' / 'Negative' / 'Neutral' label.
amz_1['Sentiment'] = amz_1['Sentiment Score'].apply(categorize_sentiment)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [156]:
import yfinance as yf

# Ticker symbol for Amazon on Yahoo Finance
ticker_symbol = 'AMZN'

# Date window requested from Yahoo Finance.
# NOTE(review): yfinance documents `end` as exclusive, so 2016-06-15 itself is
# presumably not part of the result — confirm this matches the tweet date range.
start_date = '2016-04-25'
end_date = '2016-06-15'

# Download the price history for the ticker over that window (needs network access).
amz_stock_data = yf.download(ticker_symbol, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [157]:
def plot_daily_avg_sentiment(daily_avg_sentiment):
    """Plot the mean 'Sentiment Score' per calendar day as a Plotly line chart.

    Parameters
    ----------
    daily_avg_sentiment : pandas.DataFrame
        Frame with a 'Date' column (datetimes or dates) and a numeric
        'Sentiment Score' column, one row per tweet.

    Side effects
    ------------
    The 'Date' column of the frame that is passed in is overwritten with
    `datetime.date` values. This in-place change is kept on purpose because
    later cells in this notebook rely on the truncated dates.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.
    """
    # pd.to_datetime makes the function safe to call again: after the first
    # call 'Date' holds plain date objects, on which `.dt` is not available.
    daily_avg_sentiment['Date'] = pd.to_datetime(daily_avg_sentiment['Date']).dt.date
    daily_means = daily_avg_sentiment.groupby('Date')['Sentiment Score'].mean().reset_index()

    # Plot the daily average sentiment score
    fig = px.line(daily_means, x='Date', y='Sentiment Score',
                  title='Daily Average Sentiment Score Over Time',
                  labels={'Sentiment Score': 'Average Sentiment Score', 'Date': 'Date'})
    fig.update_layout(xaxis_tickangle=-45, xaxis=dict(tickformat='%Y-%m-%d'), yaxis=dict(gridcolor='lightgrey'))
    fig.show()

In [158]:
# Plot the daily mean sentiment of the word-tokenized tweets.
# Note: the function overwrites amz_1['Date'] with plain dates as a side effect.
plot_daily_avg_sentiment(amz_1)

In [159]:
# Restore 'Date' to datetime64 (plot_daily_avg_sentiment replaced it with
# datetime.date objects, so the time of day is gone) and cast the text columns
# to the 'string' dtype.
# NOTE(review): 'Tweet content' holds token lists at this point, so the cast
# presumably stores each list's string representation — confirm this is intended.
amz_1 = amz_1.astype({'Date': 'datetime64[ns]', 'Tweet content': 'string', 'Tweet language': 'string', 'Company Name': 'string'})

In [160]:
# Keep the rows from June 7, 2016 through June 12, 2016 (both days included).
# The upper bound is written as "< June 13" rather than "<= June 12": a bare
# date compares as midnight, so "<=" would drop every June 12 row that carries
# a time of day. With dates truncated to midnight both forms select the same rows.
filtered_amz_1 = amz_1[(amz_1['Date'] >= '2016-06-07') & (amz_1['Date'] < '2016-06-13')]

# Number of tweets per calendar day inside the window
daily_counts = filtered_amz_1.groupby(filtered_amz_1['Date'].dt.date).size()

# Print the daily counts
print(daily_counts)

Date
2016-06-07     87
2016-06-12    156
dtype: int64


In [161]:
# Move 'Date' from the index into a regular column so Plotly can use it as x.
# Guarded so that re-running the cell does not reset the index a second time,
# which would insert a spurious 'index' column into the frame.
if 'Date' not in amz_stock_data.columns:
    amz_stock_data = amz_stock_data.reset_index()

# Line chart of the closing price over time
fig = px.line(amz_stock_data, x='Date', y='Close', title='Amazon Closing Prices',
              labels={'Date': 'Date', 'Close': 'Close Price'})
fig.show()

In [162]:
# import os
# from transformers import BertTokenizer

# # Set the Hugging Face token
# os.environ["HUGGINGFACE_TOKEN"] = "<REDACTED>"  # token removed: it was committed in plain text and should be revoked; supply it via the environment or getpass instead

# # Load the tokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# def subword_tokenization(text):
#     # Initialize the BERT tokenizer
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#     # Tokenize the text
#     tokens = tokenizer.tokenize(text)

#     # Convert the tokens to subword units
#     subword_tokens = tokenizer.convert_tokens_to_ids(tokens)

#     return subword_tokens

In [163]:
# Second working frame for the subword-tokenization variant: metadata columns
# copied from the raw frame so `amz` stays untouched.
# (The former `amz_2 = pd.DataFrame()` was overwritten immediately and is dropped.)
amz_2 = amz[['Date', 'Tweet language', 'Is a RT', 'Company Name']].copy()

# Cleaned tweet text, kept as a single string per tweet (no word tokenization here).
amz_2['Tweet content'] = amz['Tweet content'].apply(preprocess_tweet)

In [164]:
# from transformers import PreTrainedTokenizerFast

# def subword_tokenization(text, tokenizer):
#     # Tokenize the text into subwords
#     subword_tokens = tokenizer.tokenize(text)

#     return subword_tokens

# # Load a pre-trained tokenizer
# tokenizer = PreTrainedTokenizerFast.from_pretrained("bert-base-uncased")

# # Apply subword tokenization to the DataFrame
# amz_2['Tweet content'] = amz_2['Tweet content'].apply(lambda text: subword_tokenization(text, tokenizer))

In [165]:
from transformers import BertTokenizer

# Load the pre-trained 'bert-base-uncased' tokenizer; tokenize_text below uses
# this module-level instance.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    """Return the tokens that the module-level BERT `tokenizer` produces for `text`."""
    return tokenizer.tokenize(text)

# Tokenize the cleaned text of every tweet with the BERT tokenizer and store
# the resulting token list in a new column.
amz_2['Tokenized content'] = amz_2['Tweet content'].apply(tokenize_text)

# Show the cleaned text next to its tokens for the first five rows
print(amz_2[['Tweet content', 'Tokenized content']].head())

                                       Tweet content  \
0  try discovery engine amp social network invest...   
1  andreas halvorsen buys facebook amazon masterc...   
2  rt stocks surprise investors earnings season a...   
3  oneplus launched price rs available amazon rea...   
4  ocado shares plummet amazon fresh launches rea...   

                                   Tokenized content  
0  [try, discovery, engine, amp, social, network,...  
1  [andreas, hal, ##vor, ##sen, buys, facebook, a...  
2  [rt, stocks, surprise, investors, earnings, se...  
3  [one, ##pl, ##us, launched, price, rs, availab...  
4  [o, ##ca, ##do, shares, plum, ##met, amazon, f...  


In [166]:
# Inspect the first rows of amz_2, including the new 'Tokenized content' column.
amz_2.head()

Unnamed: 0,Date,Tweet language,Is a RT,Company Name,Tweet content,Tokenized content
0,2016-06-15 09:45:00,en,False,Amazon,try discovery engine amp social network invest...,"[try, discovery, engine, amp, social, network,..."
1,2016-06-15 09:35:00,en,False,Amazon,andreas halvorsen buys facebook amazon masterc...,"[andreas, hal, ##vor, ##sen, buys, facebook, a..."
2,2016-06-15 09:26:00,en,True,Amazon,rt stocks surprise investors earnings season a...,"[rt, stocks, surprise, investors, earnings, se..."
3,2016-06-15 09:21:00,en,False,Amazon,oneplus launched price rs available amazon rea...,"[one, ##pl, ##us, launched, price, rs, availab..."
4,2016-06-15 08:53:00,en,False,Amazon,ocado shares plummet amazon fresh launches rea...,"[o, ##ca, ##do, shares, plum, ##met, amazon, f..."


In [167]:
# Score each tweet from its BERT token list (tokens are joined with spaces
# inside get_sentiment_score before being passed to VADER).
# NOTE(review): the tokens are word pieces (e.g. 'plum', '##met' in the sample
# printed above), so VADER is given word fragments rather than whole words —
# confirm this is the intended comparison against the word-tokenized scores.
amz_2['Sentiment Score'] = amz_2['Tokenized content'].apply(get_sentiment_score)

# Turn each numeric score into a 'Positive' / 'Negative' / 'Neutral' label.
amz_2['Sentiment'] = amz_2['Sentiment Score'].apply(categorize_sentiment)

In [168]:
# Mean sentiment score per distinct 'Date' value. 'Date' still carries the time
# of day here (e.g. 2016-06-15 09:45:00), so this groups tweets sharing the
# same timestamp, not tweets of the same calendar day.
amz_3 = amz_2.groupby('Date')['Sentiment Score'].mean().reset_index()

In [169]:
date_column = 'Date'
score_column = 'Sentiment Score'
# Per-timestamp means in chronological order (kept available for inspection).
amz_3_sorted = amz_3.sort_values(by=date_column)

# Daily average computed directly from the per-tweet scores in amz_2.
# Averaging the per-timestamp means of amz_3 instead would give a mean of
# means: every timestamp would weigh the same no matter how many tweets it
# holds, which is not the daily average over tweets.
# Days without tweets inside the covered range appear as NaN.
daily_avg_sentiment = (
    amz_2.groupby(pd.Grouper(key=date_column, freq='D'))[score_column]
    .mean()
    .reset_index()
)

# Plot the daily average sentiment scores using Plotly Express
fig = px.line(daily_avg_sentiment, x=date_column, y=score_column, title='Daily Average Sentiment Scores')
fig.show()