Load and merge the CSV files containing the Twitter data

In [21]:
import warnings
from io import BytesIO
from typing import List

import pandas as pd
import requests

def load_and_merge_csv_xz_from_github(years: List[int], companies: List[str], base_url: str) -> pd.DataFrame:
    """Download the per-year, per-company tweet files and merge them into one frame.

    One file named ``df_{year}_{company}.csv.xz`` is requested from ``base_url``
    for every (year, company) combination. Files whose request does not return
    HTTP 200 are skipped; a single warning listing them is emitted afterwards.

    Parameters
    ----------
    years : List[int]
        Years used to build the file names.
    companies : List[str]
        Company identifiers used to build the file names.
    base_url : str
        URL prefix the file name is appended to (joined with ``/``).

    Returns
    -------
    pd.DataFrame
        All downloaded rows, sorted by ``post_date`` and re-indexed from 0.
        The helper columns ``'Unnamed: 0'`` and ``'year'`` are removed when
        they are present.

    Raises
    ------
    ValueError
        If none of the requested files could be downloaded.
    """
    # Without a timeout, requests.get can block forever on a stalled connection.
    request_timeout_seconds = 60

    all_dataframes = []
    skipped_files = []

    for year in years:
        for company in companies:
            filename = f"df_{year}_{company}.csv.xz"
            file_url = f"{base_url}/{filename}"

            response = requests.get(file_url, timeout=request_timeout_seconds)
            if response.status_code != 200:
                # Best effort: not every (year, company) pair has to exist,
                # but remember what was skipped so it can be reported.
                skipped_files.append(f"{filename} (HTTP {response.status_code})")
                continue

            file_content = BytesIO(response.content)
            all_dataframes.append(pd.read_csv(file_content, compression='xz'))

    if not all_dataframes:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(
            f"No tweet files could be downloaded from {base_url!r}; "
            f"skipped: {', '.join(skipped_files) if skipped_files else 'nothing was requested'}"
        )

    if skipped_files:
        warnings.warn(
            "Skipped files that could not be downloaded: " + ", ".join(skipped_files),
            stacklevel=2,
        )

    merged_dataframe = pd.concat(all_dataframes, ignore_index=True)

    df_sorted = (
        merged_dataframe
        # Sort dataframe by date
        .sort_values(by='post_date')
        # 'Unnamed: 0' is the index column written by to_csv; 'year' was only
        # used to split the data into smaller csv files. errors='ignore' keeps
        # the load working for files that were saved without these columns.
        .drop(columns=['Unnamed: 0', 'year'], errors='ignore')
        # Reset the index of the dataframe and drop the old one
        .reset_index(drop=True)
    )

    return df_sorted

    
# Directory of the compressed per-year, per-company tweet files on GitHub
base_url = "https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data"

# Every (year, company) combination below is requested as a separate file
companies = ['AAPL', 'AMZN', 'GOOGL', 'TSLA', 'GOOG', 'MSFT']
years = list(range(2015, 2020))

tweets_df = load_and_merge_csv_xz_from_github(
    years=years,
    companies=companies,
    base_url=base_url,
)

# Optionally, display the first few rows of the merged DataFrame to verify it loaded correctly:
# tweets_df.head()


In [20]:
# Preview the merged tweets; as the bare last expression of the cell, the frame is rendered as its output.
tweets_df

Unnamed: 0,ticker_symbol,company_name,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num,year
0,AAPL,apple,550441509175443456,VisualStockRSRC,2015-01-01 00:00:57,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1,2015
1,AAPL,apple,550441672312512512,KeralaGuy77,2015-01-01 00:01:36,Insanity of today weirdo massive selling. $aap...,0,0,0,2015
2,AMZN,Amazon.com,550441732014223360,DozenStocks,2015-01-01 00:01:50,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0,2015
3,TSLA,Tesla Inc,550442977802207232,ShowDreamCar,2015-01-01 00:06:47,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1,2015
4,TSLA,Tesla Inc,550443808606126081,aaplstocknews,2015-01-01 00:10:05,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1,2015
...,...,...,...,...,...,...,...,...,...,...
4336440,TSLA,Tesla Inc,1212159838882533376,ShortingIsFun,2019-12-31 23:53:21,In 2020 I may start Tweeting out positive news...,0,0,1,2019
4336441,TSLA,Tesla Inc,1212160015332728833,Commuternyc,2019-12-31 23:54:03,Patiently Waiting for the no twitter sitter tw...,0,0,5,2019
4336442,AAPL,apple,1212160410692046849,MoriaCrypto,2019-12-31 23:55:37,I don't discriminate. I own both $aapl and $ms...,1,0,1,2019
4336443,MSFT,Microsoft,1212160410692046849,MoriaCrypto,2019-12-31 23:55:37,I don't discriminate. I own both $aapl and $ms...,1,0,1,2019


In [25]:
# Count the tweets whose 'body' text is missing
missing_values = tweets_df['body'].isnull().sum()

if missing_values == 0:
    print("There are no missing values in the 'body' column.")
else:
    print("There are", missing_values, "missing values in the 'body' column.")

There are no missing values in the 'body' column.


In [27]:
# Check for duplicate rows
# Select the rows that pandas flags as duplicates of another row
duplicate_rows = tweets_df.loc[tweets_df.duplicated()]

if duplicate_rows.empty:
    print("There are no duplicate rows in the DataFrame.")
else:
    print("There are duplicate rows in the DataFrame.")
    print(duplicate_rows)


There are no duplicate rows in the DataFrame.


Obtain a sentiment score for each tweet using FinBERT

In [28]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the model and tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Initialize the pipeline for sentiment analysis
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Upper bound on tokens per input; longer texts are truncated instead of
# making the model raise. NOTE(review): 512 is the usual BERT-base limit —
# confirm against the model config.
MAX_TOKENS = 512


def get_sentiment_score(text):
    """Run the sentiment pipeline on one text and return its first result.

    NOTE(review): the result is presumably a dict with 'label' and 'score'
    keys, as is usual for this pipeline type — confirm on a sample.
    """
    result = nlp(text, truncation=True, max_length=MAX_TOKENS)
    return result[0]


def get_sentiment_scores(texts, batch_size=64):
    """Run the sentiment pipeline on many texts in batches.

    Passing the whole list to the pipeline lets it batch the model calls
    instead of invoking the model once per row, which is what made the
    row-wise ``.apply`` over the full tweet set impractically slow.

    Parameters
    ----------
    texts : iterable of str
        Texts to score.
    batch_size : int, default 64
        Number of texts sent through the model per forward pass.

    Returns
    -------
    list
        One pipeline result per input text, in input order.
    """
    return nlp(list(texts), batch_size=batch_size, truncation=True, max_length=MAX_TOKENS)


# Score the 'body' column; the explicit index keeps each result aligned with its tweet
tweets_df['sentiment'] = pd.Series(get_sentiment_scores(tweets_df['body']), index=tweets_df.index)



KeyboardInterrupt: 

Weighing:
- Weight each tweet according to its engagement, measured by its number of likes, comments, and retweets
    - Assign different weights to likes, comments, and retweets, e.g. likes = 1.5, comments = 2, retweets = 3 (according to a suggestion by Muñoz-Expósito et al. (2017))
- Group the weighted tweets by company for each day
    - Weigh the intensity of the sentiment score by normalized number of tweets?
- Weight the impact of each company on the index according to its market cap.
    - In S&P 500 Twitter index: Index constituents are float-adjusted market capitalization (FMC) weighted, subject to a single constituent weight cap of 10%
        - FMC = Share Price × (Total Outstanding Shares × Free Float Percentage). This is not relevant for the Nasdaq; FMC weighting is only applied to the S&P 500.
        - The NDX's weights are adjusted quarterly
        - We probably don't need a cap as we're only including 5 big companies


Weighing:
 - Weigh by importance of company
    - Based on market value? 'market importance' -> how would that be measured?
    - Weigh based on volume traded? or weigh based on number of tweets? 
        - Could factor in the number of tweets per day normalized by the total number of tweets for that company
 - Weigh tweets by their number of likes/retweets/comments
 
- *Should we also weigh the importance of each day? For example, if the sentiment on a day is extreme but the number of tweets is very low (say, only 50 tweets, all very positive), should that day count as less informative than a day with a greater volume of tweets, so that its score is pulled from 1 towards neutral?*