In [2]:
import requests
from io import BytesIO
from typing import List
import pandas as pd

def load_and_merge_csv_xz_from_github(years: List[int], companies: List[str], base_url: str) -> pd.DataFrame:
    all_dataframes = []
    
    for year in years:
        for company in companies:
            filename = f"df_{year}_{company}.csv.xz"
            file_url = f"{base_url}/{filename}"
            
            response = requests.get(file_url)
            if response.status_code == 200:
                file_content = BytesIO(response.content)
                df = pd.read_csv(file_content, compression='xz')
                all_dataframes.append(df)

    merged_dataframe = pd.concat(all_dataframes, ignore_index=True)

    # Sort dataframe by date
    df_sorted = merged_dataframe.sort_values(by='post_date')

    # Remove the 'Unnamed: 0' column
    df_sorted = df_sorted.drop(columns=['Unnamed: 0'])
    
    # Drop 'year' columns that was used previously to split dataframes and save them as smaller csv files 
    df_sorted = df_sorted.drop(columns=['year'])


    # Reset the index of the dataframe and drop the old one
    df_sorted = df_sorted.reset_index(drop=True)

    return df_sorted

    
base_url = "https://raw.githubusercontent.com/inga-maria01/master_thesis/main/data"
years = [2015, 2016, 2017, 2018, 2019]
companies = ['AAPL', 'AMZN', 'GOOGL', 'TSLA', 'GOOG', 'MSFT']
tweets_df = load_and_merge_csv_xz_from_github(years, companies, base_url)


In [3]:
import re

def filter_tweets_v7(df):
    df = df.copy()

    print("Initial dataset length:", len(df))

    # Normalize URLs in the tweet bodies - replace with "URL" or remove
    df['body'] = df['body'].apply(lambda text: re.sub(r'http\S+', 'URL', text))

    # Remove all duplicates based on 'tweet_id' and 'ticker_symbol' (seen as spam)
    df.drop_duplicates(subset=['tweet_id', 'ticker_symbol'], inplace=True, keep = False)
    print("After removing duplicate tweet_ids and ticker_symbols:", len(df))

    # Remove last duplicates based on 'tweet_id' and 'company_name'(will only apply to Google as they have different ticker_symbols)
    df.drop_duplicates(subset=['tweet_id', 'company_name'], inplace=True, keep = 'first')
    print("After removing duplicate tweet_ids and company_names (Google):", len(df))

    # Remove tweets with duplicate texts longer than 5 words considering the ticker symbol
    df['word_count'] = df['body'].apply(lambda text: len(text.split()))
    duplicates = df[(df['word_count'] > 5)].duplicated(subset=['body', 'ticker_symbol'], keep=False)
    df = df.loc[~df.index.isin(duplicates[duplicates].index)]
    print("After removing long duplicate texts:", len(df))

    # Apply counting functions
    df['hashtag_count'] = df['body'].apply(lambda text: sum(1 for word in text.split() if word.startswith('#')))
    df['cashtag_count'] = df['body'].apply(lambda text: sum(1 for word in text.split() if word.startswith('$')))
    df['mention_count'] = df['body'].apply(lambda text: sum(1 for word in text.split() if word.startswith('@')))

    # Filter tweets based on hashtag and cashtag counts
    df = df[(df['cashtag_count'] < 5) & (df['hashtag_count'] < 8)]
    print("After filtering by cashtags and hashtags:", len(df))

    # Ratio filters (cashtags, hashtags, mentions to words)
    df['cashtag_ratio'] = df['cashtag_count'] / df['word_count'].replace(0, 1)
    df['hashtag_ratio'] = df['hashtag_count'] / df['word_count'].replace(0, 1)
    df['mention_ratio'] = df['mention_count'] / df['word_count'].replace(0, 1)
    df = df[(df['cashtag_ratio'] <= 0.5) & (df['hashtag_ratio'] <= 0.5) & (df['mention_ratio'] <= 0.5)]
    print("After filtering by ratios:", len(df))

    # Keyword filter (defined by Wilksch, Abramova (2023) PyFin-sentiment: Towards a machine-learning-based model for deriving sentiment from financial tweets)
    keywords = ['bitcoin', 'etherium', 'btc', 'eth', 'nft', 'token', 'wallet', 'web3',
                'airdrop', 'wagmi', 'solana', 'opensea', 'cryptopunks', 'uniswap', 
                'lunar', 'hodl', 'binance', 'coinbase', 'cryptocom', 'doge']
    df['keyword_count'] = df['body'].apply(lambda text: sum(text.lower().count(kw) for kw in keywords))
    df = df[df['keyword_count'] <= 2]
    print("After filtering by keyword count:", len(df))

    return df


filtered_df_7 = filter_tweets_v7(tweets_df)


Initial dataset length: 4336445
After removing duplicate tweet_ids and ticker_symbols: 4336445
After removing duplicate tweet_ids and company_names (Google): 4288706
After removing long duplicate texts: 3196398
After filtering by cashtags and hashtags: 2529106
After filtering by ratios: 2508943
After filtering by keyword count: 2508222


In [4]:
# limit magnitude of preparation as it doesn't seem to be necessary according to example in pyfin documentation
import re

def preprocess_text_column(df):
    df = df.copy()  # Create a copy to avoid modifying the original DataFrame

    def transform_text(text):
        # Replace cashtags with 'TICKER'
        # text = re.sub(r'\$\w+', 'TICKER', text)
        
        # Replace mentions with '@user'
        text = re.sub(r'@\w+', '@user', text)
        
        # Replace all digits with '9'
        # text = re.sub(r'\d', '9', text)
        
        # Replace newlines with spaces
        text = text.replace('\n', ' ')
        
        # Convert to lowercase
        text = text.lower()

        # Remove repeated characters more than twice
        text = re.sub(r'(.)\1{2,}', r'\1', text)
        
        return text

    # Apply the transformation to the 'body' column
    df['body'] = df['body'].apply(transform_text)
    
    return df

# # To test the function, you would use:
# df = pd.DataFrame({'body': ['Helloooo!!!! How are youuuu???']})
# print(preprocess_text_column(df))

In [5]:
# Preprocess text column
preprocessed_df = preprocess_text_column(filtered_df_7)


In [6]:
# WARNINGS CAN BE IGNORED (PROOF THROUGH COMPARING SENTIMENT LABELS OBTAINED BY 1.1.1 AND 1.3.0)
from pyfin_sentiment.model import SentimentModel
import numpy as np

# Initialize the sentiment model
# SentimentModel.download("small")  # This line is commented because it's assumed the model is already downloaded
model = SentimentModel("small")

def apply_batch_sentiment(texts, batch_size=1000):
    # Initialize an empty list to store sentiment results
    sentiments = []
    
    # Process texts in batches
    total_texts = len(texts)
    for i in range(0, total_texts, batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_predictions = model.predict_proba(batch_texts)
        sentiments.extend(batch_predictions)
        
        # Print progress
        processed = min(i + batch_size, total_texts)
        print(f"Processed {processed}/{total_texts} texts")
    
    return sentiments

# Assume preprocessed_df is your DataFrame and has already been defined
# Apply sentiment analysis in batches to the 'body' column and store the results in a new column
preprocessed_df['sentiment'] = apply_batch_sentiment(preprocessed_df['body'].tolist(), batch_size=1000)

# Now the DataFrame preprocessed_df includes a new column 'sentiment' with sentiment labels


Processed 1000/2508222 texts
Processed 2000/2508222 texts
Processed 3000/2508222 texts
Processed 4000/2508222 texts
Processed 5000/2508222 texts
Processed 6000/2508222 texts
Processed 7000/2508222 texts
Processed 8000/2508222 texts
Processed 9000/2508222 texts
Processed 10000/2508222 texts
Processed 11000/2508222 texts
Processed 12000/2508222 texts
Processed 13000/2508222 texts
Processed 14000/2508222 texts
Processed 15000/2508222 texts
Processed 16000/2508222 texts
Processed 17000/2508222 texts
Processed 18000/2508222 texts
Processed 19000/2508222 texts
Processed 20000/2508222 texts
Processed 21000/2508222 texts
Processed 22000/2508222 texts
Processed 23000/2508222 texts
Processed 24000/2508222 texts
Processed 25000/2508222 texts
Processed 26000/2508222 texts
Processed 27000/2508222 texts
Processed 28000/2508222 texts
Processed 29000/2508222 texts
Processed 30000/2508222 texts
Processed 31000/2508222 texts
Processed 32000/2508222 texts
Processed 33000/2508222 texts
Processed 34000/250

In [11]:
preprocessed_df['sentiment'][1]

array([0.30783402, 0.22237089, 0.46979509])

In [15]:
sentiment_score = 1*preprocessed_df['sentiment'][:][0] + 0*preprocessed_df['sentiment'][:][1] + -1*preprocessed_df['sentiment'][:][2]

KeyError: 0

In [None]:
s