In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import feedparser
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import seaborn as sns
import concurrent.futures
import yfinance as yf
from urllib.parse import quote
import pickle
import os
import time

In [2]:
file_path = 'justetf.csv'
etf_data = pd.read_csv(file_path)
etf_data
etf_data = etf_data.dropna(subset=['ytdReturnCUR'])

In [3]:
if 'fundSizeMillions' in etf_data.columns:
    # Remove commas and convert to numeric values
    etf_data['fundSizeMillions'] = pd.to_numeric(
        etf_data['fundSizeMillions'].replace(',', '', regex=True), errors='coerce'
    ).fillna(0).astype(int)

# Verify the data types after conversion and inspect the first few rows
etf_data.dtypes['fundSizeMillions'], etf_data['fundSizeMillions'].sort_values()

(dtype('int64'),
 949         0
 1761        0
 2079        0
 1765        0
 208         0
         ...  
 686     12892
 1104    14753
 1832    26091
 1537    43427
 1824    48857
 Name: fundSizeMillions, Length: 2233, dtype: int64)

In [4]:
# List of countries for each group
emerging_markets_countries = [
    'China', 'India', 'Brazil', 'Indonesia', 'Mexico', 'Russia', 'Argentina',
    'South Africa', 'Philippines', 'Saudi Arabia', 'Poland', 'Taiwan', 'Thailand',
    'Malaysia', 'Colombia', 'South Korea', 'Turkey'
]

developed_markets_countries = [
    'Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States'
]

# Function to calculate combined exposure for a list of countries
def calculate_combined_exposure(df, countries):
    combined_exposure = sum([df[f'exposureCountry_{country}'] for country in countries if f'exposureCountry_{country}' in df.columns])
    return combined_exposure

# Calculate combined exposures
etf_data['combinedEmergingMarkets'] = calculate_combined_exposure(etf_data, emerging_markets_countries)
etf_data['combinedDevelopedMarkets'] = calculate_combined_exposure(etf_data, developed_markets_countries)

# Set a threshold for significant combined exposure
combined_threshold = 0.5
fund_size_threshold = 50  # in millions

# Filter ETFs by combined exposure
etfs_emerging_markets = etf_data[
    (etf_data['combinedEmergingMarkets'] >= combined_threshold) &
    (etf_data['fundSizeMillions'] >= fund_size_threshold)
]

etfs_developed_markets = etf_data[
    (etf_data['combinedDevelopedMarkets'] >= combined_threshold) &
    (etf_data['fundSizeMillions'] >= fund_size_threshold)
]

# Display the count of ETFs for each group
{
    "Emerging Markets": etfs_emerging_markets.shape[0],
    "Developed Markets": etfs_developed_markets.shape[0]
}

{'Emerging Markets': 91, 'Developed Markets': 774}

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Define features to exclude
exclude_columns = ['isin', 'wkn', 'name', 'fundProvider', 'legalStructure', 'ytdReturnCUR']
# Select numeric columns and exclude non-relevant features
numeric_columns = etf_data.select_dtypes(include=np.number).columns.tolist()
feature_columns = [col for col in numeric_columns if col not in exclude_columns]


In [6]:
# Filter the relevant data
X_emerging = etfs_emerging_markets[feature_columns].fillna(0)
y_emerging = etfs_emerging_markets['ytdReturnCUR']

X_developed = etfs_developed_markets[feature_columns].fillna(0)
y_developed = etfs_developed_markets['ytdReturnCUR']

# Split each dataset into train and test sets
X_train_emerging, X_test_emerging, y_train_emerging, y_test_emerging = train_test_split(X_emerging, y_emerging, test_size=0.2, random_state=42)
X_train_developed, X_test_developed, y_train_developed, y_test_developed = train_test_split(X_developed, y_developed, test_size=0.2, random_state=42)

# Normalize the features
scaler_emerging = StandardScaler()
scaler_developed = StandardScaler()

X_train_emerging_scaled = scaler_emerging.fit_transform(X_train_emerging)
X_test_emerging_scaled = scaler_emerging.transform(X_test_emerging)

X_train_developed_scaled = scaler_developed.fit_transform(X_train_developed)
X_test_developed_scaled = scaler_developed.transform(X_test_developed)

# Check the data shapes for both markets
(X_train_emerging_scaled.shape, X_test_emerging_scaled.shape, y_train_emerging.shape, y_test_emerging.shape,
 X_train_developed_scaled.shape, X_test_developed_scaled.shape, y_train_developed.shape, y_test_developed.shape)

((72, 104), (19, 104), (72,), (19,), (619, 104), (155, 104), (619,), (155,))

In [7]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Function to build and train an MLP model using scikit-learn
def train_mlp_model(X_train, y_train, X_test, y_test):
    model = MLPRegressor(hidden_layer_sizes=(64, 32, 16), activation='relu', max_iter=500, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    return model, mse, mae

# Train the MLP models for emerging and developed markets
model_emerging, mse_emerging, mae_emerging = train_mlp_model(X_train_emerging_scaled, y_train_emerging, X_test_emerging_scaled, y_test_emerging)
model_developed, mse_developed, mae_developed = train_mlp_model(X_train_developed_scaled, y_train_developed, X_test_developed_scaled, y_test_developed)

mse_emerging, mae_emerging, mse_developed, mae_developed

(0.013135818640021424,
 0.08058535451859156,
 0.00828440155728867,
 0.06163909017369867)

In [8]:
from sklearn.metrics import r2_score

# Predict on the test datasets
y_pred_emerging = model_emerging.predict(X_test_emerging_scaled)
y_pred_developed = model_developed.predict(X_test_developed_scaled)

# Calculate R² scores
r2_emerging = r2_score(y_test_emerging, y_pred_emerging)
r2_developed = r2_score(y_test_developed, y_pred_developed)

r2_emerging, r2_developed

(-7.258948752051156, -4.322152879624173)

In [9]:
sia = SentimentIntensityAnalyzer()

# Function to encode strings to URL-friendly format
def encode_string_to_url(input_string):
    return quote(input_string)

# Construct an RSS URL with economic search terms
def construct_rss_url(after, before, search_terms):
    search_term = '%20OR%20'.join([encode_string_to_url(term) for term in search_terms])
    endpoint = f'https://news.google.com/rss/search?q={search_term}+after:{after}+before:{before}&ceid=US:en&hl=en-US&gl=US'
    return endpoint

# Function to parse an RSS feed URL and return the articles
def parse_rss_feed(url):
    feed = feedparser.parse(url)
    if feed.bozo == 0:
        return [entry.title + " " + entry.summary for entry in feed.entries]
    else:
        raise Exception('Failed feed pull')

# Function to process a specific date range and calculate sentiment
def process_date(date, search_terms):
    after = date.strftime('%Y-%m-%d')
    before = (date + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    url = construct_rss_url(after, before, search_terms)
    articles = parse_rss_feed(url)
    
    # Calculate sentiment scores
    sentiments = [sia.polarity_scores(article) for article in articles]
    avg_sentiment = sum([s['compound'] for s in sentiments]) / len(sentiments) if sentiments else 0
    return len(articles), avg_sentiment

# Load existing data if available
def load_existing_data(pickle_file, default_data):
    if os.path.exists(pickle_file):
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)
    else:
        return default_data

# Save data incrementally
def save_data(pickle_file, data):
    with open(pickle_file, 'wb') as f:
        pickle.dump(data, f)

search_terms_dict = {
    "Emerging": {country: [f'{country} economy', f'{country} GDP', 'economic outlook'] for country in emerging_markets_countries},
    "Developed": {country: [f'{country} economy', f'{country} GDP', 'economic outlook'] for country in developed_markets_countries}
}

# Define date range
date_range = pd.date_range(start='2023-01-01', end='2023-12-31')
pickle_file = 'news_sentiment_stratified.pkl'
default_data = {market: {country: pd.DataFrame(index=date_range, columns=['article_count', 'avg_sentiment'])
                         for country in search_terms_dict[market].keys()}
                for market in search_terms_dict.keys()}

# Load existing or initialize new data
data = load_existing_data(pickle_file, default_data)

# Process news data in parallel for each market and country
for market in search_terms_dict.keys():
    for country, search_terms in search_terms_dict[market].items():
        df = data[market][country]
        
        # Identify dates that still need processing
        unprocessed_dates = df[df['article_count'].isna()].index
        
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(lambda d: process_date(d, search_terms), unprocessed_dates), total=len(unprocessed_dates)))
        
        for i, date in enumerate(unprocessed_dates):
            df.at[date, 'article_count'] = results[i][0]
            df.at[date, 'avg_sentiment'] = results[i][1]
        
        # Save progress for each country to the pickle file
        save_data(pickle_file, data)
        
        # Add a sleep interval to prevent rate limiting
        time.sleep(1)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|█████████████████████████████████████████| 365/365 [00:16<00:00, 21.64it/s]
100%|█████████████████████████████████████████| 365/365 [00:18<00:00, 20.20it/s]
100%|█████████████████████████████████████████| 365/365 [00:19<00:00, 18.99it/s]
100%|█████████████████████████████████████████| 365/365 [00:18<00:00, 20.28it/s]
100%|█████████████████████████████████████████| 365/365 [00:17<00:00, 21.18it/s]
100%|█████████████████████████████████████████| 365/365 [00:16<00:00, 22.37it/s]
100%|█████████████████████████████████████████| 365/365 [00:18<00:00, 19.76it/s]
100%|█████████████████████████████████████████| 365/365 [00:20<00:00, 17.45it/s]


Unnamed: 0,article_count,avg_sentiment
2023-01-01,10,-0.15202
2023-01-02,10,-0.02719
2023-01-03,16,0.102087
2023-01-04,12,-0.0661
2023-01-05,10,-0.20356
...,...,...
2023-12-27,17,-0.030976
2023-12-28,10,0.00992
2023-12-29,10,0.17868
2023-12-30,10,-0.03726
