# Portfolio Replica: Initial Exploratory Data Analysis

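This notebook sets up the Colab environment, loads the raw dataset (15 series observed weekly from 2007-10-23 to 2021-04-20), and runs first sanity checks on it: shape, summary statistics, missing values, and sampling frequency. The final section pushes the results to GitHub.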

# General Setup

In [10]:
%%bash
# Fetch (or refresh) the project repository under /content and install its
# Python dependencies.
#
# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so a broken clone or install is reported here instead of
# surfacing later as a confusing ImportError.
set -euo pipefail
cd /content

REPO=https://github.com/jacopo-raffaeli/portfolio-replica.git
DIR=portfolio-replica

# Clone if needed, else pull latest.
# --quiet hides progress output but still prints errors.
if [ ! -d "$DIR" ]; then
  git clone --quiet "$REPO" "$DIR"
else
  # Refreshing is best-effort: if the pull fails (offline, diverged history),
  # keep working with the existing checkout but say so.
  git -C "$DIR" pull --quiet origin main \
    || echo "WARNING: 'git pull' failed; continuing with the existing checkout." >&2
fi

# Enter project root and install dependencies
cd "$DIR"
pip install --quiet -r requirements.txt


In [37]:
# Standard library
import os
import sys
import warnings
import random
import logging
import pickle

# Core scientific stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
# Make the repository root and its src/ directory importable.
PROJECT_ROOT = "/content/portfolio-replica"

# Each entry is checked on its own: src/ must still be added when the root is
# already on sys.path, and re-running the cell must not add duplicates.
for import_dir in (PROJECT_ROOT, os.path.join(PROJECT_ROOT, 'src')):
    if import_dir not in sys.path:
        sys.path.append(import_dir)

# Set working directory so the relative data paths used later resolve
# against the repository root.
os.chdir(PROJECT_ROOT)
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content/portfolio-replica


In [13]:
# Notebook-wide configuration: warnings, display options, plot style,
# random seeds and logging.

# Suppress warnings
# NOTE(review): this silences *all* warnings, including deprecation notices
# from pandas/numpy; consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Pandas display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 3)

# Seaborn and Matplotlib display options
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12

# Set reproducible seeds
random.seed(42)
np.random.seed(42)

# Configure logging.
# basicConfig() does nothing when the root logger already has handlers, which
# is common inside notebook kernels; force=True replaces any existing handlers
# so the level and format below actually take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    force=True,
)

# Load & Inspect Dataset

In [38]:
# Load the raw workbook: row 4 holds the full instrument names, row 5 the
# Bloomberg tickers, and the data starts at row 6 (first column = date).

# Define paths
data_raw_path = "data/raw/"
data_interim_path = "data/interim/"
data_processed_path = "data/processed/"
raw_workbook_path = os.path.join(data_raw_path, "dataset_raw.xlsx")

# Git does not track empty directories, so the output folder may be missing
# on a fresh clone; create it before writing the pickle below.
os.makedirs(data_processed_path, exist_ok=True)

# Open the workbook once and parse the three regions from the same handle
# instead of re-reading the file from disk for every call.
with pd.ExcelFile(raw_workbook_path) as workbook:
    # Read full names (row 4)
    full_names_df = pd.read_excel(workbook, header=None, skiprows=3, nrows=1)
    # Read Bloomberg tickers (row 5)
    tickers_df = pd.read_excel(workbook, header=None, skiprows=4, nrows=1)
    # Read the actual data without using any row as a header (from row 6)
    df_raw = pd.read_excel(workbook, header=None, skiprows=5)

# Drop the first cell of each header row: it belongs to the date column.
full_names = full_names_df.iloc[0].tolist()[1:]
tickers = tickers_df.iloc[0].tolist()[1:]

# Fail with an explicit message if the header rows and the data disagree.
if len(tickers) != df_raw.shape[1] - 1:
    raise ValueError(
        f"Found {len(tickers)} tickers but {df_raw.shape[1] - 1} data columns "
        f"in {raw_workbook_path}"
    )

# Create and save a dictionary mapping tickers to full names
tickers_name_dict = dict(zip(tickers, full_names))
with open(os.path.join(data_processed_path, "tickers_name_dict.pkl"), "wb") as f:
    pickle.dump(tickers_name_dict, f)

# Set column names and index
df_raw.columns = ['Date'] + tickers
df_raw['Date'] = pd.to_datetime(df_raw['Date'], format='%d/%m/%Y')
df_raw = df_raw.set_index('Date')

In [62]:
# First look at the data: dimensions, then the leading rows.
print(f"Dataset shape: {df_raw.shape}")
first_rows = df_raw.head()
display(first_rows)

Dataset shape: (705, 15)


Unnamed: 0_level_0,MXWO,MXWD,LEGATRUU,HFRXGL,RX1,TY1,GC1,CO1,ES1,VG1,NQ1,LLL1,TP1,DU1,TU2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007-10-23,1633.44,414.14,350.214,1343.63,113.7,110.516,759.0,82.85,1525.5,4393.0,2212.0,1287.0,1570.5,103.385,103.719
2007-10-30,1663.89,423.26,352.541,1356.53,113.79,110.656,787.8,87.44,1536.0,4476.0,2217.75,1323.9,1610.5,103.41,103.812
2007-11-06,1651.59,419.51,354.176,1360.2,113.79,110.875,823.4,93.26,1525.0,4425.0,2233.5,1320.0,1575.5,103.41,104.047
2007-11-13,1601.81,405.98,357.222,1347.16,114.35,111.719,799.0,88.83,1483.25,4323.0,2066.75,1271.0,1440.5,103.595,104.305
2007-11-20,1570.74,398.54,359.445,1335.21,114.72,113.156,791.4,95.49,1446.0,4296.0,2035.5,1225.2,1472.5,103.8,104.945


In [65]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
print("Dataset Statistics:")
summary_stats = df_raw.describe()
display(summary_stats)

Dataset Statistics:


Unnamed: 0,MXWO,MXWD,LEGATRUU,HFRXGL,RX1,TY1,GC1,CO1,ES1,VG1,NQ1,LLL1,TP1,DU1,TU2
count,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0,705.0
mean,1644.126,404.152,452.203,1210.092,147.279,125.967,1317.903,77.018,1960.443,3072.168,4526.981,1064.444,1277.263,110.221,108.628
std,443.39,100.98,47.338,73.965,19.319,6.295,274.478,26.794,770.071,473.813,2900.867,128.955,353.598,2.375,1.533
min,705.35,176.1,342.247,1021.4,110.2,110.516,732.7,19.33,689.5,1873.0,1072.0,465.1,704.0,102.19,103.719
25%,1293.43,328.79,431.737,1158.15,128.9,121.203,1178.0,54.27,1314.5,2753.0,2244.0,1103.6,905.0,109.23,107.871
50%,1646.69,402.57,456.729,1215.2,148.0,126.0,1283.0,72.05,1909.25,3071.0,3884.75,1103.6,1302.5,110.855,109.094
75%,1978.83,478.41,478.187,1254.25,163.24,130.344,1499.7,105.78,2569.75,3433.0,6333.25,1103.6,1587.5,111.99,109.906
max,2944.05,702.99,559.312,1415.11,179.32,140.328,2001.2,140.67,4132.8,4476.0,13975.75,1323.9,1978.0,112.72,110.527


In [64]:
# Count missing values per column, worst offenders first.
print("Missing Values:")
missing_per_column = df_raw.isna().sum()
display(missing_per_column.sort_values(ascending=False))

Missing Values:


Unnamed: 0,0
MXWO,0
MXWD,0
LEGATRUU,0
HFRXGL,0
RX1,0
TY1,0
GC1,0
CO1,0
ES1,0
VG1,0


In [67]:
# Report the covered date range and the number of rows. The count is a number
# of observations, not of calendar days: the spacing table displayed below
# shows how far apart consecutive dates are.
first_date, last_date = df_raw.index.min(), df_raw.index.max()
print(f"From {first_date} to {last_date}, total observations: {df_raw.shape[0]}")

# Distribution of gaps between consecutive dates (most frequent first)
display(df_raw.index.to_series().diff().value_counts().head())

From 2007-10-23 00:00:00 to 2021-04-20 00:00:00, total days: 705


Unnamed: 0_level_0,count
Date,Unnamed: 1_level_1
7 days,704


# Push Changes to GitHub

In [79]:
!curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg > /dev/null
!sudo chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg > /dev/null
!echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null
!sudo apt update > /dev/null
!sudo apt install gh -y > /dev/null


4+1 records in
4+1 records out
2270 bytes (2.3 kB, 2.2 KiB) copied, 0.0901815 s, 25.2 kB/s


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)




In [80]:
# Set the author identity (e-mail and name) that git records on commits.
# --global stores it in the user-level git configuration rather than in this
# repository only.
!git config --global user.email "jacoporaffaeli@gmail.com"
!git config --global user.name "jacopo-raffaeli"

In [85]:
from getpass import getpass
import os

# 1. Get your GitHub Personal Access Token securely
token = getpass("Paste your GitHub Personal Access Token: ")
os.environ['GITHUB_TOKEN'] = token

# 2. Set your GitHub repo details
GITHUB_USERNAME = "jacopo-raffaeli"
REPO_NAME = "portfolio-replica"
BRANCH = "main"

# 3. Construct remote URL with token embedded (hidden from output)
remote_url = f"https://{token}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"

# 4. Set git user info (if not done already)
!git config --global user.email "jacoporaffaeli@gmail.com"
!git config --global user.name "jacopo-raffaeli"

# 5. Change remote origin URL to token-embedded one
!git remote set-url origin {remote_url}


Paste your GitHub Personal Access Token: ··········


In [86]:
# 6. Add and commit changes (customize your commit message)
!git add .
!git commit -m "feat: Add Initial EDA" || echo "No changes to commit."

# 7. Push to GitHub
!git push origin {BRANCH}

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
No changes to commit.
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 699 bytes | 699.00 KiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/jacopo-raffaeli/portfolio-replica.git
   d13d1b1..b1d6748  main -> main


In [87]:
# Show git status, which files are changed and staged
!git status

# Show last commit files changed (to check if notebook was included)
!git show --name-only --oneline -1

# Show current branch
!git branch

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
[33mb1d6748[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m feat: Add Initial EDA
data/processed/tickers_name_dict.pkl
* [32mmain[m
