In [357]:
# Initial imports
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

%matplotlib inline

#Data Cleaning

In this section, you will need to read the CSV files into DataFrames and perform any necessary data cleaning steps. After cleaning, combine all DataFrames into a single DataFrame.

Files:

whale_returns.csv: Contains returns of some famous "whale" investors' portfolios.

algo_returns.csv: Contains returns from the in-house trading algorithms from Harold's company.

sp500_history.csv: Contains historical closing prices of the S&P 500 Index.

#Whale Returns

Read the Whale Portfolio daily returns and clean the data

In [358]:
# Reading whale returns
csv_path = Path('whale_returns.csv')
whale_returns = pd.read_csv(csv_path)
whale_returns.head()

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC
0,2015-03-02,,,,
1,2015-03-03,-0.001266,-0.004981,-0.000496,-0.006569
2,2015-03-04,0.00223,0.003241,-0.002534,0.004213
3,2015-03-05,0.004016,0.004076,0.002355,0.006726
4,2015-03-06,-0.007905,-0.003574,-0.008481,-0.013098


In [359]:
# Count nulls
whale_returns.isnull().mean() * 100
whale_returns.isnull().sum()

Date                           0
SOROS FUND MANAGEMENT LLC      1
PAULSON & CO.INC.              1
TIGER GLOBAL MANAGEMENT LLC    1
BERKSHIRE HATHAWAY INC         1
dtype: int64

In [360]:
# Drop nulls
whale_returns = whale_returns.dropna()

In [361]:
# Count nulls
whale_returns.isnull().mean() * 100
whale_returns.isnull().sum()

Date                           0
SOROS FUND MANAGEMENT LLC      0
PAULSON & CO.INC.              0
TIGER GLOBAL MANAGEMENT LLC    0
BERKSHIRE HATHAWAY INC         0
dtype: int64

In [362]:
whale_returns.head()

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC
1,2015-03-03,-0.001266,-0.004981,-0.000496,-0.006569
2,2015-03-04,0.00223,0.003241,-0.002534,0.004213
3,2015-03-05,0.004016,0.004076,0.002355,0.006726
4,2015-03-06,-0.007905,-0.003574,-0.008481,-0.013098
5,2015-03-09,0.000582,0.004225,0.005843,-0.001652


Algorithmic Daily Returns
Read the algorithmic daily returns and clean the data

In [363]:
# Reading algorithmic returns
csv_path = Path('algo_returns.csv')
algo_returns = pd.read_csv(csv_path)
algo_returns.head()

Unnamed: 0,Date,Algo 1,Algo 2
0,2014-05-28,0.001745,
1,2014-05-29,0.003978,
2,2014-05-30,0.004464,
3,2014-06-02,0.005692,
4,2014-06-03,0.005292,


In [364]:
# Count nulls
algo_returns.isnull().mean() * 100
algo_returns.isnull().sum()

Date      0
Algo 1    0
Algo 2    6
dtype: int64

In [365]:
# Drop nulls
algo_returns = algo_returns.dropna()

In [366]:
# Count nulls
algo_returns.isnull().mean() * 100
algo_returns.isnull().sum()

Date      0
Algo 1    0
Algo 2    0
dtype: int64

In [367]:
algo_returns.head()

Unnamed: 0,Date,Algo 1,Algo 2
6,2014-06-05,0.004062,0.013285
7,2014-06-06,0.001857,0.008284
8,2014-06-09,-0.005012,0.005668
9,2014-06-10,0.004406,-0.000735
10,2014-06-11,0.00476,-0.003761


S&P 500 Returns
Read the S&P 500 historic closing prices and create a new daily returns DataFrame from the data.

In [368]:
# Reading S&P 500 Closing Prices
csv_path = Path('sp500_history.csv')
sp500_history = pd.read_csv(csv_path)
sp500_history.head()

Unnamed: 0,Date,Close
0,23-Apr-19,$2933.68
1,22-Apr-19,$2907.97
2,18-Apr-19,$2905.03
3,17-Apr-19,$2900.45
4,16-Apr-19,$2907.06


In [369]:
# Check Data Types
sp500_history.dtypes

Date     object
Close    object
dtype: object

In [370]:
sp500_history['Close'] = sp500_history['Close'].str.replace(',', '')
sp500_history['Close'] = sp500_history['Close'].str.replace('$', '')
sp500_history['Close'] = sp500_history['Close'].astype("float")

In [371]:
sp500_history['Date'] = sp500_history['Date'].astype("datetime64")
sp500_history.head()

Unnamed: 0,Date,Close
0,2019-04-23,2933.68
1,2019-04-22,2907.97
2,2019-04-18,2905.03
3,2019-04-17,2900.45
4,2019-04-16,2907.06


In [372]:
# Calculate Daily Returns
prices = sp500_history["Close"]
daily_return = prices.pct_change(1)
sp500_history ['daily_return']= daily_return
sp500_history.head()

Unnamed: 0,Date,Close,daily_return
0,2019-04-23,2933.68,
1,2019-04-22,2907.97,-0.008764
2,2019-04-18,2905.03,-0.001011
3,2019-04-17,2900.45,-0.001577
4,2019-04-16,2907.06,0.002279


In [373]:
# Count nulls
sp500_history.isnull().mean() * 100
sp500_history.isnull().sum()

Date            0
Close           0
daily_return    1
dtype: int64

In [374]:
# Drop nulls
sp500_history = sp500_history.dropna()

In [375]:
# Count nulls
sp500_history.isnull().mean() * 100
sp500_history.isnull().sum()

Date            0
Close           0
daily_return    0
dtype: int64

In [376]:
# Rename `Close` Column to be specific to this portfolio.

In [377]:
sp500_history

Unnamed: 0,Date,Close,daily_return
1,2019-04-22,2907.97,-0.008764
2,2019-04-18,2905.03,-0.001011
3,2019-04-17,2900.45,-0.001577
4,2019-04-16,2907.06,0.002279
5,2019-04-15,2905.58,-0.000509
...,...,...,...
1644,2012-10-05,1460.93,0.003469
1645,2012-10-04,1461.40,0.000322
1646,2012-10-03,1450.99,-0.007123
1647,2012-10-02,1445.75,-0.003611


Combine Whale, Algorithmic, and S&P 500 Returns

In [None]:
# Join Whale Returns, Algorithmic Returns, and the S&P 500 Returns into a single DataFrame with columns for each portfolio's returns.