In [436]:
# Initial imports
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path

%matplotlib inline

#Data Cleaning

In this section, you will need to read the CSV files into DataFrames and perform any necessary data cleaning steps. After cleaning, combine all DataFrames into a single DataFrame.

Files:

whale_returns.csv: Contains returns of some famous "whale" investors' portfolios.

algo_returns.csv: Contains returns from the in-house trading algorithms from Harold's company.

sp500_history.csv: Contains historical closing prices of the S&P 500 Index.

#Whale Returns

Read the Whale Portfolio daily returns and clean the data

In [437]:
# Reading whale returns
csv_path = Path('whale_returns.csv')
whale_returns = pd.read_csv(csv_path)
whale_returns.head()

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC
0,2015-03-02,,,,
1,2015-03-03,-0.001266,-0.004981,-0.000496,-0.006569
2,2015-03-04,0.00223,0.003241,-0.002534,0.004213
3,2015-03-05,0.004016,0.004076,0.002355,0.006726
4,2015-03-06,-0.007905,-0.003574,-0.008481,-0.013098


In [438]:
# Count nulls
whale_returns.isnull().mean() * 100
whale_returns.isnull().sum()

Date                           0
SOROS FUND MANAGEMENT LLC      1
PAULSON & CO.INC.              1
TIGER GLOBAL MANAGEMENT LLC    1
BERKSHIRE HATHAWAY INC         1
dtype: int64

In [439]:
# Drop nulls
whale_returns = whale_returns.dropna()

In [440]:
# Count nulls
whale_returns.isnull().mean() * 100
whale_returns.isnull().sum()

Date                           0
SOROS FUND MANAGEMENT LLC      0
PAULSON & CO.INC.              0
TIGER GLOBAL MANAGEMENT LLC    0
BERKSHIRE HATHAWAY INC         0
dtype: int64

In [441]:
whale_returns = whale_returns.sort_values(by=['Date'], ascending=True)
whale_returns

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC
1,2015-03-03,-0.001266,-0.004981,-0.000496,-0.006569
2,2015-03-04,0.002230,0.003241,-0.002534,0.004213
3,2015-03-05,0.004016,0.004076,0.002355,0.006726
4,2015-03-06,-0.007905,-0.003574,-0.008481,-0.013098
5,2015-03-09,0.000582,0.004225,0.005843,-0.001652
...,...,...,...,...,...
1055,2019-04-25,-0.000285,-0.001291,-0.005153,0.004848
1056,2019-04-26,0.008149,0.009162,0.012355,0.010434
1057,2019-04-29,0.001254,0.002719,0.006251,0.005223
1058,2019-04-30,-0.001295,-0.002211,-0.000259,-0.003702


Algorithmic Daily Returns
Read the algorithmic daily returns and clean the data

In [442]:
# Reading algorithmic returns
csv_path = Path('algo_returns.csv')
algo_returns = pd.read_csv(csv_path)
algo_returns.head()

Unnamed: 0,Date,Algo 1,Algo 2
0,2014-05-28,0.001745,
1,2014-05-29,0.003978,
2,2014-05-30,0.004464,
3,2014-06-02,0.005692,
4,2014-06-03,0.005292,


In [443]:
# Count nulls
algo_returns.isnull().mean() * 100
algo_returns.isnull().sum()

Date      0
Algo 1    0
Algo 2    6
dtype: int64

In [444]:
# Drop nulls
algo_returns = algo_returns.dropna()

In [445]:
# Count nulls
algo_returns.isnull().mean() * 100
algo_returns.isnull().sum()

Date      0
Algo 1    0
Algo 2    0
dtype: int64

In [446]:
algo_returns = algo_returns.sort_values(by=['Date'], ascending=True)
algo_returns

Unnamed: 0,Date,Algo 1,Algo 2
6,2014-06-05,0.004062,0.013285
7,2014-06-06,0.001857,0.008284
8,2014-06-09,-0.005012,0.005668
9,2014-06-10,0.004406,-0.000735
10,2014-06-11,0.004760,-0.003761
...,...,...,...
1236,2019-04-25,0.000682,-0.007247
1237,2019-04-26,0.002981,0.005256
1238,2019-04-29,0.005208,0.002829
1239,2019-04-30,-0.002944,-0.001570


S&P 500 Returns
Read the S&P 500 historic closing prices and create a new daily returns DataFrame from the data.

In [447]:
# Reading S&P 500 Closing Prices
csv_path = Path('sp500_history.csv')
sp500_history = pd.read_csv(csv_path)
sp500_history.head()

Unnamed: 0,Date,Close
0,23-Apr-19,$2933.68
1,22-Apr-19,$2907.97
2,18-Apr-19,$2905.03
3,17-Apr-19,$2900.45
4,16-Apr-19,$2907.06


In [448]:
# Check Data Types
sp500_history.dtypes

Date     object
Close    object
dtype: object

In [449]:
sp500_history['Close'] = sp500_history['Close'].str.replace(',', '')
sp500_history['Close'] = sp500_history['Close'].str.replace('$', '')
sp500_history['Close'] = sp500_history['Close'].astype("float")

In [450]:
sp500_history['Date'] = sp500_history['Date'].astype("datetime64")
sp500_history.head()

Unnamed: 0,Date,Close
0,2019-04-23,2933.68
1,2019-04-22,2907.97
2,2019-04-18,2905.03
3,2019-04-17,2900.45
4,2019-04-16,2907.06


In [451]:
# Calculate Daily Returns
prices = sp500_history["Close"]
daily_return = prices.pct_change(1)
sp500_history ['daily_return']= daily_return
sp500_history.head()

Unnamed: 0,Date,Close,daily_return
0,2019-04-23,2933.68,
1,2019-04-22,2907.97,-0.008764
2,2019-04-18,2905.03,-0.001011
3,2019-04-17,2900.45,-0.001577
4,2019-04-16,2907.06,0.002279


In [452]:
# Count nulls
sp500_history.isnull().mean() * 100
sp500_history.isnull().sum()

Date            0
Close           0
daily_return    1
dtype: int64

In [453]:
# Drop nulls
sp500_history = sp500_history.dropna()

In [454]:
# Count nulls
sp500_history.isnull().mean() * 100
sp500_history.isnull().sum()

Date            0
Close           0
daily_return    0
dtype: int64

In [455]:
# Rename `Close` Column to be specific to this portfolio.

In [456]:
sp500_history.columns = ['Date', 'S&P_500_Close', 'S&P_500_Daily_Return']
sp500_history.head()

Unnamed: 0,Date,S&P_500_Close,S&P_500_Daily_Return
1,2019-04-22,2907.97,-0.008764
2,2019-04-18,2905.03,-0.001011
3,2019-04-17,2900.45,-0.001577
4,2019-04-16,2907.06,0.002279
5,2019-04-15,2905.58,-0.000509


In [457]:
sp500_history = sp500_history.sort_values(by=['Date'], ascending=True)
sp500_history

Unnamed: 0,Date,S&P_500_Close,S&P_500_Daily_Return
1648,2012-10-01,1444.49,-0.000872
1647,2012-10-02,1445.75,-0.003611
1646,2012-10-03,1450.99,-0.007123
1645,2012-10-04,1461.40,0.000322
1644,2012-10-05,1460.93,0.003469
...,...,...,...
5,2019-04-15,2905.58,-0.000509
4,2019-04-16,2907.06,0.002279
3,2019-04-17,2900.45,-0.001577
2,2019-04-18,2905.03,-0.001011


Combine Whale, Algorithmic, and S&P 500 Returns

In [458]:
# Join Whale Returns, Algorithmic Returns, and the S&P 500 Returns into a single DataFrame with columns for each portfolio's returns.
column_appended_data = pd.concat([whale_returns, algo_returns, sp500_history], sort=False, axis="columns", join="inner")
column_appended_data

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC,Date.1,Algo 1,Algo 2,Date.2,S&P_500_Close,S&P_500_Daily_Return
6,2015-03-10,-0.010263,-0.005341,-0.012079,-0.009739,2014-06-05,0.004062,0.013285,2019-04-12,2907.41,0.000630
7,2015-03-11,0.004012,0.005163,0.003312,-0.001147,2014-06-06,0.001857,0.008284,2019-04-11,2888.32,-0.006566
8,2015-03-12,0.008265,0.010272,0.013117,0.010801,2014-06-09,-0.005012,0.005668,2019-04-10,2888.21,-0.000038
9,2015-03-13,-0.002497,-0.001428,-0.003697,-0.008142,2014-06-10,0.004406,-0.000735,2019-04-09,2878.20,-0.003466
10,2015-03-16,0.007901,0.003583,0.007953,0.008055,2014-06-11,0.004760,-0.003761,2019-04-08,2895.77,0.006105
...,...,...,...,...,...,...,...,...,...,...,...
1055,2019-04-25,-0.000285,-0.001291,-0.005153,0.004848,2018-08-03,-0.003656,0.002817,2015-02-11,2068.53,-0.009552
1056,2019-04-26,0.008149,0.009162,0.012355,0.010434,2018-08-06,0.000529,0.000285,2015-02-10,2068.59,0.000029
1057,2019-04-29,0.001254,0.002719,0.006251,0.005223,2018-08-07,0.000207,-0.001330,2015-02-09,2046.74,-0.010563
1058,2019-04-30,-0.001295,-0.002211,-0.000259,-0.003702,2018-08-08,0.008429,-0.001812,2015-02-06,2055.47,0.004265


In [461]:
column_appended_data = pd.concat([whale_returns, algo_returns, sp500_history], axis="rows", join="outer")
column_appended_data

Unnamed: 0,Date,SOROS FUND MANAGEMENT LLC,PAULSON & CO.INC.,TIGER GLOBAL MANAGEMENT LLC,BERKSHIRE HATHAWAY INC,Algo 1,Algo 2,S&P_500_Close,S&P_500_Daily_Return
1,2015-03-03,-0.001266,-0.004981,-0.000496,-0.006569,,,,
2,2015-03-04,0.002230,0.003241,-0.002534,0.004213,,,,
3,2015-03-05,0.004016,0.004076,0.002355,0.006726,,,,
4,2015-03-06,-0.007905,-0.003574,-0.008481,-0.013098,,,,
5,2015-03-09,0.000582,0.004225,0.005843,-0.001652,,,,
...,...,...,...,...,...,...,...,...,...
5,2019-04-15 00:00:00,,,,,,,2905.58,-0.000509
4,2019-04-16 00:00:00,,,,,,,2907.06,0.002279
3,2019-04-17 00:00:00,,,,,,,2900.45,-0.001577
2,2019-04-18 00:00:00,,,,,,,2905.03,-0.001011
