# Algorithmic Trading Python for Beginners

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1. Importing and Pre-Processing Data

In [None]:
# Retrieve Apple from Yahoo Finance
Apple = yf.download("AAPL", start='2010-01-01', end='2021-01-01')
Apple

In [None]:
# Obtain multiple stocks using a list
ticker = ["SPY", "AAPL", "KO"]
stocks = yf.download(ticker, start='2010-01-01', end='2021-01-01')
stocks

In [None]:
# Print top five
stocks.head()

In [None]:
# Print bottom five
stocks.tail()

In [None]:
# Print general info about stocks
stocks.info()

In [None]:
# Export stocks into .csv and import it back
stocks.to_csv("stocksYT.csv")
stocks = pd.read_csv("stocksYT.csv")
stocks

In [None]:
# Set first two rows as the headers
stocks = pd.read_csv("stocksYT.csv", header=[0,1])
stocks

In [None]:
# Set first column as the index of the data frame
stocks = pd.read_csv("stocksYT.csv", header=[0,1], index_col=[0])
stocks

![](/Users/jooyoungoh/Pictures/Screenshots/column-indexing.png)

In [None]:
# Parse the `Date` column into datetime objects
stocks = pd.read_csv("stocksYT.csv", header=[0,1], index_col=[0], parse_dates=[0])
stocks

In [None]:
# Print column names
stocks.columns

In [None]:
# Convert the multi-index into one tuple
stocks.columns = stocks.columns.to_flat_index()
stocks.columns

In [None]:
stocks

<img src="/Users/jooyoungoh/Pictures/Screenshots/Screenshot 2024-06-01 at 4.49.45 PM.png"/>

In [None]:
stocks.columns = pd.MultiIndex.from_tuples(stocks.columns)
stocks

# 2. Describing the Data

In [None]:
stocks.columns

In [None]:
# Return the count, mean, stdev, etc.
stocks.describe()

In [None]:
# Return specified rows and columns of a data frame
stocks.loc[:,"Close"]

In [None]:
# We create an independent copy of the "Close" columns using copy().
# Plain assignment might seem sufficient, but the selection may be just a view
# of the original data set rather than separate data.
close = stocks["Close"].copy()
close

In [None]:
# Configure Jupyter to display plots inline
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [None]:
# Plot the closing price of every ticker on one shared axis
close.plot(figsize=(15, 8), fontsize=12)
plt.legend(fontsize=12)
# `plt.show` without parentheses only references the function object;
# it has to be called for the figure to be shown explicitly.
plt.show()

In [None]:
# We will now normalize the data for comparison
close.iloc[0,0]

In [None]:
close.AAPL

In [None]:
close.AAPL.div(close.iloc[0,0])

In [None]:
close.AAPL.div(close.iloc[0,0]).mul(100)

In [None]:
close.iloc[0]

In [None]:
close.div(close.iloc[0]).mul(100)

In [None]:
normClose = close.div(close.iloc[0]).mul(100)
normClose

In [None]:
# Plot the normalized closes (first row scaled to 100) for direct comparison
normClose.plot(figsize=(15, 8), fontsize=12)
plt.legend(fontsize=12)
# Call `plt.show()`; the bare name `plt.show` does not render anything.
plt.show()

# 3. Working with Data

In [None]:
# Create a data frame containing the close of AAPL
aapl = close.AAPL.copy().to_frame()
aapl

In [None]:
# To calculate the increase or decrease from the previous day, we first need
# the previous day's value lined up with the current one.
# `shift` does exactly that: it shifts by the desired number of periods,
# with an optional time `freq`.
aapl.shift(periods=1)  # identical to the default call aapl.shift()

In [None]:
# Store the previous day's close next to the current close.
# Shift the "AAPL" column explicitly instead of the whole frame: `aapl.shift()`
# returns every column, so re-running this cell after more columns have been
# added would try to assign a multi-column frame to the single "lag1" column
# and raise an error.
aapl["lag1"] = aapl["AAPL"].shift()
aapl

In [None]:
# Subtract `aapl.AAPL` by `aapl.lag1`
aapl["Diff"] = aapl.AAPL.sub(aapl.lag1)
aapl

In [None]:
aapl["% Change"] = aapl.AAPL.div(aapl.lag1)
aapl

In [None]:
aapl["Diff2"] = aapl.AAPL.diff(periods=1)
aapl

In [None]:
aapl["% Change 2"] = aapl.AAPL.pct_change(periods=1).mul(100)
aapl

In [None]:
aapl["% Change"] = aapl.AAPL.div(aapl.lag1).sub(1).mul(100)
aapl

In [None]:
# Remove the intermediate helper columns in a single in-place call
aapl.drop(columns=["lag1", "Diff", "Diff2", "% Change"], inplace=True)

In [None]:
aapl

In [None]:
aapl.rename(columns = {"% Change 2": "Change"}, inplace = True)
aapl

In [None]:
# Downsample to monthly data, keeping the last observation of each month.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME";
# switch once the minimum supported pandas version allows it.
aapl["AAPL"].resample("M").last()

In [None]:
# Downsample to the last value per business month ("BM"), then compute the
# month-over-month percentage change.
# NOTE(review): pandas >= 2.2 deprecates the "BM" alias in favour of "BME".
aapl["AAPL"].resample("BM").last().pct_change().mul(100)

# 4. The Mean, Variance, and Standard Deviation of Data

In [None]:
aapl

In [None]:
del aapl["Change"]

In [None]:
aapl

In [None]:
ret = aapl.pct_change().dropna()
ret

In [None]:
ret.info()

In [None]:
# Plotting a histogram for the rate of return
ret.plot(kind="hist",figsize=(12, 8), bins=100)
plt.show()

In [None]:
# Calculating the mean return
mean_daily_ret = ret.mean()
mean_daily_ret

In [None]:
var_daily_ret = ret.var()
var_daily_ret

In [None]:
std_daily_ret = np.sqrt(var_daily_ret)
std_daily_ret

In [None]:
ret.std()

In [None]:
mean_annual_ret = mean_daily_ret * 252
mean_annual_ret

In [None]:
var_annual_ret = var_daily_ret * 252
var_annual_ret

In [None]:
std_annual_ret = np.sqrt(var_annual_ret)
std_annual_ret

In [None]:
ret.std() * np.sqrt(252)

In [None]:
# Download stock data
ticker = ["SPY", "AAPL", "KO", "IBM", "DIS", "MSFT"]
stocks = yf.download(ticker, start='2010-01-01', end='2021-01-01')

# Select and normalize close columns
close = stocks.loc[:, "Close"].copy()
normClose = close.div(close.iloc[0]).mul(100)

normClose

In [None]:
# Plot normalized data
normClose.plot(figsize=(15, 8), fontsize=12)
plt.legend(fontsize=12)
plt.show()

In [None]:
ret = close.pct_change().dropna()
ret.head()

In [None]:
ret.describe()

In [None]:
ret.describe().T

In [None]:
summary = ret.describe().T.loc[:, ["mean", "std"]]
summary

In [None]:
# Annualize the daily statistics using 252 trading days per year.
# The values are recomputed from `ret` instead of scaling the existing columns
# in place, so re-running this cell cannot multiply already-annualized numbers
# a second time. `describe()` reports the same mean/std as `ret.mean()` and
# `ret.std()`, so the first run yields the same result as before.
summary["mean"] = ret.mean() * 252
summary["std"] = ret.std() * np.sqrt(252)
summary

In [None]:
# Risk/return scatter plot with one labelled point per ticker
summary.plot.scatter(x="std", y="mean", figsize=[12,8], s=50, fontsize=15)
for i, risk, reward in zip(summary.index, summary["std"], summary["mean"]):
    # Offset each label slightly so it does not sit on top of its marker
    plt.annotate(i, xy=[risk+0.002, reward+0.002], size=15)
plt.title("Annual Risk vs. Return", fontsize = 25)
plt.xlabel("Annual Risk (std)", fontsize = 15)
plt.ylabel("Annual Return (mean)", fontsize = 15)
plt.show()

# 5. Correlation and Covariance

In [None]:
ret

In [None]:
ret.cov()

In [None]:
ret.corr()

In [None]:
import seaborn as sns

In [None]:
# Heatmap of the pairwise correlations between the daily returns
plt.figure(figsize=(12, 8))
# `sns.set` is only an alias of `sns.set_theme`; use the preferred name,
# consistent with the earlier plotting setup cell.
sns.set_theme(font_scale=1.4)
sns.heatmap(ret.corr(), cmap="Reds_r", annot=True, annot_kws={"size":15}, vmax=0.6)
plt.show()

# Challenge 1
### Download 20 random stocks and:
### 1. Calculate risk and reward potential
### 2. Compare the covariance and correlation
### 3. Find the best 5 to invest in for the long term

In [None]:
import random
import yfinance as yf
import pandas as pd

# Sample list of stock tickers
all_tickers = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA", "BRK-B", "JNJ", "V", "WMT",
    "JPM", "PG", "MA", "NVDA", "HD", "DIS", "PYPL", "BAC", "VZ", "NFLX",
    "ADBE", "CMCSA", "INTC", "PFE", "KO", "MRK", "PEP", "T", "ABBV", "CSCO",
    "XOM", "ABT", "CRM", "NKE", "LLY", "ORCL", "MCD", "NEE", "TMO", "QCOM",
    "ACN", "MDT", "DHR", "TXN", "UNH", "LIN", "UNP", "HON"
]

# Ensure the list has more than 20 tickers for randomness
assert len(all_tickers) > 20, "List of all tickers must be greater than 20"

def ticker_is_valid(ticker):
    """Return True if Yahoo Finance returns any price data for `ticker`.

    Downloads the 2010-01-01 to 2024-01-01 history with the progress bar
    disabled and reports whether the resulting frame is non-empty. Any
    ordinary error raised along the way is treated as "not valid" (False).
    """
    try:
        data = yf.download(ticker, start='2010-01-01', end='2024-01-01', progress=False)
        return not data.empty
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt and SystemExit,
        # which makes the validation loop impossible to interrupt.
        return False

# Keep only the tickers for which a download actually returns data
valid_tickers = [ticker for ticker in all_tickers if ticker_is_valid(ticker)]
print(f"Number of valid tickers: {len(valid_tickers)}")

# Randomly select 20 tickers.
# The size check on `all_tickers` does not cover failed downloads, so verify
# the validated list as well and fail with a descriptive message instead of
# the generic error raised by `random.sample`.
sample_size = 20
if len(valid_tickers) < sample_size:
    raise ValueError(
        f"Need at least {sample_size} valid tickers, got {len(valid_tickers)}"
    )
selected_tickers = random.sample(valid_tickers, sample_size)
print("Selected Tickers:", selected_tickers)

# Download the stock data for the selected tickers
stocks = yf.download(selected_tickers, start='2010-01-01', end='2024-01-01')
stocks

In [None]:
close = stocks.loc[:, "Close"].copy()
normClose = close.div(close.iloc[0]).mul(100)
normClose

In [None]:
# Plot normalized data
normClose.plot(figsize=(15, 8), fontsize=12)
plt.legend(fontsize=12)
plt.show()

In [None]:
# Initiate ret
ret = close.pct_change().dropna()
ret.describe()

In [None]:
# Collect mean and standard deviation
summary = ret.describe().T.loc[:,["mean","std"]]
summary

In [None]:
summary["annual_mean"] = summary["mean"] * 252
summary["annual_std"] = summary["std"] * np.sqrt(252)

summary.plot.scatter(x="annual_std", y="annual_mean", figsize=[12,8], s=50, fontsize=15)
for i in summary.index:
    plt.annotate(i, xy=[summary.loc[i,"annual_std"]+0.002, summary.loc[i,"annual_mean"]+0.002], size=15)
plt.xlabel("Annual Risk (std)", fontsize = 15)
plt.ylabel("Annual Return (mean)", fontsize = 15)
plt.title("Annual Risk vs. Return", fontsize = 25)
plt.show()

In [None]:
import seaborn as sns

# Correlation heatmap for the 20 selected tickers (smaller annotations so the
# values fit into the denser grid)
plt.figure(figsize=(12, 8))
# `sns.set` is only an alias of `sns.set_theme`; use the preferred name,
# consistent with the earlier plotting setup cell.
sns.set_theme(font_scale=1.4)
sns.heatmap(ret.corr(), cmap="Reds_r", annot=True, annot_kws={"size":8}, vmax=0.6)
plt.show()

In [None]:
summary['ret_risk_index'] = summary.annual_mean.div(summary.annual_std)
summary = summary.sort_values(by=['ret_risk_index'], ascending=False)
summary

### Determining which stocks to invest in:
Risk-Return Tradeoff: The plot you've created shows the annualized risk (standard deviation) versus the annualized return (mean) for different stocks. In general, investors seek a balance between risk and return. You can use the following approaches to make investment decisions:
* Efficient Frontier: Identify stocks that lie on the efficient frontier, which offers the highest expected return for a given level of risk.
* Sharpe Ratio: Calculate the Sharpe ratio for each stock to measure the return per unit of risk. The ratio is defined as: (Annual Return of the Stock - Risk-Free Rate) / (Annual Risk)

Diversification
* Portfolio Diversification: Diversify your investments across multiple stocks to reduce unsystematic risk. You can create a portfolio of stocks that have low correlations with each other to achieve diversification benefits.


The Sharpe Ratio does take into account the absolute value of the return, but it does so in a way that adjusts for risk. It measures the excess return (return above the risk-free rate) per unit of risk (standard deviation). However, it does not explicitly prioritize the magnitude of the return independently of risk.

The risk-free rate is typically derived from the yield on government securities, such as U.S. Treasury bills (T-bills), notes, or bonds, which are considered virtually free of credit risk. In practice, the 3-month U.S. Treasury bill yield is often used as a proxy for the risk-free rate.

To account for both the magnitude of return and the return-risk ratio, you might consider additional or complementary metrics, such as the Sortino Ratio.

In [None]:
import numpy as np

# Assume a risk-free rate (e.g., 3-month Treasury bill rate)
risk_free_rate = 0.02

# Sharpe Ratio: annual return in excess of the risk-free rate per unit of annual risk
summary['sharpe_ratio'] = (summary['annual_mean'] - risk_free_rate) / summary['annual_std']

# Display the dataframe with Sharpe Ratio.
# A bare `summary` expression is only rendered when it is the last statement
# of a cell, so print it explicitly here.
print(summary)

# Sort stocks by Sharpe Ratio in descending order
summary_sorted = summary.sort_values(by="sharpe_ratio", ascending=False)

# Display the top 5 stocks based on Sharpe Ratio
print("Top Stocks based on Sharpe Ratio:")
for ticker in summary_sorted.head(5).index:
    print(ticker)

# 6. Simple Return and Log Return
Simple return, also known as arithmetic return, measures the percentage change in the price of an asset over a specific period. It is calculated as the difference between the ending price and the starting price, divided by the starting price. Simple return is a straightforward way to assess the performance of an investment over a particular period.

Log return, also known as logarithmic return or continuously compounded return, measures the logarithm of the ratio of the ending price to the starting price. Log returns are additive over time, which simplifies the calculation of returns over multiple periods. This property makes log returns particularly useful in financial modeling and time series analysis.

In [None]:
# Build a small example frame: one price per year for 2016-2018
df = pd.DataFrame({"Price": [100, 50, 95]}, index=[2016, 2017, 2018])
df

In [None]:
# Simple Returns
simpleReturns = df.pct_change().dropna()
simpleReturns

In [None]:
# Find the mean simple return
meanRet = simpleReturns.mean()
meanRet

In [None]:
# The mean simple return is 0.2, so if averaging simple returns were valid,
# growing the 2016 price at that rate for two periods should give 95.
(1 + meanRet) ** 2 * df.loc[2016, "Price"]

# However, the code returns 144, which is not equal to 95.
# From this result, we learn that simple (arithmetic) returns are not
# cumulative: their mean does not compound back to the final price.

In [None]:
# This is where log return comes into perspective
logReturns = np.log(df / df.shift()).dropna()
logReturns

In [None]:
meanLogRet = logReturns.mean()
meanLogRet

In [None]:
100 ** meanLogRet

In [None]:
# Log returns are cumulative (additive): growing the 2016 price at the mean
# log return for two periods recovers the 2018 price of 95.
df.loc[2016, "Price"] * np.exp(2 * meanLogRet)

# 7. Rolling Functions
To roll a function means to apply a calculation across a moving or sliding window of data points within a
dataset. This involves taking a fixed-size subset (window) of consecutive data points, performing a
specified function on this subset, and then moving the window one data point forward and repeating.

Rolling a dataset offers several benefits over applying functions to individual data points, especially when dealing with time series data.

1. Smoothing Data
* Reduced Noise: Rolling calculations, like rolling means, help smooth out short-term fluctuations and random noise in the data. This makes it easier to identify long-term trends and underlying patterns.
* Signal Clarity: By averaging or summing over a window, the true signal in the data becomes clearer, making it easier to analyze and interpret.

2. Trend Detection
* Trend Analysis: Rolling functions are excellent for detecting trends over time. For example, a rolling mean can highlight the overall direction of a time series, helping to identify upward or downward trends.
* Cyclic Patterns: Rolling calculations can reveal cyclic patterns, such as seasonal variations or business cycles, which might not be apparent from individual data points.

3. Anomaly Detection
- Outlier Detection: By comparing rolling statistics (like rolling standard deviation) to individual data points, anomalies or outliers can be identified more effectively. This is useful in quality control, finance, and other fields where anomalies are significant.
- Behavioral Insights: Rolling functions can help detect unusual behavior over time, providing insights into potential issues or areas needing attention.

4. Contextual Analysis
* Contextual Understanding: Rolling functions provide a context for each data point by considering its neighboring values. This contextual analysis is crucial for understanding how data points relate to each other over time.
* Better Decision Making: By looking at a window of data, rather than isolated points, decisions can be made based on a broader understanding of the data's behavior.

5. Data Aggregation
* Aggregated Insights: Rolling calculations aggregate data over a specified window, providing insights that individual data points cannot. For instance, rolling sums can show cumulative sales over a period, offering a better view of overall performance.
* Simplified Data: Aggregating data through rolling functions simplifies complex datasets, making them easier to analyze and visualize.

6. Robustness to Variability
* Handling Volatility: In volatile datasets, such as stock prices, rolling functions help manage variability by focusing on a window of data rather than isolated points. This results in more stable and reliable insights.
* Adaptability: Rolling functions adapt to changes in data patterns, making them useful in dynamic environments where data characteristics may shift over time.

In [None]:
SPY = yf.download("SPY")

In [None]:
spy = SPY.Close.to_frame()
spy

In [None]:
# Plot the SPY closing price
spy.plot(figsize=(12,8), fontsize=15)
plt.legend(loc="upper left", fontsize=15)
# Call `plt.show()`; the bare name `plt.show` does not render anything.
plt.show()

In [None]:
# 10-row rolling mean of the SPY close (the first 9 rows have no full window yet)
spy_roll = spy.rolling(window=10).mean()

In [None]:
spy_roll.head(15)

In [None]:
spy.rolling(window=10).median()

In [None]:
spy.rolling(window=10).max()

In [None]:
# With `min_periods=`, we can set minimum number of values to start calculating
spy.rolling(window=10, min_periods=5).max()

In [None]:
# Rebuild the SPY close frame and add a 50-row simple moving average
SPY = yf.download("SPY")
spy = SPY.Close.to_frame()
# Roll over the "Close" column explicitly so that a Series is assigned to the
# new column; rolling the whole frame only works while it has exactly one
# column. `min_periods=50` leaves rows without a full window as NaN.
spy["SMA50"] = spy["Close"].rolling(window=50, min_periods=50).mean()
spy