In [1]:
!pip install yfinance
!pip install pandas_datareader
!pip install tabulate 

import numpy as np 
import scipy as sp
import pandas as pd
import math
import random
import tabulate 
import pickle

#external data & file reading
import pandas_datareader as pdr
pd.options.mode.chained_assignment = None  # default='warn'


#for plotting
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

Collecting yfinance
  Using cached yfinance-0.1.67-py2.py3-none-any.whl (25 kB)
Collecting multitasking>=0.0.7
  Using cached multitasking-0.0.10-py3-none-any.whl
Installing collected packages: multitasking, yfinance
Successfully installed multitasking-0.0.10 yfinance-0.1.67
Collecting pandas_datareader
  Using cached pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.10.0
Collecting tabulate
  Using cached tabulate-0.8.9-py3-none-any.whl (25 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.8.9


## Constructing a First Portfolio

#### Data Loading

In [28]:
# Load the per-country trade files and stack them into one frame.
# Only a handful of well-developed Asian markets are read for now because the
# full set of 37 countries is too slow on this machine; extend the list later.
countries_with_trade_info = ['CHINA','HONG KONG','JAPAN','KOREA','TAIWAN']

# One "<COUNTRY>.csv" file per market, read from the working directory.
trades_list = [pd.read_csv(country + ".csv") for country in countries_with_trade_info]
all_trades = pd.concat(trades_list)


#### Filtering & Getting initial metrics

The code below shows that we have a total of 3,135 companies which match our initial criteria. From here we'd now like to filter further. 

In [29]:
# Start with the cross-sector view only: keep the rows whose 'Sector' value is
# exactly "All_sectors" (note the underscore and lower-case "s").
is_all_sectors = all_trades['Sector'] == "All_sectors"
data = all_trades[is_all_sectors]

#And now we collect the unique companies for which this filtered data corresponds to. 
#all_sector_trades['fsym_id'].unique()


#### Filter by Year

This ensures that the stocks we select were actually traded in the chosen year (or years).

In [30]:
year = "2021"  # year to keep, as the 4-character prefix of the 'Date' text

# Data-quality notes recorded while exploring the unfiltered frame
# (run these interactively to re-check them):
#   data['Exit Date'].isna().sum()  -> about 1.6 million trades have no exit date
#   data['Date'].isna().sum()       -> 0, every trade has a start date

# The year is the first four characters of the 'Date' string. The vectorised
# .str accessor replaces the per-row Python loop and yields NaN (instead of
# raising) for any non-string entry. .assign returns a new frame, so nothing is
# written into the filtered slice of all_trades.
data = data.assign(Year=data['Date'].str[:4])

# Keep only the trades opened in the selected year.
data = data[data.Year == year]

#we traded 3022 unique stocks in the previous year. 
#len(data['fsym_id'].unique())



#### Filter by Factor

This will filter based on factor level. Pass in a number and the factor and it will return an ordered dataframe of potential stocks by average return in the previous year.

In [31]:
# `data` (the year-filtered trades built above) is the frame being investigated.
# factor: the factor to rank on; n: how many stocks to return.
factor, n = "Volatility", 30

def factor_filter(data,factor,n): 
    """Return the top-``n`` companies for one factor, ranked by mean return.

    Parameters
    ----------
    data : pandas.DataFrame
        Trades with at least the columns 'Factor', 'fsym_id' and 'returns'.
    factor : str
        Value of the 'Factor' column to keep.
    n : int
        Maximum number of companies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'fsym_id', holding the per-company mean of every numeric
        column (minus 'Unnamed: 0' when present), sorted by 'returns' in
        descending order and truncated to the first ``n`` rows.
    """
    company_data = data[data.Factor == factor]

    # numeric_only=True averages just the numeric columns. Without it,
    # pandas >= 2.0 raises TypeError on the text columns, while older
    # versions dropped them silently, which is the behaviour kept here.
    company_data = company_data.groupby('fsym_id').mean(numeric_only=True)
    company_data = company_data.sort_values('returns', ascending = False)

    # 'Unnamed: 0' is presumably the saved row index from the CSV files; its
    # mean is meaningless. errors='ignore' keeps the function usable on frames
    # that never had that column.
    company_data = company_data.drop(columns=['Unnamed: 0'], errors='ignore')
    return company_data.head(n)
    
# Top-n companies for the chosen factor, ranked by mean return.
d = factor_filter(data, factor=factor, n=n)


#### Constructing the Portfolio - Incorporating Risk Behaviours

#### Constructing the Portfolio - selecting number of stocks for a given profile

This bit is slightly more complex. It takes a set $n$ and a risk profile (for example, as given above), determines the number of stocks to take for each factor, and then makes the appropriate calls to the previous function to construct the portfolio. To determine the weightings, I start with a first guess using the system described below.


$$\begin{cases}
\text{1 weighting point: Low} \\ 
\text{3 weighting points: Medium} \\
\text{5 weighting points: High} \end{cases} $$

This can easily be changed; it is just my initial guess. For example, for someone who wants a safe portfolio, the "volatility" factor will be assigned Low, whereas the dividend factor would be assigned High. I make initial guesses as to appropriate setups for these weightings by risk tolerance in the next section.

In [32]:
# Factor names found in the filtered trades, in order of first appearance.
names = data['Factor'].unique()

# Weighting points per factor for each risk profile (1 = Low, 3 = Medium,
# 5 = High). The numbers are positional: the k-th number applies to names[k],
# so each list must hold exactly len(names) numbers followed by the label.

#type 1 represents someone who is not risky
# TODO: these weights still need to be reviewed, interpreted and justified.
type_1 = [3,1,5,5,3,3,5,1,"Type 1"]

#type 2 is someone who is not risky / average
type_2 = [3,3,5,5,3,3,5,3,"Type 2"]

#type 3 is average
type_3 = [3,3,3,3,3,3,3,3,"Type 3"]

#type 4 - risky ish
type_4 = [1,5,3,3,5,3,1,5,"Type 4"]

#type 5 - someone risky
type_5 = [1,5,3,1,5,1,1,5,"Type 5"]

col_names = list(names)
col_names.append("Type")

types = [type_1,type_2,type_3,type_4,type_5]

# Build the table in a single constructor call. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
# A length mismatch between a profile and col_names still raises ValueError.
df = pd.DataFrame(types, columns = col_names).set_index('Type')


In [33]:
beh_type = 1  # 1-based row of the risk-profile table `df` to use
n = 200       # total number of stocks the portfolio should hold

# Weighting points of the chosen profile, one per factor, in the order of `names`.
weightings = df.iloc[beh_type - 1]
points = np.asarray(weightings.iloc[:len(names)], dtype=float)
total_weighting_points = points.sum()
if total_weighting_points <= 0:
    raise ValueError("risk profile must have a positive total of weighting points")

# Largest-remainder allocation: give every factor the whole part of its exact
# proportional share of n, then hand the remaining slots, one each, to the
# factors with the largest fractional parts. The counts are deterministic,
# never negative, and always sum to exactly n.
# The previous version rounded, truncated, and then adjusted random factors by
# `sum(weights) % n`. In the shortfall case that over-filled the portfolio:
# a sum of 196 with n = 200 added 196 stocks instead of 4.
exact_shares = n * points / total_weighting_points
weights = np.floor(exact_shares).astype(int)
leftover = max(n - int(weights.sum()), 0)
by_remainder = np.argsort(weights - exact_shares, kind="stable")  # largest fraction first
weights[by_remainder[:leftover]] += 1

# Pick the top stocks for every factor according to its allocated count.
# Each factor is filtered independently, so the same fsym_id can appear under
# more than one factor in the combined frame.
df_list = [
    factor_filter(data, factor_name, int(count))
    for factor_name, count in zip(names, weights)
]

frame = pd.concat(df_list)

frame

Unnamed: 0_level_0,returns
fsym_id,Unnamed: 1_level_1
P8ZTFP-R,0.176711
DY978H-R,0.148645
MVZYGZ-R,0.145620
R46SYN-R,0.138662
SQCPW9-R,0.138116
...,...
VBMG3G-R,0.213008
HZ28DF-R,0.175422
JS497J-R,0.174260
SHVF2G-R,0.162166


In [34]:
# Mean return of the selected portfolio over the previous year.
portfolio_mean_return = np.mean(frame['returns'])

# Baseline per factor: mean return of the first 20 fully populated trades.
# NOTE(review): dropna() discards rows with a missing value in ANY column, so
# trades without an exit date are excluded here. Confirm that is intended, or
# restrict it with dropna(subset=['returns']).
df_list2 = [
    data[data['Factor'] == factor_name].dropna().head(20)['returns'].mean()
    for factor_name in names
]

# One-row table with a column per factor, built directly because
# DataFrame.append was removed in pandas 2.0.
# NOTE(review): this rebinds `df`, which held the risk-profile table, so the
# portfolio cell above cannot be re-run after this one without rebuilding it.
df = pd.DataFrame([df_list2], columns = names)




In [43]:
# Lookup table read from "mapping.csv"; keep just the company name and its
# fsym_id, on a fresh 0..N-1 index.
mappings = pd.read_csv("mapping.csv")
company_names = mappings[['company_name', 'fsym_id']].reset_index(drop=True)

# Number of distinct fsym_id values left in the filtered trade data.
len(pd.unique(data['fsym_id']))




55375