## Imports

In [1]:
!pip install pytickersymbols



In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import requests
import bs4 as bs
from pytickersymbols import PyTickerSymbols

## 

## Dataset Init

In [3]:
def get_tickers():
    stock_data = PyTickerSymbols()
    dow_yahoo = stock_data.get_dow_jones_nyc_yahoo_tickers()
    dow_yahoo.remove('CAT')
    sp100_yahoo = stock_data.get_sp_100_nyc_yahoo_tickers()
    sp100_yahoo.remove('MNSLV')
    sp100_yahoo.remove('BOAPL')
    sp100_yahoo.remove('NEEXU')
    sp100_yahoo.remove('BMYMP')
    sp100_yahoo.remove('XON')
    return set(dow_yahoo + sp100_yahoo)

In [4]:
tickers = get_tickers()
df = pd.DataFrame(tickers, columns=['Ticker'])
df

Unnamed: 0,Ticker
0,MRK
1,AAPL
2,BK
3,ORCL
4,QCOM
...,...
99,WMT
100,CVS
101,OCLCF
102,RTX


In [5]:
def get_city(ticker):
    try:
        return yf.Ticker(ticker).info['city']
    except:
        return 'None'

In [6]:
# add city
df['City'] = df['Ticker'].apply(lambda x: get_city(x))
df

Unnamed: 0,Ticker,City
0,MRK,Rahway
1,AAPL,Cupertino
2,BK,New York
3,ORCL,Austin
4,QCOM,San Diego
...,...,...
99,WMT,Bentonville
100,CVS,Woonsocket
101,OCLCF,Tokyo
102,RTX,Arlington


In [7]:
def get_country(ticker):
    try:
        return yf.Ticker(ticker).info['country']
    except:
        return 'None'

In [8]:
# add country
df['Country'] = df['Ticker'].apply(lambda x: get_country(x))
df

Unnamed: 0,Ticker,City,Country
0,MRK,Rahway,United States
1,AAPL,Cupertino,United States
2,BK,New York,United States
3,ORCL,Austin,United States
4,QCOM,San Diego,United States
...,...,...,...
99,WMT,Bentonville,United States
100,CVS,Woonsocket,United States
101,OCLCF,Tokyo,Japan
102,RTX,Arlington,United States


In [9]:
def get_industry(ticker):
    try:
        return yf.Ticker(ticker).info['industry']
    except:
        return 'None'

In [10]:
# add industry
df['Industry'] = df['Ticker'].apply(lambda x: get_industry(x))
df

Unnamed: 0,Ticker,City,Country,Industry
0,MRK,Rahway,United States,Drug Manufacturers - General
1,AAPL,Cupertino,United States,Consumer Electronics
2,BK,New York,United States,Asset Management
3,ORCL,Austin,United States,Software - Infrastructure
4,QCOM,San Diego,United States,Semiconductors
...,...,...,...,...
99,WMT,Bentonville,United States,Discount Stores
100,CVS,Woonsocket,United States,Healthcare Plans
101,OCLCF,Tokyo,Japan,Software - Application
102,RTX,Arlington,United States,Aerospace & Defense


In [11]:
def get_sector(ticker):
    try:
        return yf.Ticker(ticker).info['sector']
    except:
        return 'None'

In [12]:
# add sector
df['Sector'] = df['Ticker'].apply(lambda x: get_sector(x))
df

Unnamed: 0,Ticker,City,Country,Industry,Sector
0,MRK,Rahway,United States,Drug Manufacturers - General,Healthcare
1,AAPL,Cupertino,United States,Consumer Electronics,Technology
2,BK,New York,United States,Asset Management,Financial Services
3,ORCL,Austin,United States,Software - Infrastructure,Technology
4,QCOM,San Diego,United States,Semiconductors,Technology
...,...,...,...,...,...
99,WMT,Bentonville,United States,Discount Stores,Consumer Defensive
100,CVS,Woonsocket,United States,Healthcare Plans,Healthcare
101,OCLCF,Tokyo,Japan,Software - Application,Technology
102,RTX,Arlington,United States,Aerospace & Defense,Industrials


In [13]:
# Here I am not using the idea we wrote in preliminaries
# For the beginning I just want to test my idea which is simplier and I just want get some results if it is possible
def rsi(close, periods = 14):
    
    close_delta = close.diff()

    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()

    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

def rsi_for_ticker(ticker):
    history_data = yf.Ticker(ticker).history(start='2019-01-01', end='2020-04-01', interval='1mo')
    rsi_vec = rsi(history_data['Close'])
    return rsi_vec[-1]

df['RSI'] = df['Ticker'].apply(lambda x: rsi_for_ticker(x))
df

Unnamed: 0,Ticker,City,Country,Industry,Sector,RSI
0,MRK,Rahway,United States,Drug Manufacturers - General,Healthcare,46.591292
1,AAPL,Cupertino,United States,Consumer Electronics,Technology,62.504121
2,BK,New York,United States,Asset Management,Financial Services,27.445960
3,ORCL,Austin,United States,Software - Infrastructure,Technology,42.124895
4,QCOM,San Diego,United States,Semiconductors,Technology,51.680406
...,...,...,...,...,...,...
99,WMT,Bentonville,United States,Discount Stores,Consumer Defensive,63.841546
100,CVS,Woonsocket,United States,Healthcare Plans,Healthcare,45.852480
101,OCLCF,Tokyo,Japan,Software - Application,Technology,0.000000
102,RTX,Arlington,United States,Aerospace & Defense,Industrials,31.725722


In [14]:
def get_relative_change_vector(ticker):
    prices = yf.Ticker(ticker).history(start='2020-04-01', end = '2020-04-30', interval='1d')['Close']
    vec = list()
    for i in range(len(prices)-1):
        vec.append((prices[i+1]-prices[i])/ prices[i])
    return vec

In [15]:
df['Ticker'].apply(lambda x: print(f'{x}: {get_relative_change_vector(x)}'))

MRK: [0.04159886902982164, -0.00806531729975877, 0.05324563450340075, -0.021790471199518567, 0.038951085603320765, 0.01065913984719255, -0.02363946705195953, 0.028557604271280995, -0.009295228048513174, 0.011331823047558766, 0.00554200186361919, -0.004313277229194543, -0.05463287469608067, 0.018075235601558513, 0.01125260091180441, 0.006800537231508482, 0.031315047502466574, -0.033341093770722786, -0.005050663418750664]
AAPL: [0.016686721815789767, -0.014371417411081904, 0.08723738801529769, -0.011582355988688156, 0.025594824205357206, 0.007216052740912986, 0.01962764549425008, 0.050503257663080746, -0.009127559246884761, 0.007945725822972512, -0.013568810688185806, -0.020756509791587347, -0.030910483037365714, 0.028803844544086625, -0.0038754673365982133, 0.02886946673944299, 0.0007069749754020135, -0.01620948001969314, 0.032845003387243554]
BK: [0.028746148756400736, 0.0032702258096487472, 0.04414807171222362, -0.012485901903816569, 0.024425396812198203, 0.04179492273958613, -0.03069

KeyboardInterrupt: 

In [None]:
def SMA(data, ndays): 
    SMA = data['Close'].rolling(ndays).mean()
    return SMA

In [None]:
def get_macd(ticker):
    df1 = yf.Ticker(ticker).history(start='2018-06-01', end='2020-04-01', interval='1mo')
    macd = SMA(df1, 7)[-1] - SMA(df1, 15)[-1]
    return macd

df['MACD'] = df['Ticker'].apply(lambda x: get_macd(x))
df

Unnamed: 0,Ticker,City,Country,Industry,Sector,RSI,MACD
0,AVGO,San Jose,United States,Semiconductors,Technology,39.318896,4.656958
1,CVS,Woonsocket,United States,Healthcare Plans,Healthcare,45.852464,5.168253
2,CL,New York,United States,Household & Personal Products,Consumer Defensive,48.796158,0.318577
3,GOOG,Mountain View,United States,Internet Content & Information,Communication Services,47.008815,3.879943
4,NEE,Juno Beach,United States,Utilities - Regulated Electric,Utilities,69.144812,6.025250
...,...,...,...,...,...,...,...
99,AMT,Boston,United States,REIT - Specialty,Real Estate,63.658766,12.329809
100,V,San Francisco,United States,Credit Services,Financial Services,50.345642,9.965439
101,ADBE,San Jose,United States,Software - Infrastructure,Technology,62.335515,20.564762
102,MS,New York,United States,Capital Markets,Financial Services,36.306217,1.621525


In [None]:
def get_price_change(ticker):
    history_data = yf.Ticker(ticker).history(start='2019-01-01', end='2020-05-01', interval='1mo')
    close = history_data['Close']
    return close[-1] - close[-2]

df['Price Change'] = df['Ticker'].apply(lambda x: get_price_change(x))
df

Unnamed: 0,Ticker,City,Country,Industry,Sector,RSI,MACD,Price Change
0,AVGO,San Jose,United States,Semiconductors,Technology,39.318896,4.656958,34.590179
1,CVS,Woonsocket,United States,Healthcare Plans,Healthcare,45.852464,5.168253,2.018501
2,CL,New York,United States,Household & Personal Products,Consumer Defensive,48.796158,0.318577,3.598969
3,GOOG,Mountain View,United States,Internet Content & Information,Communication Services,47.008815,3.879943,9.292500
4,NEE,Juno Beach,United States,Utilities - Regulated Electric,Utilities,69.144812,6.025250,-2.202202
...,...,...,...,...,...,...,...,...
99,AMT,Boston,United States,REIT - Specialty,Real Estate,63.658766,12.329809,18.509796
100,V,San Francisco,United States,Credit Services,Financial Services,50.345642,9.965439,17.180374
101,ADBE,San Jose,United States,Software - Infrastructure,Technology,62.335515,20.564762,35.400024
102,MS,New York,United States,Capital Markets,Financial Services,36.306217,1.621525,4.878447


In [None]:
def get_relative_change_vector(ticker):
    prices = yf.Ticker(ticker).history(start='2020-04-01', end = '2020-04-30', interval='1d')['Close']
    vec = list()
    for i in range(len(prices)-1):
        vec.append((prices.iloc[i+1]-prices.iloc[i])/ prices.iloc[i])
    vec = tuple(vec)
    return vec

In [None]:
price_vectors = {x: get_relative_change_vector(x) for x in df["Ticker"]}
# df['Ticker'].apply(lambda x: print(f'{x}: {get_relative_change_vector(x)}'))
display(price_vectors)

{'AVGO': (0.06005449803914246,
  -0.011811290610755108,
  0.07760594462247428,
  0.0032880712178943164,
  0.030915494444617598,
  -0.026043666960479248,
  0.022099885667457523,
  0.02966310002525628,
  -0.038075025423057955,
  0.0033016253592575776,
  0.02938562568156317,
  -0.028584112489957462,
  -0.04115702195789145,
  0.04974774918293877,
  -0.0040005900486685675,
  0.02270879316916858,
  0.013745859776370389,
  -0.013857446033520683,
  0.04264720923588843),
 'CVS': (-0.05022274186294266,
  0.005594621627822279,
  0.008973516620688212,
  0.011561537828933866,
  0.0446633589965119,
  0.017842261031474533,
  -0.024971099051954545,
  0.04409756044799469,
  -0.01624418371899991,
  0.0330249673819058,
  0.012787728073133895,
  -0.0160985563955041,
  -0.0335257387019472,
  0.010041932141612914,
  0.015410080710573492,
  0.016808037255595702,
  0.019098128889383018,
  -0.004094450773941552,
  -0.009962178079944935),
 'CL': (0.03996922467824722,
  -0.008067971143087209,
  0.042591040949465

In [None]:
df["Ticker_diff"] = df["Ticker"].apply(lambda x: get_relative_change_vector(x))
display(df)

Unnamed: 0,Ticker,City,Country,Industry,Sector,RSI,MACD,Price Change,Ticker_diff
0,AVGO,San Jose,United States,Semiconductors,Technology,39.318896,4.656958,34.590179,"(0.06005442668691568, -0.011811220691882012, 0..."
1,CVS,Woonsocket,United States,Healthcare Plans,Healthcare,45.852464,5.168253,2.018501,"(-0.050222806554260896, 0.0055945454866221786,..."
2,CL,New York,United States,Household & Personal Products,Consumer Defensive,48.796158,0.318577,3.598969,"(0.03996935870909021, -0.008067910838663426, 0..."
3,GOOG,Mountain View,United States,Internet Content & Information,Communication Services,47.008815,3.879943,9.292500,"(0.013766060875893328, -0.020484615352151927, ..."
4,NEE,Juno Beach,United States,Utilities - Regulated Electric,Utilities,69.144812,6.025250,-2.202202,"(0.037666716460442264, -0.0212407277880222, 0...."
...,...,...,...,...,...,...,...,...,...
99,AMT,Boston,United States,REIT - Specialty,Real Estate,63.658766,12.329809,18.509796,"(0.06657973028320409, 0.00928642685288422, 0.0..."
100,V,San Francisco,United States,Credit Services,Financial Services,50.345642,9.965439,17.180374,"(0.027953639052215384, -0.03519922437421124, 0..."
101,ADBE,San Jose,United States,Software - Infrastructure,Technology,62.335515,20.564762,35.400024,"(0.009129843225702323, -0.03405055400208892, 0..."
102,MS,New York,United States,Capital Markets,Financial Services,36.306217,1.621525,4.878447,"(0.07210643843837956, -0.001475195504423739, 0..."


In [None]:
df.to_pickle('df_stocks.pkl')

## Experiments

In [1]:
import pandas as pd
df = pd.read_pickle('df_stocks.pkl')
df

Unnamed: 0,Ticker,City,Country,Industry,Sector,RSI,MACD,Price Change,Ticker_diff
0,AVGO,San Jose,United States,Semiconductors,Technology,39.318896,4.656958,34.590179,"(0.06005442668691568, -0.011811220691882012, 0..."
1,CVS,Woonsocket,United States,Healthcare Plans,Healthcare,45.852464,5.168253,2.018501,"(-0.050222806554260896, 0.0055945454866221786,..."
2,CL,New York,United States,Household & Personal Products,Consumer Defensive,48.796158,0.318577,3.598969,"(0.03996935870909021, -0.008067910838663426, 0..."
3,GOOG,Mountain View,United States,Internet Content & Information,Communication Services,47.008815,3.879943,9.292500,"(0.013766060875893328, -0.020484615352151927, ..."
4,NEE,Juno Beach,United States,Utilities - Regulated Electric,Utilities,69.144812,6.025250,-2.202202,"(0.037666716460442264, -0.0212407277880222, 0...."
...,...,...,...,...,...,...,...,...,...
99,AMT,Boston,United States,REIT - Specialty,Real Estate,63.658766,12.329809,18.509796,"(0.06657973028320409, 0.00928642685288422, 0.0..."
100,V,San Francisco,United States,Credit Services,Financial Services,50.345642,9.965439,17.180374,"(0.027953639052215384, -0.03519922437421124, 0..."
101,ADBE,San Jose,United States,Software - Infrastructure,Technology,62.335515,20.564762,35.400024,"(0.009129843225702323, -0.03405055400208892, 0..."
102,MS,New York,United States,Capital Markets,Financial Services,36.306217,1.621525,4.878447,"(0.07210643843837956, -0.001475195504423739, 0..."


In [2]:
df = df.drop(['Ticker', 'MACD', 'Price Change'], axis=1)
df

Unnamed: 0,City,Country,Industry,Sector,RSI,Ticker_diff
0,San Jose,United States,Semiconductors,Technology,39.318896,"(0.06005442668691568, -0.011811220691882012, 0..."
1,Woonsocket,United States,Healthcare Plans,Healthcare,45.852464,"(-0.050222806554260896, 0.0055945454866221786,..."
2,New York,United States,Household & Personal Products,Consumer Defensive,48.796158,"(0.03996935870909021, -0.008067910838663426, 0..."
3,Mountain View,United States,Internet Content & Information,Communication Services,47.008815,"(0.013766060875893328, -0.020484615352151927, ..."
4,Juno Beach,United States,Utilities - Regulated Electric,Utilities,69.144812,"(0.037666716460442264, -0.0212407277880222, 0...."
...,...,...,...,...,...,...
99,Boston,United States,REIT - Specialty,Real Estate,63.658766,"(0.06657973028320409, 0.00928642685288422, 0.0..."
100,San Francisco,United States,Credit Services,Financial Services,50.345642,"(0.027953639052215384, -0.03519922437421124, 0..."
101,San Jose,United States,Software - Infrastructure,Technology,62.335515,"(0.009129843225702323, -0.03405055400208892, 0..."
102,New York,United States,Capital Markets,Financial Services,36.306217,"(0.07210643843837956, -0.001475195504423739, 0..."


In [3]:
import sys
sys.path.append('emm')
from emm import EMM


In [4]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [6]:
import math
import numpy as np
population_trend = None
def metric(subgroup_target, dataset_target, use_complement=False): # Edited from EMM library, custom metric
    """

    :param subgroup_target:
    :param dataset_target:
    :param use_complement:
    :return:
    """
    def entropy(subgroup_target, dataset_target): # Given by EMM library
        """
        
        Args:
            subgroup_target:
            dataset_target:

        Returns:

        """
        n_c = max(1, len(dataset_target) - len(subgroup_target))
        n = len(subgroup_target)
        N = len(dataset_target)
        return -n/N * math.log(n/N) - n_c/N * math.log(n_c/N)
    def find_trend(df, col_x): # Edited from EMM library
        #avg_x = avg(df[col_x])
        #avg_y = avg(df[col_y])
        vec1 = np.zeros(19)
        for x, y in df[col_x].items():
            vec1 = np.add(vec1, np.array(list(y)))
        #print(vec1 / len(df))
        return vec1 / len(df)
        #top = df.apply(lambda row: (row[col_x] - avg_x) * (row[col_y] - avg_y), axis=1)
        #bottom_x = df.apply(lambda row: (row[col_x] - avg_x) ** 2, axis=1)
        #bottom_y = df.apply(lambda row: (row[col_y] - avg_y) ** 2, axis=1)
        #try:
            #return top.sum() / math.sqrt(bottom_x.sum() * bottom_y.sum())
        #except Warning:  # Both x.sum() and y.sum() equal zero
            #return 0
        # global population_trend
    global population_trend
    print("In correlation func")
    #if len(subgroup_target.columns) != 2:
    #    raise ValueError("Correlation metric expects exactly 2 columns as target variables")
    #x_col, y_col = list(subgroup_target.columns)
    print(subgroup_target)
    print(dataset_target)
    x_col = list(subgroup_target.columns)[0]
    if population_trend is None:
        population_trend = find_trend(dataset_target, x_col)
    # print(subgroup_target, x_col, y_col)
    subgroup_trend = find_trend(subgroup_target, x_col)
    corr_coeff = np.corrcoef(population_trend, subgroup_trend)[0][1]
    # if corr_coeff < 0.5:
    #     print(corr_coeff)
    #if math.isnan(r_gd):
        #return 0, 0
    entr = entropy(subgroup_target, dataset_target)
    # reverse_coeff = len(dataset_target) / len(subgroup_target)
    #print(entr)
    return entr * -corr_coeff, 1

In [7]:
target_columns = ['Ticker_diff']
target_col2 = ['City', 'City']
clf = EMM.EMM(width=8, depth=4, evaluation_metric=metric, strategy='maximize')
clf.search(df, target_cols=target_columns)
#clf.visualise(subgroups=5, cols=3, target_columns=target_col2) 

In correlation func
                                           Ticker_diff
0    (0.06005442668691568, -0.011811220691882012, 0...
1    (-0.050222806554260896, 0.0055945454866221786,...
2    (0.03996935870909021, -0.008067910838663426, 0...
3    (0.013766060875893328, -0.020484615352151927, ...
4    (0.037666716460442264, -0.0212407277880222, 0....
..                                                 ...
99   (0.06657973028320409, 0.00928642685288422, 0.0...
100  (0.027953639052215384, -0.03519922437421124, 0...
101  (0.009129843225702323, -0.03405055400208892, 0...
102  (0.07210643843837956, -0.001475195504423739, 0...
103  (0.022377365432651612, -0.00803229814453156, 0...

[104 rows x 1 columns]
                                           Ticker_diff
0    (0.06005442668691568, -0.011811220691882012, 0...
1    (-0.050222806554260896, 0.0055945454866221786,...
2    (0.03996935870909021, -0.008067910838663426, 0...
3    (0.013766060875893328, -0.020484615352151927, ...
4    (0.0376667164604

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=88720f49-c7ba-4446-9879-24b162efc293' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>