# Librerías

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import graphviz

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Colormaps: matplotlib's built-in diverging red/blue map and a two-colour
# (pure red, pure blue) discrete map.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# Suppress every warning raised through Python's `warnings` module from here on.
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

# Análisis exploratorio

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Data/ds_index_next.csv')
# Show (rows, columns) as the cell output.
data.shape

(87438, 43)

In [3]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,YEAR,CODE,VQ_MARKET_CAP,CSR__ROE,VR__EV_TO_EBIT,CSR__ROC_JOEL_GREENBLATT,CSR__OPERATING_MARGIN,VQ__PIOTROSKI_F_SCORE,VR__PS_RATIO,VR__PB_RATIO,...,RISK,INDUSTRY01,INDUSTRY02,RETURNS_NEXT,RETURNS_CAT,RETURNS_NEXT_CAT,INDEX_RETURNS,INDEX_RETURNS_NEXT,RETURNS_vs_INDEX,RETURNS_vs_INDEX_NEXT
0,2014,ETE,31002.7009,72.22,28.98,8.0,5.1,7,0.56,46.73,...,3.47292,,,-0.512675,POSITIVE,NEGATIVE,0.123866,-0.006928,WINS,LOSES
1,2010,ETE,8712.4952,317.83,24.53,9.21,15.92,5,1.33,72.37,...,0.730981,,,0.045082,POSITIVE,POSITIVE,0.110019,-0.01122,WINS,WINS
2,2015,ETE,14384.6808,0.0,28.67,5.9,7.28,7,0.41,0.0,...,6.159618,,,0.897959,NEGATIVE,POSITIVE,-0.006928,0.112374,LOSES,WINS
3,2012,ETE,12732.3534,27.89,19.82,11.46,8.02,5,0.73,6.03,...,0.649843,,,0.65798,POSITIVE,POSITIVE,0.116776,0.263905,WINS,WINS
4,2007,ETE,8191.2308,2122.84,25.13,14.74,11.92,5,1.16,0.0,...,0.797453,,,-0.502987,POSITIVE,NEGATIVE,0.036538,-0.375847,WINS,LOSES


## Preprocesamiento de variables

In [4]:
# Working subset of `data`: the YEAR and CODE identifiers, the feature columns and
# RETURNS_NEXT.
# .copy() makes gb_plus an independent frame, so filtering it in place or adding
# columns to it never operates on a slice of `data`.
gb_plus = data[['YEAR', 'CODE', 'CSR__ROE', 'VR__EV_TO_EBIT', 'CSR__ROC_JOEL_GREENBLATT',
                'CSR__OPERATING_MARGIN', 'VQ__PIOTROSKI_F_SCORE', 'VR__PS_RATIO', 'VR__PB_RATIO',
                'VQ__ALTMAN_Z_SCORE', 'CSR__ROA', 'PSDA__EARNINGS_PER_SHARE_DILUTED', 'CSR__ROIC',
                'CSR__GROSS_MARGIN', 'VQ__YOY_EPS_GROWTH', 'VQ__YOY_EBITDA_GROWTH',
                'PSDA__EBITDA_PER_SHARE', 'PSDA__TOTAL_DEBT_PER_SHARE', 'CSR__NET_MARGIN',
                'IS__REVENUE', 'IS__NET_INCOME', 'IS__COST_OF_GOODS_SOLD', 'BS__TOTAL_EQUITY',
                'IS__EBITDA', 'BS__TOTAL_ASSETS', 'BS__LONG_TERM_DEBT', 'BS__TOTAL_CURRENT_LIABILITIES',
                'IS_EPS_BASIC', 'VR_PE_RATIO', 'RETURNS', 'RISK', 'RETURNS_NEXT']].copy()
gb_plus.head()

Unnamed: 0,YEAR,CODE,CSR__ROE,VR__EV_TO_EBIT,CSR__ROC_JOEL_GREENBLATT,CSR__OPERATING_MARGIN,VQ__PIOTROSKI_F_SCORE,VR__PS_RATIO,VR__PB_RATIO,VQ__ALTMAN_Z_SCORE,...,BS__TOTAL_EQUITY,IS__EBITDA,BS__TOTAL_ASSETS,BS__LONG_TERM_DEBT,BS__TOTAL_CURRENT_LIABILITIES,IS_EPS_BASIC,VR_PE_RATIO,RETURNS,RISK,RETURNS_NEXT
0,2014,ETE,72.22,28.98,8.0,5.1,7,0.56,46.73,1.44,...,22329.0,4564.0,64279.0,29477.0,6683.0,0.58,50.33,0.392368,3.47292,-0.512675
1,2010,ETE,317.83,24.53,9.21,15.92,5,1.33,72.37,1.05,...,6247.732,1390.0,17378.73,9346.067,1081.075,0.215,45.44,0.245752,0.730981,0.045082
2,2015,ETE,0.0,28.67,5.9,7.28,7,0.41,0.0,0.81,...,23613.0,4577.0,71189.0,36837.0,4910.0,1.11,12.27,-0.512675,6.159618,0.897959
3,2012,ETE,27.89,19.82,11.46,8.02,5,0.73,6.03,0.74,...,16350.0,3326.0,48904.0,21440.0,5845.0,0.285,37.04,0.120197,0.649843,0.65798
4,2007,ETE,2122.84,25.13,14.74,11.92,5,1.16,0.0,1.52,...,2091.156,1007.41,9462.094,5916.585,1241.433,0.39,30.33,0.144703,0.797453,-0.502987


In [5]:
# Keep the years 1991-2015 and only complete rows. The chain is rebound (not applied
# in place) and copied so the column assignment below targets an independent frame.
gb_plus = (
    gb_plus
    .query('YEAR > 1990 & YEAR < 2016')
    .dropna()
    .copy()
)

# Target: dense rank of RETURNS_NEXT within each year; rank 1 is the highest value.
gb_plus['RETURNS_NEXT_RANKING'] = (
    gb_plus.groupby(by=['YEAR'])['RETURNS_NEXT'].rank(method='dense', ascending=False)
)
gb_plus.head()

Unnamed: 0,YEAR,CODE,CSR__ROE,VR__EV_TO_EBIT,CSR__ROC_JOEL_GREENBLATT,CSR__OPERATING_MARGIN,VQ__PIOTROSKI_F_SCORE,VR__PS_RATIO,VR__PB_RATIO,VQ__ALTMAN_Z_SCORE,...,IS__EBITDA,BS__TOTAL_ASSETS,BS__LONG_TERM_DEBT,BS__TOTAL_CURRENT_LIABILITIES,IS_EPS_BASIC,VR_PE_RATIO,RETURNS,RISK,RETURNS_NEXT,RETURNS_NEXT_RANKING
0,2014,ETE,72.22,28.98,8.0,5.1,7,0.56,46.73,1.44,...,4564.0,64279.0,29477.0,6683.0,0.58,50.33,0.392368,3.47292,-0.512675,3510.0
1,2010,ETE,317.83,24.53,9.21,15.92,5,1.33,72.37,1.05,...,1390.0,17378.73,9346.067,1081.075,0.215,45.44,0.245752,0.730981,0.045082,809.0
2,2015,ETE,0.0,28.67,5.9,7.28,7,0.41,0.0,0.81,...,4577.0,71189.0,36837.0,4910.0,1.11,12.27,-0.512675,6.159618,0.897959,283.0
3,2012,ETE,27.89,19.82,11.46,8.02,5,0.73,6.03,0.74,...,3326.0,48904.0,21440.0,5845.0,0.285,37.04,0.120197,0.649843,0.65798,560.0
4,2007,ETE,2122.84,25.13,14.74,11.92,5,1.16,0.0,1.52,...,1007.41,9462.094,5916.585,1241.433,0.39,30.33,0.144703,0.797453,-0.502987,993.0


# Aproximación 1

In [6]:
# Modelling frame for this approach: everything in gb_plus except the CODE
# identifier and the raw RETURNS_NEXT value (its per-year rank is kept).
gb3 = gb_plus.drop(columns=['CODE', 'RETURNS_NEXT'])

In [7]:
# Outlier removal with 1.5 x IQR fences, computed separately for every year.
# Columns are processed one after another, so each column's fences are computed on
# the rows that survived the previous columns.
untouched = ['YEAR', 'VQ__PIOTROSKI_F_SCORE', 'RETURNS_NEXT_RANKING']
for col in [name for name in gb3.columns if name not in untouched]:
    outlier_labels = []
    for year, yearly in gb3.groupby('YEAR')[col]:
        lower_q = yearly.quantile(0.25)
        upper_q = yearly.quantile(0.75)
        spread = upper_q - lower_q  # interquartile range
        too_low = yearly < lower_q - 1.5 * spread
        too_high = yearly > upper_q + 1.5 * spread
        outlier_labels.extend(yearly.index[too_low | too_high])

    # Year groups are disjoint, so one drop per column removes the same rows as
    # dropping year by year.
    gb3.drop(index=outlier_labels, inplace=True)

In [8]:
# Replace every feature by its dense rank within the year (rank 1 = largest value).
# YEAR, the Piotroski score and the target ranking are passed through unchanged;
# the new <column>_RANKING columns are appended after them in the original order.
passthrough = ['YEAR', 'VQ__PIOTROSKI_F_SCORE', 'RETURNS_NEXT_RANKING']
to_rank = [name for name in gb3.columns if name not in passthrough]

ranked = gb3.groupby(by=['YEAR'])[to_rank].rank(method='dense', ascending=False)
ranked.columns = [name + '_RANKING' for name in to_rank]

gb3 = pd.concat([gb3.drop(columns=to_rank), ranked], axis=1)
gb3.head()

Unnamed: 0,YEAR,VQ__PIOTROSKI_F_SCORE,RETURNS_NEXT_RANKING,CSR__ROE_RANKING,VR__EV_TO_EBIT_RANKING,CSR__ROC_JOEL_GREENBLATT_RANKING,CSR__OPERATING_MARGIN_RANKING,VR__PS_RATIO_RANKING,VR__PB_RATIO_RANKING,VQ__ALTMAN_Z_SCORE_RANKING,...,IS__COST_OF_GOODS_SOLD_RANKING,BS__TOTAL_EQUITY_RANKING,IS__EBITDA_RANKING,BS__TOTAL_ASSETS_RANKING,BS__LONG_TERM_DEBT_RANKING,BS__TOTAL_CURRENT_LIABILITIES_RANKING,IS_EPS_BASIC_RANKING,VR_PE_RATIO_RANKING,RETURNS_RANKING,RISK_RANKING
13,2009,4,2564.0,225.0,189.0,127.0,140.0,143.0,153.0,177.0,...,213.0,210.0,222.0,220.0,181.0,225.0,203.0,216.0,54.0,18.0
37,2008,6,136.0,193.0,14.0,211.0,198.0,116.0,140.0,112.0,...,186.0,210.0,217.0,208.0,128.0,202.0,156.0,11.0,175.0,209.0
44,1998,5,401.0,91.0,55.0,79.0,108.0,73.0,80.0,49.0,...,88.0,103.0,103.0,107.0,79.0,101.0,86.0,60.0,113.0,89.0
46,1997,4,956.0,96.0,4.0,92.0,108.0,64.0,39.0,26.0,...,90.0,104.0,108.0,108.0,80.0,101.0,95.0,1.0,28.0,88.0
79,2006,6,132.0,3.0,62.0,20.0,137.0,100.0,17.0,109.0,...,126.0,158.0,151.0,146.0,116.0,104.0,106.0,99.0,56.0,169.0


In [9]:
# Walk-forward evaluation: for each target year x, fit XGBoost on the ranked features
# of the five preceding years (from gb3), score the companies of year x (from data),
# and write all per-year rankings to a single CSV.

# Raw columns needed to rebuild the feature matrix at prediction time (loop-invariant).
source_cols = ['YEAR', 'CODE', 'CSR__ROE', 'VR__EV_TO_EBIT', 'CSR__ROC_JOEL_GREENBLATT',
               'CSR__OPERATING_MARGIN', 'VQ__PIOTROSKI_F_SCORE', 'VR__PS_RATIO', 'VR__PB_RATIO',
               'VQ__ALTMAN_Z_SCORE', 'CSR__ROA', 'PSDA__EARNINGS_PER_SHARE_DILUTED', 'CSR__ROIC',
               'CSR__GROSS_MARGIN', 'VQ__YOY_EPS_GROWTH', 'VQ__YOY_EBITDA_GROWTH',
               'PSDA__EBITDA_PER_SHARE', 'PSDA__TOTAL_DEBT_PER_SHARE', 'CSR__NET_MARGIN',
               'IS__REVENUE', 'IS__NET_INCOME', 'IS__COST_OF_GOODS_SOLD', 'BS__TOTAL_EQUITY',
               'IS__EBITDA', 'BS__TOTAL_ASSETS', 'BS__LONG_TERM_DEBT', 'BS__TOTAL_CURRENT_LIABILITIES',
               'IS_EPS_BASIC', 'VR_PE_RATIO', 'RETURNS', 'RISK']
# Every source column is replaced by its per-year rank except these three.
rank_cols = [name for name in source_cols
             if name not in ['YEAR', 'VQ__PIOTROSKI_F_SCORE', 'CODE']]

# Model hyper-parameters (identical for every year).
# NOTE(review): 'reg:linear' and 'silent' look like legacy XGBoost parameter names;
# they are kept as-is so results do not change -- confirm against the installed version.
param_dist = {'objective': 'reg:linear',
              'max_depth': 5,
              'colsample_bytree': 0.7,
              'alpha': 10,
              'learning_rate': 0.15,
              'silent': 1}
n_folds = 3
early_stopping = 10

yearly_frames = []  # one ranked frame per year, concatenated once after the loop
for x in range(2000, 2018):
    # Training window: the five years before x.
    gbT = gb3.query('YEAR >= @x-5 & YEAR <= @x-1').drop('YEAR', axis=1)
    if gbT.empty:
        continue  # nothing to train on for this year
    X3 = gbT.drop(['RETURNS_NEXT_RANKING'], axis=1)
    y3 = gbT['RETURNS_NEXT_RANKING']

    feature_names = X3.columns

    scaler = StandardScaler().fit(X3)
    X3s = scaler.transform(X3)

    xg_train = xgb.DMatrix(X3s, label=y3)

    # Cross-validation with early stopping; the number of rows in `cv` is the number
    # of boosting rounds used for the final fit on the whole window.
    cv = xgb.cv(param_dist, xg_train, 500, nfold=n_folds, early_stopping_rounds=early_stopping)

    xg_reg_fold = xgb.train(params=param_dist, dtrain=xg_train, num_boost_round=cv.shape[0])

    # Prediction-time features: the rows of year x taken from the raw data, ranked the
    # same way as the training features. .copy() keeps this independent of `data`.
    save = data[source_cols].query('YEAR == @x').copy()
    ranked = save.groupby(by=['YEAR'])[rank_cols].rank(method='dense', ascending=False)
    ranked.columns = [name + '_RANKING' for name in rank_cols]

    # Same column order as the training matrix: Piotroski score first, then the rankings.
    result = pd.concat([save[['VQ__PIOTROSKI_F_SCORE']], ranked], axis=1).dropna()
    if result.empty:
        continue  # no complete rows to score for this year

    # NOTE(review): the scaler is re-fitted on the prediction rows instead of reusing
    # the one fitted on the training window -- confirm this is intended.
    scaler = StandardScaler().fit(result)
    result_s = scaler.transform(result)
    result_xg = xgb.DMatrix(result_s)
    result['prediction'] = xg_reg_fold.predict(result_xg)

    # Attach predictions to the raw rows (inner join on the index) and turn them into
    # a dense per-year rank, lowest predicted value first.
    save = pd.concat([data, result['prediction']], axis=1, join='inner')
    save['prediction'] = save.groupby(by=['YEAR'])['prediction'] \
        .rank(method='dense', ascending=True)
    save = save[['YEAR', 'CODE', 'VQ_MARKET_CAP', 'IS_EPS_BASIC', 'IS__REVENUE', 'VR_PE_RATIO',
                 'VQ__PIOTROSKI_F_SCORE', 'PRICE_START', 'PRICE_END', 'RETURNS', 'RISK',
                 'prediction']].copy()
    save.columns = ['YEAR', 'CODE', 'CAP', 'EPS', 'REVENUE', 'PER', 'PIOTROSKI',
                    'PRICE_START', 'PRICE_END', 'RETURNS', 'RISK', 'RANKING']
    save = save.sort_values(['YEAR', 'RANKING'], ascending=[True, True])

    yearly_frames.append(save)

# DataFrame.append was removed in pandas 2.0 and copies the accumulated frame on every
# call; a single concat gives the same rows in the same order.
final = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

final.to_csv('Results/metrics_XGB_KF_W5_ALLRANK.csv', index=False)