# Librerías

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import graphviz

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Colormaps for plotting: a diverging red/blue map and a two-colour
# (pure red / pure blue) discrete map.
# NOTE(review): neither name is referenced in the cells visible here — confirm
# they are still needed.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

import warnings
# Suppresses every Python warning for the rest of the session. This also hides
# deprecation notices and pandas/xgboost warnings that may point at real problems.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

# Análisis exploratorio

In [2]:
# Load the raw dataset. `rankings` is the working frame that the following
# cells filter, clip and encode; `data` is an untouched copy that the modelling
# cell later joins back in to report the original (unprocessed) values.
# NOTE(review): the head() preview shows an 'Unnamed: 0' header. If that is a
# real column (a saved index) rather than an export artifact, it is never
# dropped and would end up among the model features — confirm.
rankings = pd.read_csv('Data/ds_index_next.csv')
data = rankings.copy()
# Last expression of the cell: displays (rows, columns).
rankings.shape

(87438, 43)

In [3]:
# Preview the first five rows of the freshly loaded frame.
rankings.head()

Unnamed: 0,YEAR,CODE,VQ_MARKET_CAP,CSR__ROE,VR__EV_TO_EBIT,CSR__ROC_JOEL_GREENBLATT,CSR__OPERATING_MARGIN,VQ__PIOTROSKI_F_SCORE,VR__PS_RATIO,VR__PB_RATIO,...,RISK,INDUSTRY01,INDUSTRY02,RETURNS_NEXT,RETURNS_CAT,RETURNS_NEXT_CAT,INDEX_RETURNS,INDEX_RETURNS_NEXT,RETURNS_vs_INDEX,RETURNS_vs_INDEX_NEXT
0,2014,ETE,31002.7009,72.22,28.98,8.0,5.1,7,0.56,46.73,...,3.47292,,,-0.512675,POSITIVE,NEGATIVE,0.123866,-0.006928,WINS,LOSES
1,2010,ETE,8712.4952,317.83,24.53,9.21,15.92,5,1.33,72.37,...,0.730981,,,0.045082,POSITIVE,POSITIVE,0.110019,-0.01122,WINS,WINS
2,2015,ETE,14384.6808,0.0,28.67,5.9,7.28,7,0.41,0.0,...,6.159618,,,0.897959,NEGATIVE,POSITIVE,-0.006928,0.112374,LOSES,WINS
3,2012,ETE,12732.3534,27.89,19.82,11.46,8.02,5,0.73,6.03,...,0.649843,,,0.65798,POSITIVE,POSITIVE,0.116776,0.263905,WINS,WINS
4,2007,ETE,8191.2308,2122.84,25.13,14.74,11.92,5,1.16,0.0,...,0.797453,,,-0.502987,POSITIVE,NEGATIVE,0.036538,-0.375847,WINS,LOSES


## Preprocesamiento de variables

In [4]:
# Keep only rows whose YEAR lies in 1990..2018 (both inclusive). The filter is
# applied in place, so `rankings` itself shrinks; `data` is not affected.
rankings.query('YEAR >= 1990 & YEAR <= 2018', inplace=True)

In [5]:
# Tighten the target label: a row stays 'WINS' only when RETURNS_NEXT_CAT is
# 'POSITIVE' and RETURNS_vs_INDEX_NEXT is already 'WINS'. Every other
# combination is relabelled 'LOSES'.
next_is_positive = rankings['RETURNS_NEXT_CAT'] == 'POSITIVE'
next_beats_index = rankings['RETURNS_vs_INDEX_NEXT'] == 'WINS'
rankings['RETURNS_vs_INDEX_NEXT'] = np.where(next_is_positive & next_beats_index, 'WINS', 'LOSES')

In [6]:
# Columns removed from the working frame before modelling.
# NOTE(review): presumably the *_NEXT columns are dropped because they would
# leak the target and the INDUSTRY columns because they are empty — confirm.
columns_to_discard = [
    'INDUSTRY01',
    'INDUSTRY02',
    'RETURNS_NEXT',
    'RETURNS_NEXT_CAT',
    'INDEX_RETURNS_NEXT',
]
rankings.drop(columns=columns_to_discard, inplace=True)

In [7]:
# Clip outliers year by year using Tukey fences: within each YEAR, values below
# Q1 - 1.5*IQR are raised to that fence and values above Q3 + 1.5*IQR are
# lowered to it. Fences are computed from the values as they are before that
# (column, year) slice is clipped.

# Columns left untouched by the clipping.
no_clip_columns = ['YEAR', 'CODE', 'VQ__PIOTROSKI_F_SCORE', 'RETURNS_CAT',
                   'RETURNS_vs_INDEX', 'RETURNS_vs_INDEX_NEXT']
clip_columns = [c for c in rankings.columns if c not in no_clip_columns]

# One boolean row mask per year, built once instead of once per (column, year).
year_masks = {y: rankings['YEAR'] == y for y in rankings['YEAR'].unique()}

for c in clip_columns:
    # Fences are usually fractional; an integer column cannot hold them, so
    # upcast it before writing.
    if pd.api.types.is_integer_dtype(rankings[c]):
        rankings[c] = rankings[c].astype(float)

    for y, in_year in year_masks.items():
        tmp = rankings.loc[in_year, c]
        q1 = tmp.quantile(0.25)
        q3 = tmp.quantile(0.75)
        iqr = q3 - q1  # Interquartile range
        fence_low = q1 - 1.5 * iqr
        fence_high = q3 + 1.5 * iqr

        # Write through .loc: the previous chained form `rankings[c][mask] = v`
        # assigns into an intermediate Series and is not guaranteed to update
        # `rankings` (it is a silent no-op under pandas Copy-on-Write).
        # If the year slice is all-NaN the fences are NaN, both comparisons are
        # False everywhere and nothing is changed.
        rankings.loc[in_year & (rankings[c] < fence_low), c] = fence_low
        rankings.loc[in_year & (rankings[c] > fence_high), c] = fence_high

In [8]:
# Replace the three string label columns with integer codes. The same encoder
# object is refitted for each column, so after this cell `le` holds the classes
# of the last column processed (RETURNS_vs_INDEX_NEXT).
le = LabelEncoder()
for label_column in ('RETURNS_CAT', 'RETURNS_vs_INDEX', 'RETURNS_vs_INDEX_NEXT'):
    rankings[label_column] = le.fit_transform(rankings[label_column])

In [9]:
# CODE is removed from the working frame in place; the untouched `data` copy
# still carries it.
rankings.drop(columns=['CODE'], inplace=True)

# Algoritmo

In [10]:
# Walk-forward classification: for every year x in 2000..2017 a model is fitted
# on the rows of the ten preceding years (x-10 .. x-1) and then used to score
# the rows of year x. The per-year scored tables are stacked and written to CSV.

# Loop-invariant settings, defined once.
n_folds = 5
early_stopping = 10

params = {
    'eta': 0.015,
    'max_depth': 10,
    'min_child_weight': 6,
    'subsample': 0.7,
    # Was misspelled 'colsample_bytre', a key XGBoost does not recognise.
    'colsample_bytree': 1,
    'objective': 'binary:logistic',
    'seed': 99,
    # NOTE(review): newer XGBoost releases replace 'silent' with 'verbosity';
    # left unchanged here — confirm against the installed version.
    'silent': 1,
    'eval_metric': 'error',
    'nthread': 4}

# Source column -> name used in the exported report (order is the export order).
report_columns = {
    'YEAR': 'YEAR',
    'CODE': 'CODE',
    'VQ_MARKET_CAP': 'CAP',
    'IS_EPS_BASIC': 'EPS',
    'IS__REVENUE': 'REVENUE',
    'VR_PE_RATIO': 'PER',
    'VQ__PIOTROSKI_F_SCORE': 'PIOTROSKI',
    'PRICE_START': 'PRICE_START',
    'PRICE_END': 'PRICE_END',
    'RETURNS': 'RETURNS',
    'RISK': 'RISK',
    'prediction': 'PREDICTION',
    'prediction_probability': 'PROBABILITY',
    'prediction_order': 'PROBABILITY_ORDER',
}

# Collect one frame per year and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration.
yearly_reports = []
for x in range(2000, 2018):
    # Training window: the ten years before x.
    rankingsT = rankings.query('YEAR >= @x-10 & YEAR <= @x-1').drop('YEAR', axis=1)
    X = rankingsT.drop(['RETURNS_vs_INDEX_NEXT'], axis=1)
    y = rankingsT['RETURNS_vs_INDEX_NEXT']

    # Only the 80% training part is used below. The 20% hold-out is never
    # evaluated, but the split is kept so the fitted models are unchanged.
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=10, test_size=0.20)
    xg_train = xgb.DMatrix(X_train, label=y_train)

    # With early stopping the CV history has one row per retained boosting
    # round, so its length is used as the number of rounds for the final fit.
    cv = xgb.cv(params, xg_train, 300, nfold=n_folds, early_stopping_rounds=early_stopping)
    model = xgb.train(params=params, dtrain=xg_train, num_boost_round=cv.shape[0])

    # Score year x using the same feature columns as the training matrix.
    result = rankings.query('YEAR == @x').drop(['YEAR', 'RETURNS_vs_INDEX_NEXT'], axis=1)
    result_predict = model.predict(xgb.DMatrix(result))
    result['prediction_probability'] = result_predict
    result['prediction'] = [round(value) for value in result_predict]

    # Inner join on the row index: attaches the predictions to the original,
    # unprocessed values in `data` for exactly the rows that were scored.
    save = pd.concat([data, result[['prediction', 'prediction_probability']]], axis=1, join='inner')
    # Dense rank within the year, 1 = highest predicted probability.
    save['prediction_order'] = save.groupby(by=['YEAR'])['prediction_probability'] \
        .transform(lambda probs: probs.rank(method='dense', ascending=False))
    save = (save[list(report_columns)]
            .rename(columns=report_columns)
            .sort_values(['YEAR', 'PROBABILITY_ORDER'], ascending=[True, True]))

    yearly_reports.append(save)

final = pd.concat(yearly_reports)
final.to_csv('Results/metrics_XGB_KF_W10_CLASS_V2.csv', index=False)