In [None]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import joblib            as jb
import seaborn           as sns

from sklearn.feature_selection        import RFE
from sklearn.feature_selection        import VarianceThreshold
from sklearn.preprocessing            import MinMaxScaler

from lightgbm                         import LGBMClassifier
from sklearn.ensemble                 import RandomForestClassifier

%matplotlib inline

## Carregar dataset preparado e dividido

In [None]:
# Load the train/test features and targets from the .pkl.z files under ../dados.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only point these paths at files from a trusted source.
X_train, y_train = (
    jb.load("../dados/treino/X_train.pkl.z"),
    jb.load("../dados/treino/y_train.pkl.z"),
)
X_test, y_test = (
    jb.load("../dados/teste/X_teste.pkl.z"),
    jb.load("../dados/teste/y_teste.pkl.z"),
)

In [None]:
# Keep the feature names at hand; later cells use them to label the rankings.
colnames = X_train.columns

# Quick look at the first rows of the features, then of the target.
for preview in (X_train, y_train):
    print(preview.head())
print(colnames)

In [None]:
# Per-method feature scores, keyed by method name; filled in by later cells.
ranks = {}


def ranking(ranks, names, order=1):
    """Min-max scale raw rank values and map them onto feature names.

    Args:
        ranks: sequence of numeric rank values, one per feature.
        names: feature names, in the same order as ``ranks``.
        order: multiplier applied before scaling; pass -1 to flip the
            scale so the smallest raw value ends up with the top score.

    Returns:
        dict mapping each name to its scaled value rounded to 2 decimals.
    """
    # MinMaxScaler works column-wise, so present the values as one column.
    as_column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(as_column).T[0]
    return {name: round(value, 2) for name, value in zip(names, scaled)}

## Verifica melhores features para LGBM

In [None]:
# Recursive Feature Elimination driven by a default LightGBM classifier,
# discarding one feature per iteration (step=1).
rfe = RFE(LGBMClassifier(), step=1)

# Fit on the training split; this is where the refit/eliminate loop runs.
fit = rfe.fit(X_train, y_train)

# Convert RFE's ranking_ into a scaled score per feature. order=-1 flips
# the scale so that the lowest rank number gets the highest score.
lgbm_raw_ranks = [float(rank) for rank in rfe.ranking_]
ranks["LGBM"] = ranking(lgbm_raw_ranks, colnames, order=-1)

# Show every ranking collected so far.
print('Ranking ', ranks)

## Verifica melhores features para RF

In [None]:
# Random Forest configuration.
# NOTE(review): `mdl_rf` is not referenced by any other cell visible in this
# notebook -- the RFE cell below builds its own RandomForestClassifier.
mdl_rf = RandomForestClassifier(
    n_estimators=2000,
    max_depth=64,
    min_samples_split=200,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=2,
    n_jobs=6,
)

In [None]:
# Recursive Feature Elimination driven by a Random Forest, discarding one
# feature per iteration (step=1). The forest is seeded so the ranking is
# reproducible across kernel restarts -- an unseeded forest can rank the
# features differently on every run. The seed matches the one in `mdl_rf`.
rfe = RFE(RandomForestClassifier(random_state=2), step=1)
fit = rfe.fit(X_train, y_train)

# Convert RFE's ranking_ into a scaled score per feature. order=-1 flips
# the scale so that the lowest rank number gets the highest score.
ranks["RF"] = ranking([float(rank) for rank in rfe.ranking_], colnames, order=-1)

print('Ranking RF ', ranks["RF"])
print('Todos Rankins ', ranks)

## Formatar a exibição das médias das features

In [None]:
# Average each feature's score across the individual methods.
# "Media" is excluded from the inputs so that re-running this cell neither
# folds a previously stored average into the new one nor lists the
# "Media" column twice in the table below.
methods = sorted(method for method in ranks if method != "Media")

r = {
    name: round(np.mean([ranks[method][name] for method in methods]), 2)
    for name in colnames
}

ranks["Media"] = r
methods.append("Media")

# Tab-separated table: one row per feature, one column per method + mean.
print("\t%s" % "\t".join(methods))
for name in colnames:
    row = "\t".join(str(ranks[method][name]) for method in methods)
    print("%s\t%s" % (name, row))

## Exibição do Gráfico (ranking das features)

In [None]:
# Mean score per feature, highest first, ready for plotting.
meanplot = pd.DataFrame(
    {'Feature': list(r.keys()), 'Ranking Media': list(r.values())}
).sort_values('Ranking Media', ascending=False)

# Optional white background, left disabled:
# sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})

# Horizontal bar chart: one bar per feature, length = mean score.
sns.catplot(
    data=meanplot,
    x="Ranking Media",
    y="Feature",
    kind="bar",
    palette='coolwarm',
    height=14,
    aspect=1.9,
)

## Variance Threshold

In [None]:
# Model-free alternative for analysing features: keep only those whose
# training-set variance exceeds a fixed cutoff.
VARIANCE_THRESHOLD = 0.150  # single source of truth for the cutoff

selection = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
X_new = selection.fit_transform(X_train)

# Ask the fitted selector which columns it kept instead of re-applying the
# cutoff by hand, so the report can never drift from the actual selection.
index = np.where(selection.get_support())

# Per-feature variance, followed by the surviving and the full column sets.
for name, variance in zip(colnames, selection.variances_):
    print(name, variance)
print('Colunas Escolhidas', colnames[index])
print('Todas as colunas ', colnames)

In [None]:
# Drop the features judged least significant from both splits.
dropFeature = ['FRETE']
X_train = X_train.drop(columns=dropFeature)
X_test = X_test.drop(columns=dropFeature)

# Preview the reduced test set (swap in X_train.head() to inspect train).
X_test.head()

In [None]:
# Persist the datasets with the selected features.
# Training split:
jb.dump(X_train, "../dados/treino/X_trainFinal.pkl.z")
jb.dump(y_train, "../dados/treino/y_trainFinal.pkl.z")
# Test split:
jb.dump(X_test, "../dados/teste/X_testeFinal.pkl.z")
jb.dump(y_test, "../dados/teste/y_testeFinal.pkl.z")