In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, roc_auc_score
from tqdm import tqdm

In [None]:
# Load the consultant master table. `.iloc[:, 1:]` discards the file's first
# column (presumably a saved row index — TODO confirm against the CSV).
maestro_consultora = (
    pd.read_csv('../input/datathon-belcorp-prueba/maestro_consultora.csv')
    .iloc[:, 1:]
)
maestro_consultora.head()

In [None]:
# Bucket the `edad` column into four quantile bins labelled Q1..Q4.
# With retbins=True, qcut also returns the bin edges; they are stored in
# `edad_labels` (edges, despite the variable's name).
edad, edad_labels = pd.qcut(
    maestro_consultora['edad'],
    q=4,
    labels=['Q1', 'Q2', 'Q3', 'Q4'],
    retbins=True,
)
maestro_consultora['edad_cuartil'] = edad

In [None]:
# Keep only the consultant attributes used downstream and one-hot encode the
# categorical ones: `edad_cuartil` becomes edad_cuartil_Q1..Q4, which a later
# cell selects by name.
consultora_cols = [
    'IdConsultora',
    'campanaingreso',
    'campanaultimopedido',
    'campanaprimerpedido',
    'edad_cuartil',
    'flagsupervisor',
]
consultora_filter = pd.get_dummies(maestro_consultora[consultora_cols])
consultora_filter.head()

In [None]:
# Load the product master table, discarding the file's first column
# (presumably a saved row index — TODO confirm against the CSV).
maestro_producto = (
    pd.read_csv('../input/datathon-belcorp-prueba/maestro_producto.csv')
    .iloc[:, 1:]
)
maestro_producto.head()

In [None]:
# Load the sales fact table (all columns kept, unlike the master tables above).
venta_campaña = pd.read_csv(
    '../input/datathon-belcorp-prueba/dtt_fvta_cl.csv'
)
venta_campaña.head()

In [None]:
#campaña_consultora['IdConsultora'].unique().shape, consultora_filter['IdConsultora'].unique().shape

In [None]:
# Load the consultant-by-campaign table (first column discarded) and order it
# by consultant and campaign; the later groupby/shift that builds the target
# depends on rows being in campaign order within each consultant.
campaña_consultora = (
    pd.read_csv('../input/datathon-belcorp-prueba/campana_consultora.csv')
    .iloc[:, 1:]
    .sort_values(by=['IdConsultora', 'campana'])
)
campaña_consultora.head()

In [None]:
# Attach the consultant-level attributes to every consultant/campaign row and
# drop rows that found no match (their campanaultimopedido is NaN after the
# left join).
# NOTE: this overwrites `campaña_consultora`, so re-running the cell without
# re-running the load cell would merge the same columns a second time.
consultora_features = consultora_filter[[
    'IdConsultora', 'campanaultimopedido', 'flagsupervisor',
    'edad_cuartil_Q1', 'edad_cuartil_Q2', 'edad_cuartil_Q3', 'edad_cuartil_Q4',
]]
campaña_consultora = (
    campaña_consultora
    .merge(consultora_features, on='IdConsultora', how='left')
    .dropna(subset=['campanaultimopedido'])
)

In [None]:
# Peek at the last rows after the merge and the dropna on campanaultimopedido.
campaña_consultora.tail()

In [None]:
# Per row: campanaultimopedido minus the row's campana.
# NOTE(review): campanaultimopedido comes from the consultant master table; if
# it can be later than the row's campana, this feature may leak the target
# (whether an order is placed in the next campaign) — confirm.
# NOTE(review): if campaign codes are YYYYCC-style integers, this difference is
# not a count of campaigns across year boundaries — TODO confirm.
campaña_consultora['diff_ultimopedido'] = (
    campaña_consultora['campanaultimopedido'].sub(campaña_consultora['campana'])
)

In [None]:
# Per (idconsultora, campana) totals of the sales metrics.
# The metric columns are selected with a *list*: selecting several GroupBy
# columns with a bare tuple (`gb['a', 'b']`) was deprecated in pandas 1.x and
# raises on pandas 2.x.
sum_cols = [
    'ahorro', 'realvtamncatalogo', 'realvtamnneto', 'realanulmnneto',
    'realdevmnneto', 'realuuanuladas', 'realuudevueltas', 'realuufaltantes',
    'realuuvendidas', 'realvtamnfaltneto',
]
campaña_agg1 = (
    venta_campaña
    .groupby(['idconsultora', 'campana'])[sum_cols]
    .sum()
    .reset_index()
)

In [None]:
# Align the key's capitalisation with campaña_consultora ('IdConsultora') so
# the two frames can be joined on the same column names later.
campaña_agg1 = campaña_agg1.rename(columns={'idconsultora': 'IdConsultora'})
campaña_agg1.head()

In [None]:
# Tag every aggregated metric with a `_sum` suffix; the two key columns keep
# their names. The order must match the column order of the groupby selection.
sum_metrics = [
    'ahorro', 'realvtamncatalogo', 'realvtamnneto', 'realanulmnneto',
    'realdevmnneto', 'realuuanuladas', 'realuudevueltas', 'realuufaltantes',
    'realuuvendidas', 'realvtamnfaltneto',
]
campaña_agg1.columns = ['IdConsultora', 'campana'] + [
    metric + '_sum' for metric in sum_metrics
]
campaña_agg1.columns

In [None]:
# Per (idconsultora, campana) means of the sales metrics.
# The metric columns are selected with a *list*: selecting several GroupBy
# columns with a bare tuple (`gb['a', 'b']`) was deprecated in pandas 1.x and
# raises on pandas 2.x.
mean_cols = [
    'ahorro', 'realvtamncatalogo', 'realvtamnneto', 'realanulmnneto',
    'realdevmnneto', 'realuuanuladas', 'realuudevueltas', 'realuufaltantes',
    'realuuvendidas', 'realvtamnfaltneto',
]
campaña_agg2 = (
    venta_campaña
    .groupby(['idconsultora', 'campana'])[mean_cols]
    .mean()
    .reset_index()
)

In [None]:
# Align the key's capitalisation with campaña_consultora ('IdConsultora') so
# the two frames can be joined on the same column names later.
campaña_agg2 = campaña_agg2.rename(columns={'idconsultora': 'IdConsultora'})
campaña_agg2.head()

In [None]:
# Tag every aggregated metric with a `_mean` suffix; the two key columns keep
# their names. The order must match the column order of the groupby selection.
mean_metrics = [
    'ahorro', 'realvtamncatalogo', 'realvtamnneto', 'realanulmnneto',
    'realdevmnneto', 'realuuanuladas', 'realuudevueltas', 'realuufaltantes',
    'realuuvendidas', 'realvtamnfaltneto',
]
campaña_agg2.columns = ['IdConsultora', 'campana'] + [
    metric + '_mean' for metric in mean_metrics
]
campaña_agg2.columns

In [None]:
# Put the `_sum` and `_mean` aggregates side by side, one row per
# (IdConsultora, campana) pair present in both frames (default inner join).
campaña_agg = pd.merge(
    campaña_agg1,
    campaña_agg2,
    on=['IdConsultora', 'campana'],
)
campaña_agg.head()

In [None]:
# Consultant ids seen in the sales aggregates (set_a), in the campaign table
# (set_b), and in both (inter_). None of the three is read again in the cells
# shown here.
ids_with_sales = campaña_agg['IdConsultora'].unique().tolist()
ids_in_campaigns = campaña_consultora['IdConsultora'].unique().tolist()
set_a = set(ids_with_sales)
set_b = set(ids_in_campaigns)
inter_ = set_a.intersection(set_b)

In [None]:
# Enrich every consultant/campaign row with its sales aggregates. The left join
# keeps rows that have no sales; their missing aggregates (and any other NaN in
# the frame) are replaced with 0. Rows are re-sorted by consultant and campaign
# because the target built next relies on that order.
campaña_merge = (
    campaña_consultora
    .merge(campaña_agg, on=['IdConsultora', 'campana'], how='left')
    .sort_values(by=['IdConsultora', 'campana'])
    .drop(columns=['codigocanalorigen'])
    .fillna(0)
)
campaña_merge.head()

In [None]:
# Target: each row gets the Flagpasopedido value of the *next* row of the same
# consultant (rows were sorted by campaign above). The last row of every
# consultant has no successor, so its Flag_shift is NaN.
next_flag = campaña_merge.groupby('IdConsultora')['Flagpasopedido'].shift(-1)
campaña_merge['Flag_shift'] = next_flag

# Drop object-dtype columns so only non-object features remain for the model.
campaña_merge = campaña_merge.select_dtypes(exclude='object')

In [None]:
# Rows without a target (NaN Flag_shift) are the ones to score for the
# submission; rows with a target form the modelling set. Both are indexed by
# (IdConsultora, campana) so the keys stay out of the feature columns.
key_cols = ['IdConsultora', 'campana']
has_no_target = campaña_merge['Flag_shift'].isna()

df_val = campaña_merge[has_no_target].set_index(key_cols)
df_model = campaña_merge[~has_no_target].set_index(key_cols)

In [None]:
# Separate the target column from the feature matrix.
y = df_model['Flag_shift']
X = df_model.drop(columns=['Flag_shift'])

In [None]:
# List the feature columns that will be fed to the model.
X.columns

In [None]:
# Random 80/20 row-level hold-out with a fixed seed.
# NOTE(review): rows are consultant/campaign pairs, so rows of the same
# consultant can land on both sides of the split — consider a group- or
# time-based split if that inflates the hold-out metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## LightGBM

In [None]:
# Gradient-boosted classifier: many trees with a small learning rate, all
# available cores, fixed seed. Fitted on the training part of the hold-out split.
lgbm_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'n_jobs': -1,
    'random_state': 1234,
}
lgbm_class = lgb.LGBMClassifier(**lgbm_params)
lgbm_class.fit(X_train, y_train)

In [None]:
# Hard class predictions for the hold-out rows; scored with accuracy_score in a later cell.
predict_lgbm = lgbm_class.predict(X_test)

In [None]:
# Share of hold-out rows whose predicted class matches the true label.
accuracy_score(y_true=y_test, y_pred=predict_lgbm)

In [None]:
# predict_proba returns one column per class; keep column 1 (the second class
# in the classifier's class ordering) as the score used for ROC AUC.
test_probabilities = lgbm_class.predict_proba(X_test)
predict_probs = test_probabilities[:, 1]

In [None]:
# ROC AUC of the hold-out probabilities.
# Stored as `roc_auc` rather than `auc`: the notebook imports the function
# `auc` from sklearn.metrics, and rebinding that name to a float would make any
# later call to `auc(...)` fail until the imports cell is re-run.
roc_auc = roc_auc_score(y_test, predict_probs)
roc_auc

In [None]:
# Feature importances of the single fitted LightGBM model, largest first.
# The frame is built and sorted once (the previous version sorted the zipped
# pairs and then sorted the frame again), and the columns are taken from
# X_train, i.e. exactly what the model was fitted on.
feature_imp = (
    pd.DataFrame({
        'Value': lgbm_class.feature_importances_,
        'Feature': X_train.columns,
    })
    .sort_values(by='Value', ascending=False)
)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='Value', y='Feature', data=feature_imp, ax=ax)
# The model is fitted once on the hold-out split; there are no CV folds to
# average over, so the title no longer claims "avg over folds".
ax.set_title('LightGBM feature importances')
fig.tight_layout()

### Cross-validation

# Submission

In [None]:
# Inspect the columns of the rows to be scored (these still include Flag_shift, which is NaN here).
df_val.columns

In [None]:
#df_val.drop('campanaultimopedido', axis=1, inplace=True)

In [None]:
# Load the submission template; its `idconsultora` column lists the consultants
# that must be scored and fixes the row order of the final file.
predict_bel = pd.read_csv(
    '../input/datathon-belcorp-prueba/predict_submission.csv'
)
predict_bel.head()

In [None]:
# Pick, from the unlabelled rows, those belonging to the consultants listed in
# the submission template (matching on the first index level, IdConsultora),
# then remove the empty target column so only features remain.
submission_ids = predict_bel['idconsultora'].tolist()
df_sub_tmp = df_val.loc[submission_ids].drop(columns=['Flag_shift'])

In [None]:
# Sanity check: columns present in only one of the two frames. An empty set
# means the submission features match the training features by name.
set(df_sub_tmp.columns.tolist()) ^ set(X_train.columns.tolist())

In [None]:
# Inspect the feature columns of the submission frame.
df_sub_tmp.columns

## Submission 1

In [None]:
# Score the submission rows with the probability of the second class.
# The model is fed exactly the training columns, in training order. This keeps
# the cell re-runnable: after the first run `df_sub_tmp` also holds the
# `Flag_shift` column written below, which would otherwise be passed to the
# model as an extra feature on a second run.
df_sub_tmp['Flag_shift'] = lgbm_class.predict_proba(df_sub_tmp[X_train.columns])[:, 1]
df_sub_tmp.head()

In [None]:
# Build the submission frame: keep the consultant id and its score, reorder the
# rows to follow the submission template, and rename both columns to the
# template's lower-case names.
template_ids = predict_bel['idconsultora'].tolist()
df_sub = (
    df_sub_tmp
    .reset_index()[['IdConsultora', 'Flag_shift']]
    .set_index('IdConsultora')
    .loc[template_ids]
    .reset_index()
    .rename(columns={'IdConsultora': 'idconsultora', 'Flag_shift': 'flagpasopedido'})
)
df_sub.head()

In [None]:
# Compare the shape of the built submission with the shape of the template.
df_sub.shape, predict_bel.shape

In [None]:
#df_sub.to_csv('watermelon3.csv', encoding='utf-8', index=None)