In [290]:
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.image as mpimg

# Préparation des données
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Modèles
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
import time

# Option d'affchage
pd.set_option('display.max_columns', None)

In [291]:
t0 = time.time()

In [None]:
df0 = pd.read_csv('/Users/lilian/Desktop/hackathon2021/data/train.csv')


Columns (13,32) have mixed types.Specify dtype option on import or set low_memory=False.



In [None]:
df = df0.copy()

In [None]:
df = df[df['store_id'].isin(list(df['store_id'].sample(10)))].copy()

In [None]:
df.shape

In [None]:
df.sample()

### Construction de variables intéressantes à partir des préexistantes

In [None]:
# Note moyenne

def compute_note(overall, count):
    try:
        overall / count
    except:
        return np.nan

df['note_moyenne'] = df.apply(lambda row : compute_note(row.sum_rating_overall, row.rating_count) , axis = 1)

In [None]:
# Lifetime

col_date = ['date', 'items_first_enabled_date', 'store_last_saving_date', 'store_first_saving_date',\
            'pickup_start', 'pickup_end']
for col in col_date :
    df[col]= pd.to_datetime(df[col])

df['lifetime'] = df['store_last_saving_date'] - df['store_first_saving_date']



In [None]:
# Reduction

df['reduction'] = 1 - df['item_price'] / df['before_price']

In [None]:
# Temps d'ouverture

df['temps_ouverture'] = df['pickup_end'] - df['pickup_start']

In [None]:
# Heure de début d'ouverture

df['heure_debut_ouverture'] = df.apply(lambda row : row.pickup_start.hour, axis = 1)


In [None]:
# Efficacité

df['efficacite'] = df['meals_saved'] / df['total_supply']

In [None]:
# Franchise

df['franchise'] = df.apply(lambda row : int(row.parent_chain_id > 0), axis = 1)

### Création de nouvelles variables

## Recodage de la variable objectif

In [None]:
def determine_absence_future(date, store_id):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] > date]
    
    serie = list(df_short['total_supply'])
    
    i = 0
    l = len(serie)
    try:
        while serie[i] == 0 and i < l:
            i+=1
    except:
        return 0
    
    return i
    

In [None]:
df['absence_future'] = df.apply(lambda row : determine_absence_future(row.date, row.store_id), axis = 1)

In [None]:
# Variance du nombre de vente pour un shop

def compute_variance(date, store_id, variable):
    
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    
    
    
    return df_short[variable].var()

df['variance_ventes'] = df.apply(lambda row : compute_variance(row.date, row.store_id, 'meals_saved'), axis = 1)

In [None]:
# Baisse du nombre de vente dans le mois précedent

def compute_baisse_vente(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = int(np.mean(serie[-30:]) / np.mean(serie[:-30]) < 1)
        
    except:
        b = 0
        
    return b

df['baisse_ventes'] = df.apply(lambda row : compute_baisse_vente(row.date, row.store_id, 'meals_saved'), axis = 1)

In [None]:
# Augmentation de la réduction durant la semaine précedente


def compute_hausse_reduction(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = int(np.mean(serie[-7:]) / np.mean(serie[:-7]) > 1)
        if b:
            print(ok)
        
    except:
        
        b = 0
        
    return b

df['hausse_reduction'] = df.apply(lambda row : compute_hausse_reduction(row.date, row.store_id, 'reduction'), axis = 1)


In [None]:
# Variance de la durée d'ouverture

def compute_variance(date, store_id, variable):
    
    df_short = df[df['store_id'] == store_id]
    #print(df_short.shape)
    df_short = df_short[df_short['date'] < date]
    display(df_short)
    serie = df_short[variable]
    #print(serie)
    variance = np.var([(i.seconds / 3600) for i in serie])
    
    return variance

#df['variance_duree_ouverture'] = df.apply(lambda row : compute_variance(row.date, row.store_id, 'temps_ouverture'), axis = 1)


In [None]:
# Baisse de la note moyenne durant la semaine précedente

def compute_baisse_note(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = np.mean(serie[-7:]) / np.mean(serie[:-7]) < 1
        
    except:
        
        b = 0
        
    return b

df['baisse_note'] = df.apply(lambda row : compute_baisse_note(row.date, row.store_id, 'note_moyenne'), axis = 1)


In [None]:
# Augmentation du nombre d'invendus durant le mois précédent

def compute_hausse_invendus(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = np.mean(serie[-30:]) / np.mean(serie[:-30]) > 1
        
    except:
        b = 0
        
    return b

df['hausse_reduction'] = df.apply(lambda row : compute_hausse_reduction(row.date, row.store_id, 'reduction'), axis = 1)

### Suppression des variables non intéressantes

In [None]:
df.columns

In [None]:
df = df.drop(columns = ['parent_chain_id', 'store_country', 'country_iso_code', 'region_id', 'store_activity_name', 'item_id', 'item_name', 'currency_code',
                       'pickup_end', 'pickup_start', 'declared_supply', 'manual_removed_supply', 'store_cancellation', 'item_price',
                       'meals_refunded', 'rating_count', 'sum_rating_overall', 'item_view', 'no_unique_consumers', 'is_enabled', 'Département', 'store_id', 'target'])
df.sample(10)

In [None]:
print((time.time() - t0)/ 60)

### Mise en forme des données

In [None]:
categ_var = ['store_region', 'store_segment']
for var in categ_var:
    df = pd.concat([df, pd.get_dummies(df[var], prefix = var)], axis = 1).drop(columns = [var])


In [None]:
df['lifetime'] = df['lifetime'].dt.days

In [None]:
m = min(df['items_first_enabled_date'])

date_m = df['date'] - m
items_first_enabled_date_m = df['items_first_enabled_date']-m
store_first_saving_date_m = df['store_first_saving_date']-m
store_last_saving_date_m = df['store_last_saving_date']-m

df.drop(columns = ['date'])
df.drop(columns = ['items_first_enabled_date'])
df.drop(columns = ['store_first_saving_date'])
df.drop(columns = ['store_last_saving_date'])

df['date'] = date_m.apply(lambda x: x.days)
df['items_first_enabled_date'] = items_first_enabled_date_m.apply(lambda x: x.days)
df['store_first_saving_date'] = store_first_saving_date_m.apply(lambda x: x.days)
df['store_last_saving_date'] = store_last_saving_date_m.apply(lambda x: x.days)


df['temps_ouverture'] = df['temps_ouverture'].apply(lambda x: x.seconds/3600)

In [None]:
df

In [None]:
df = df.drop(columns = ['before_price', 'note_moyenne', 'reduction', 'efficacite'])

In [None]:
df = df.dropna()

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()

df[list(df.columns)] = min_max_scaler.fit_transform(df[list(df.columns)])

In [None]:
df

## Application des modeles

In [None]:
# On définit les échantillons

y = df[['absence_future']]
X = df.drop(columns = ['absence_future'])

# On choisit un échantillon de validation de 20 %

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [None]:
N = X.shape[1]

## Regression linéaire

In [None]:
lin_reg = LinearRegression().fit(X_train, y_train)

# On prédit à partir de l'échantillon de test pour calculer les scores

y_pred = lin_reg.predict(X_test)

In [None]:
px.histogram(pd.DataFrame([
    (X.columns[i], lin_reg.coef_[0][i]) for i in range(N)
                    ]).T.rename(index = {0 : 'variable', 1 : 'coeff'}).T, x = 'variable', y = 'coeff', histfunc = 'sum'
            ).show()

print('MSE :', mean_squared_error(y_test, y_pred))

In [None]:
# Affichage des p-values

mod = sm.OLS(y,X)
fii = mod.fit()
p_values = fii.summary2().tables[1]['P>|t|']
pd.DataFrame(p_values).T

In [None]:
EN_reg = ElasticNet(alpha=.1, copy_X=True, fit_intercept = False, l1_ratio=.031)

EN_reg.fit(X_train, y_train)

In [None]:
y_pred = EN_reg.predict(X_test)
print('MSE : ', mean_squared_error(y_test, y_pred))

In [None]:
pd.DataFrame([(X.columns[i], EN_reg.coef_[i]) for i in range(N)]).T.rename(index = {0 : 'variable', 1 : 'coeff'})

## RF

In [None]:
# On définit d'abord une fonction qui donne l'importance des variables vis à vis d'un certain modèle

def feat_importance(model, x_train, y_train, X):
    """
    Renvoie le tableau de l'importance des variables vis à vis du modèle par la méthode des permutations
    """

    result = permutation_importance(
                                    model, 
                                    X, 
                                    y, 
                                    n_repeats = 3,
                                    random_state = 0
                                    )['importances_mean']
    
    importance = pd.DataFrame(result, index = X.columns, columns = ["Importance"])
    
    return importance.sort_values(by = ['Importance'], ascending = False)

In [None]:
# Création des essembles de tests et d'entrainement, on choisit une taile de test de 30% ici

X = df.drop(['absence_future'], axis = 1)
x = np.array(X)
y = np.array(df['absence_future'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 41)
# random_state correspond à la graine générant l'échantillon aléatoire

In [None]:
max_depth_ls = [1, 10, 13, 15, 17, 20, 25, 30] # profondeurs maximales des arbres de décision testées
mse_train_max_depth = []
mse_test_max_depth = []

# Pour chaque profondeur max, on regresse avec random forest

for m in max_depth_ls :
    
    print('Profondeur téstée : ', m)
    
    rf = RandomForestRegressor(
                            max_depth = m, 
                            random_state=0,
                            n_estimators = 30) # nombre d'arbres utilisés
    
    rf = rf.fit(x_train, y_train)
    y_pred_train = rf.predict(x_train)
    y_pred = rf.predict(x_test)
    
    mse_train_max_depth.append(mean_squared_error(y_train, y_pred_train))
    mse_test_max_depth.append(mean_squared_error(y_test, y_pred))

In [None]:
# On affiche ensuite les performances de la regression sur les deux échantillon (train et test)

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(max_depth_ls, mse_train_max_depth, color = 'red', label = 'Train')
plt.plot(max_depth_ls, mse_test_max_depth, color = 'blue', label = 'Test')
plt.title('MSE en fonction de max_depth')
plt.legend()
plt.show()

In [None]:
#On regarde la valeur qui minimise la MSE sur l'ensemble de test

max_depth_ls[mse_test_max_depth.index(min(mse_test_max_depth))]

In [None]:
# On regarde maintenant l'effet du nombre d'arbre sur l'effet de la regression

nb_estimators_ls = [1, 2, 3, 5, 20, 40, 50, 60, 80]
mse_train_nb_estimators = []
mse_test_nb_estimators = []

for m in nb_estimators_ls :
    print("Nombre d'arbres testés : ", m)
    rf = RandomForestRegressor(max_depth = 15, 
                               random_state = 0,
                                n_estimators = m)    
    
    rf = rf.fit(x_train, y_train)
    y_pred_train = rf.predict(x_train)
    y_pred = rf.predict(x_test)
    
    mse_train_nb_estimators.append(mean_squared_error(y_train, y_pred_train))
    mse_test_nb_estimators.append(mean_squared_error(y_test, y_pred))



In [None]:
# On affiche ensuite les performances de la regression sur les deux échantillon (train et test)

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(nb_estimators_ls, mse_train_nb_estimators, color = 'red', label = 'Train')
plt.plot(nb_estimators_ls, mse_test_nb_estimators, color = 'blue', label = 'Test')
plt.title('MSE en fonction de n_estimators')
plt.legend()
plt.show()

In [None]:
# On fait varier le nombre minimum d'exemple requis pour créer une feuille/noeud

samples_leaf_ls = [1, 2, 3, 4, 10]
mse_train_samples_leaf = []
mse_test_samples_leaf = []


for m in samples_leaf_ls :
    print('min_samples_leaf testé : ', m)
    rf = RandomForestRegressor( max_depth = 15, 
                                min_samples_leaf = m,
                                n_estimators = 60, 
                                random_state = 0
                              )    
    
    rf = rf.fit(x_train, y_train)
    y_pred_train = rf.predict(x_train)
    y_pred = rf.predict(x_test)
    
    mse_train_samples_leaf.append(mean_squared_error(y_train, y_pred_train))
    mse_test_samples_leaf.append(mean_squared_error(y_test, y_pred))

In [None]:
# On affiche ensuite les performances de la regression sur les deux échantillon (train et test)

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(samples_leaf_ls, mse_train_samples_leaf, color='red', label='Train')
plt.plot(samples_leaf_ls, mse_test_samples_leaf, color='blue', label='Test')
plt.title('MSE en fct de min samples leaf')
plt.legend()
plt.show()

In [None]:
max_leaf_ls = [2, 10, 100, 150, 200, 1000, 1500]
mse_train_max_leaf = []
mse_test_max_leaf = []



for m in max_leaf_ls :
    
    print('Nombre de feuilles max testé : ', m)
    rf = RandomForestRegressor(max_depth = 15, 
                               min_samples_leaf = 1, 
                               max_leaf_nodes = m,
                               n_estimators = 60)   
    
    rf = rf.fit(x_train, y_train)
    y_pred_train = rf.predict(x_train)
    y_pred = rf.predict(x_test)
    mse_train_max_leaf.append(mean_squared_error(y_train, y_pred_train))
    mse_test_max_leaf.append(mean_squared_error(y_test, y_pred))

In [None]:
# On affiche ensuite les performances de la regression sur les deux échantillon (train et test)

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(max_leaf_ls, mse_train_max_leaf, color = 'red', label = 'Train')
plt.plot(max_leaf_ls, mse_test_max_leaf, color = 'blue', label = 'Test')
plt.title('MSE en fonction max_leaf_nodes')
plt.legend()
plt.show()

In [None]:
# On a maintenant tous nos paramètres

rf = RandomForestRegressor(
                        max_depth = 15, 
                        min_samples_leaf = 1, 
                        max_leaf_nodes = 1000,
                        n_estimators = 60
                            )    

rf = rf.fit(x_train, y_train)
y_pred_train = rf.predict(x_train)
y_pred = rf.predict(x_test)
oo = np.zeros(y_pred.shape)

print('MSE train : ', mean_squared_error(y_train, y_pred_train))
print('MSE test : ', mean_squared_error(y_test, y_pred))
print('MSE modèle nulle : ', mean_squared_error(y_test, oo))

importance = feat_importance(rf, x_train, y_train, X)

In [None]:
importance.plot(kind = 'barh', figsize = (18, 14))

In [None]:
# On définit les échantillons

y = df[['absence_future']]
X = df.drop(columns = ['absence_future'])

In [None]:
# Construction de l'architecture du réseau

model = Sequential()

model.add(Dense(70, input_dim = N, activation = 'relu'))

model.add(Dense(12))
model.add(Dense(13))
model.add(Dense(15))
model.add(Dense(5))
model.add(Dense(1))

model.compile(optimizer = 'adam', loss = 'mse')

In [None]:
# Lancement de la phase d'apprentissage

history = model.fit(X, y, validation_split = 0.2,  epochs = 30)

In [None]:
# On affiche l'évolution de la loss au fil des époques pour les échantillons train et test.

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(history.history['loss'])
plt.title('Evolution de MSE sur X_train au fil des époques')
plt.ylabel('mse')
plt.legend(['train'], loc = 'upper left')
plt.show()

fig, ax = plt.subplots(figsize = (18, 8))
plt.plot(history.history['val_loss'])
plt.title('Evolution de MSE sur X_test au fil des époques')
plt.ylabel('mse')
plt.legend(['test'], loc = 'upper left')
plt.show()

In [None]:
perm = PermutationImportance(model, scoring = 'r2', random_state = 1).fit(X,y)
eli5.show_weights(perm, feature_names = X.columns.tolist(), top = N)

In [None]:
print((time.time() - t0)/ 60)