In [341]:
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.image as mpimg

# Préparation des données
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Modèles
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# Option d'affchage
pd.set_option('display.max_columns', None)

In [342]:
df0 = pd.read_csv('/Users/lilian/Desktop/hackathon2021/data/train.csv')


Columns (13,32) have mixed types.Specify dtype option on import or set low_memory=False.



In [343]:
df = df0.copy()

In [344]:
df = df[df['store_id'].isin(list(df['store_id'].sample(10)))].copy()

In [345]:
df.shape

(3952, 34)

In [346]:
df.sample()

Unnamed: 0,date,store_id,parent_chain_id,store_country,country_iso_code,region_id,store_region,store_segment,store_activity_name,items_first_enabled_date,store_first_saving_date,store_last_saving_date,item_id,item_name,before_price,currency_code,pickup_start,pickup_end,total_supply,declared_supply,manual_added_supply,manual_removed_supply,meals_saved,consumer_cancellation,store_cancellation,item_price,meals_refunded,rating_count,sum_rating_overall,item_view,no_unique_consumers,is_enabled,Département,target
1179625,2019-05-26,28670,,France,FR,FR10,Ile-De-France,Bakery,Unknown,2018-01-25,2018-01-25,2019-10-15,28833,,12.0,EUR,2019-05-26 17:00:00.000000,2019-05-26 18:30:00.000000,2,2,0,0,2,0,0,4.0,0,0,0,0,2,True,75,0.0


### Construction de variables intéressantes à partir des préexistantes

In [347]:
# Note moyenne

def compute_note(overall, count):
    try:
        overall / count
    except:
        return np.nan

df['note_moyenne'] = df.apply(lambda row : compute_note(row.sum_rating_overall, row.rating_count) , axis = 1)

In [348]:
# Lifetime

col_date = ['date', 'items_first_enabled_date', 'store_last_saving_date', 'store_first_saving_date',\
            'pickup_start', 'pickup_end']
for col in col_date :
    df[col]= pd.to_datetime(df[col])

df['lifetime'] = df['store_last_saving_date'] - df['store_first_saving_date']



In [349]:
# Reduction

df['reduction'] = 1 - df['item_price'] / df['before_price']

In [350]:
# Temps d'ouverture

df['temps_ouverture'] = df['pickup_end'] - df['pickup_start']

In [352]:
# Heure de début d'ouverture

df['heure_debut_ouverture'] = df.apply(lambda row : row.pickup_start.hour, axis = 1)


In [353]:
# Efficacité

df['efficacite'] = df['meals_saved'] / df['total_supply']

In [354]:
# Franchise

df['franchise'] = df.apply(lambda row : int(row.parent_chain_id > 0), axis = 1)

### Création de nouvelles variables

In [355]:
# Variance du nombre de vente pour un shop

def compute_variance(date, store_id, variable):
    
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    
    
    
    return df_short[variable].var()

df['variance_ventes'] = df.apply(lambda row : compute_variance(row.date, row.store_id, 'meals_saved'), axis = 1)

In [None]:
# Baisse du nombre de vente dans le mois précedent

def compute_baisse_vente(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = int(np.mean(serie[-30:]) / np.mean(serie[:-30]) < 1)
        
    except:
        b = 0
        
    return b

df['baisse_ventes'] = df.apply(lambda row : compute_baisse_vente(row.date, row.store_id, 'meals_saved'), axis = 1)

In [None]:
# Augmentation de la réduction durant la semaine précedente


def compute_hausse_reduction(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = int(np.mean(serie[-7:]) / np.mean(serie[:-7]) > 1)
        if b:
            print(ok)
        
    except:
        
        b = 0
        
    return b

df['hausse_reduction'] = df.apply(lambda row : compute_hausse_reduction(row.date, row.store_id, 'reduction'), axis = 1)


In [None]:
# Variance de la durée d'ouverture

def compute_variance(date, store_id, variable):
    
    df_short = df[df['store_id'] == store_id]
    #print(df_short.shape)
    df_short = df_short[df_short['date'] < date]
    display(df_short)
    serie = df_short[variable]
    #print(serie)
    variance = np.var([(i.seconds / 3600) for i in serie])
    
    return variance

#df['variance_duree_ouverture'] = df.apply(lambda row : compute_variance(row.date, row.store_id, 'temps_ouverture'), axis = 1)


In [None]:
# Baisse de la note moyenne durant la semaine précedente

def compute_baisse_note(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = np.mean(serie[-7:]) / np.mean(serie[:-7]) < 1
        
    except:
        
        b = 0
        
    return b

df['baisse_note'] = df.apply(lambda row : compute_baisse_note(row.date, row.store_id, 'note_moyenne'), axis = 1)


In [None]:
# Augmentation du nombre d'invendus durant le mois précédent

def compute_hausse_invendus(date, store_id, variable):
    df_short = df[df['store_id'] == store_id]
    df_short = df_short[df_short['date'] < date]
    serie = df_short[variable]
    
    try:
        b = np.mean(serie[-30:]) / np.mean(serie[:-30]) > 1
        
    except:
        b = 0
        
    return b

df['hausse_reduction'] = df.apply(lambda row : compute_hausse_reduction(row.date, row.store_id, 'reduction'), axis = 1)

### Suppression des variables non intéressantes

In [None]:
df.columns

In [None]:
df = df.drop(columns = ['parent_chain_id', 'store_country', 'country_iso_code', 'region_id', 'store_activity_name', 'item_id', 'item_name', 'currency_code',
                       'pickup_end', 'pickup_start', 'declared_supply', 'manual_removed_supply', 'store_cancellation', 'item_price',
                       'meals_refunded', 'rating_count', 'sum_rating_overall', 'item_view', 'no_unique_consumers', 'is_enabled', 'Département', 'store_id'])
df.sample(10)

## Recodage de la variable objectif