In [1]:
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.image as mpimg

# Préparation des données
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Modèles
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# Display option: never truncate the column list when rendering a DataFrame
pd.options.display.max_columns = None

In [4]:
# Raw training data, loaded once and kept untouched in df0.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
TRAIN_CSV_PATH = '/Users/lilian/Desktop/hackathon2021/data/train.csv'
df0 = pd.read_csv(TRAIN_CSV_PATH)

In [5]:
# Work on an independent (deep) copy so the raw frame df0 stays unmodified
df = df0.copy(deep=True)

In [7]:
# Peek at 10 randomly drawn rows (no random_state is passed, so the rows differ per run)
df.sample(n=10)

Unnamed: 0,date,store_id,parent_chain_id,store_country,country_iso_code,region_id,store_region,store_segment,store_activity_name,items_first_enabled_date,store_first_saving_date,store_last_saving_date,item_id,item_name,before_price,currency_code,pickup_start,pickup_end,total_supply,declared_supply,manual_added_supply,manual_removed_supply,meals_saved,consumer_cancellation,store_cancellation,item_price,meals_refunded,rating_count,sum_rating_overall,item_view,no_unique_consumers,is_enabled,Département,target
3405033,2019-03-13,34707,,France,FR,FRG0,Pays De La Loire,Traditional Restaurant,Commercial,2019-02-25,2019-02-25,2020-10-29,34870,,12.0,EUR,2019-03-13 17:00:00.000000,2019-03-13 18:00:00.000000,4,2,2,0,4,1,0,3.99,0,2,9,0,4,True,44,0.0
4385822,2020-04-01,37792,3314.0,France,FR,FRK1,Auvergne,Supermarket Medium,Commercial,2019-06-11,2019-06-11,2021-01-29,37956,,12.0,EUR,,,0,0,0,0,0,0,0,4.0,0,0,0,12,0,True,63,0.0
3880476,2020-03-07,36017,,France,FR,FRJ2,Midi-Pyrénées,Bakery,Commercial,2019-04-08,2019-04-08,2021-03-11,36180,,12.0,EUR,,,0,0,0,0,0,0,0,3.99,0,0,0,3,0,True,31,0.0
4449144,2019-07-10,38014,3534.0,France,FR,FRE1,Nord-Pas De Calais,Supermarket Medium,Commercial,2019-07-01,2019-07-01,2021-03-13,38178,,12.0,EUR,2019-07-10 10:00:00.000000,2019-07-10 17:00:00.000000,3,3,2,-2,3,0,0,4.0,0,1,3,0,2,True,59,0.0
3586087,2019-11-03,35180,3671.0,France,FR,FRE1,Nord-Pas De Calais,Bakery,Commercial,2019-03-19,2019-03-19,2021-02-01,35343,,12.0,EUR,,,0,0,0,0,0,0,0,3.99,0,0,0,0,0,True,62,0.0
249152,2019-01-06,25663,,France,FR,FR10,Ile-De-France,Fast Food / Take away restaurant,Commercial,2017-01-13,,,25797,,11.0,EUR,,,0,0,0,0,0,0,0,5.0,0,0,0,0,0,True,75,0.0
2721986,2019-06-23,32872,3810.0,France,FR,FRI1,Aquitaine,Bakery,Commercial,2018-11-06,2018-11-06,2021-03-15,34506,Entremets,,EUR,2019-06-23 11:30:00.000000,2019-06-23 12:00:00.000000,5,2,3,0,2,0,0,12.0,0,0,0,0,1,True,33,0.0
6255,2020-03-22,274,,France,FR,FR10,Ile-De-France,Buffet,Commercial,2019-06-18,2019-06-18,2020-10-29,342,Midi,12.0,EUR,,,0,0,0,0,0,0,0,3.99,0,0,0,2,0,True,95,0.0
1650679,2019-07-23,29930,3922.0,France,FR,FRG0,Pays De La Loire,Supermarket Medium,Commercial,2018-04-25,2018-04-25,2021-03-15,30093,,15.0,EUR,2019-07-23 15:30:00.000000,2019-07-23 17:00:00.000000,1,1,0,0,1,0,0,5.0,0,0,0,0,1,True,44,0.0
3865676,2019-04-05,35971,,France,FR,FRK1,Auvergne,Florist,Commercial,2019-04-04,2019-04-04,2019-06-24,36134,,15.0,EUR,2019-04-05 15:00:00.000000,2019-04-05 17:00:00.000000,1,1,0,0,1,0,0,3.99,0,0,0,0,1,True,3,0.0


### Construction de variables intéressantes à partir des préexistantes

In [17]:
# Note moyenne

def compute_note(overall, count):
    """Return the mean rating ``overall / count``, or NaN when it is undefined.

    Parameters
    ----------
    overall : number
        Sum of the ratings.
    count : number
        Number of ratings that were summed.

    Returns
    -------
    float
        ``overall / count``; ``np.nan`` when ``count`` is zero or when the
        division cannot be carried out (e.g. a missing / non-numeric operand).
    """
    try:
        # Explicit zero check: numpy scalars do not raise on division by zero,
        # they return inf/nan with a warning instead.
        if count == 0:
            return np.nan
        return overall / count
    except (ArithmeticError, TypeError, ValueError):
        # Narrower than a bare ``except`` so KeyboardInterrupt still propagates.
        return np.nan

# Mean rating per row = sum_rating_overall / rating_count, computed column-wise
# instead of with a row-by-row apply. A rating_count of 0 is mapped to NaN first,
# so rows without ratings get NaN rather than inf or a division warning.
df['note_moyenne'] = df['sum_rating_overall'] / df['rating_count'].replace(0, np.nan)

In [19]:
# Lifetime: time elapsed between a store's first and last saving dates

# Columns that hold dates / timestamps and must be parsed into datetimes
col_date = [
    'date',
    'items_first_enabled_date',
    'store_last_saving_date',
    'store_first_saving_date',
    'pickup_start',
    'pickup_end',
]
# Parse every listed column with pd.to_datetime, one column at a time
df[col_date] = df[col_date].apply(pd.to_datetime)

df['lifetime'] = df['store_last_saving_date'].sub(df['store_first_saving_date'])



In [20]:
# Discount rate: 1 minus the ratio of item_price to before_price

df['reduction'] = 1 - df['item_price'].div(df['before_price'])

In [21]:
# Opening time: length of the pickup window (pickup_end minus pickup_start)

df['temps_ouverture'] = df['pickup_end'].sub(df['pickup_start'])

In [26]:
# Hour of the day at which the pickup window starts.
# Uses the vectorised .dt accessor instead of a row-by-row apply; pickup_start must
# already be a datetime column (it is parsed with pd.to_datetime earlier in the notebook).
# Rows without a pickup_start (NaT) get NaN.

df['heure_debut_ouverture'] = df['pickup_start'].dt.hour


In [27]:
# "Efficiency": meals_saved minus total_supply (0 when the two are equal)
# NOTE(review): this is a difference, not a ratio — confirm that is the intended metric.

df['efficacite'] = df['meals_saved'].sub(df['total_supply'])

In [None]:
# Franchise

#df['franchise'] = df.apply(lambda row : int(row.parent_chain_id > 0), axis = 1)

### Suppression des variables non intéressantes

In [None]:
# Display the column labels currently present in df
df.columns

In [None]:
# Drop the two country columns. The original label 'country_' matches no column
# (the header row shows 'country_iso_code'), so the drop raised a KeyError.
# NOTE(review): the rows sampled above only contain France / FR — confirm these
# columns are constant over the full data set before discarding them.
df = df.drop(columns=['store_country', 'country_iso_code'])

In [None]:
# Peek at 10 randomly drawn rows of the current frame (no random_state is passed)
df.sample(n=10)

### Création de nouvelles variables

In [None]:
# Variance du nombre de ventes

In [None]:
# Baisse du nombre de ventes durant le mois précédent

In [None]:
# Augmentation de la réduction durant la semaine précédente

In [None]:
# Variance de la durée d'ouverture

In [None]:
# Baisse de la note moyenne durant la semaine précédente

In [None]:
# Augmentation du nombre d'invendus durant le mois précédent

## Recodage de la variable objectif