# Explication de la procédure finale

- Import des librairies et des packages implémentés pour le projet
- Import des données et feature engineering (ajout des features et construction des matrices finales)
- Correction des positions des bases
- Retirer les devices qui affectent la stabilité de nos prédictions
- Entraînement des algorithmes d'apprentissage et combinaison des résultats pour les prédictions grâce à un Voting Regressor
- Production des prédictions
- Export du fichier de prédictions



### Projet réalisé par :
- Mahmoud Benboubker
- Nicolas Calligaro
- Aïcha Lahlou


# Import de librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IpyTools import *
from IotTools import *
pd.options.mode.chained_assignment = None  

In [2]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_predict

# Import des données 

In [3]:
# Read the three input CSV files from the working directory, in this order:
# training messages, test messages, and the positions paired with the
# training messages (used further down to build the ground truth).
df_mess_train, df_mess_test, pos_train = (
    pd.read_csv(csv_name)
    for csv_name in ('mess_train_list.csv',
                     'mess_test_list.csv',
                     'pos_train_list.csv')
)

In [4]:
# Every base-station id (`bsid`) seen in either the training or the test
# messages. np.union1d returns the sorted, de-duplicated union, so the same
# base-station list is shared by both feature matrices.
listOfBs = np.union1d(
    df_mess_train['bsid'].unique(),
    df_mess_test['bsid'].unique(),
)

# Correction des bases

In [5]:
# Run the project's base-station correction helper on both message sets,
# training first and test second (same call order as before).
X_train, X_test = (
    Correct_Bases(messages)
    for messages in (df_mess_train, df_mess_test)
)

Nous avons 27 bases outliers
Base 9949 non vu
il reste 0 base avec lat >60
Nous avons 23 bases outliers
Correction manuelle de la bsid 9949
il reste 0 base avec lat >60


# Retirer les devices trop complexes à prédire

In [6]:
# Device ids (`did`) that are filtered out of the training messages in the
# next cell; the section title flags them as too complex to predict.
# The int/float mix of the literals is kept exactly as originally written.
a = [
    476212.,
    476830.,
    476861.,
    476256,
    477201,
    476829.,
    476609.,
    476327,
    476315,
    476835,
    476598,
    476889,
    474192,
    473288,
]

In [7]:
# Keep only the training messages whose device id is NOT in the exclusion list `a`.
X_mod = X_train.loc[~X_train['did'].isin(a)]

# Feature matrix for the retained messages, plus the id list returned with it.
df_feat, id_list = feat_mat_const(X_mod, listOfBs)

# Ground-truth table built from the same messages and the training positions;
# its `lat` / `lng` columns are the regression targets used when fitting.
y_full = ground_truth_const(X_mod, pos_train, id_list)

# Show both shapes side by side so a row-count mismatch is visible immediately.
(y_full.shape, df_feat.shape)

((4122, 3), (4122, 273))

# Entraînement des algorithmes d'apprentissage

## Combinaison des modèles avec un Voting Regressor

#### Prédiction de la longitude

In [8]:
# Longitude ensemble: five base regressors, each configured from the project's
# `get_hyperparameter(<estimator name>, 'lng')` lookup, then combined by a
# VotingRegressor (which averages the individual predictions).
r1 = RandomForestRegressor(**get_hyperparameter('RandomForestRegressor', 'lng'))
r2 = GradientBoostingRegressor(**get_hyperparameter('GradientBoostingRegressor', 'lng'))
# NOTE(review): ExtraTreeRegressor (sklearn.tree) is a single randomized tree,
# not the ExtraTreesRegressor ensemble — confirm this is the intended model.
r3 = ExtraTreeRegressor(**get_hyperparameter('ExtraTreeRegressor', 'lng'))
r4 = xgb.XGBRegressor(**get_hyperparameter('XGBRegressor', 'lng'))
r5 = BaggingRegressor(**get_hyperparameter('BaggingRegressor', 'lng'))

# Each label names the estimator it is attached to. These labels are the keys
# exposed through `named_estimators_` / `get_params()` and shown in the repr,
# so they must not point at a different model type.
Vr_lng = VotingRegressor(estimators=[
    ('Rf', r1),   # random forest
    ('Gb', r2),   # gradient boosting
    ('Et', r3),   # extra tree
    ('Xg', r4),   # XGBoost
    ('Bag', r5),  # bagging
])

#### Prédiction de la latitude

In [9]:
# Latitude ensemble: five base regressors, each configured from the project's
# `get_hyperparameter(<estimator name>, 'lat')` lookup, then combined by a
# VotingRegressor (which averages the individual predictions).
# r1..r5 are rebound here; the longitude ensemble keeps its own estimator objects.
r1 = RandomForestRegressor(**get_hyperparameter('RandomForestRegressor', 'lat'))
r2 = GradientBoostingRegressor(**get_hyperparameter('GradientBoostingRegressor', 'lat'))
# NOTE(review): ExtraTreeRegressor (sklearn.tree) is a single randomized tree,
# not the ExtraTreesRegressor ensemble — confirm this is the intended model.
r3 = ExtraTreeRegressor(**get_hyperparameter('ExtraTreeRegressor', 'lat'))
r4 = xgb.XGBRegressor(**get_hyperparameter('XGBRegressor', 'lat'))
r5 = BaggingRegressor(**get_hyperparameter('BaggingRegressor', 'lat'))

# Each label names the estimator it is attached to. These labels are the keys
# exposed through `named_estimators_` / `get_params()` and shown in the repr,
# so they must not point at a different model type.
Vr_lat = VotingRegressor(estimators=[
    ('Rf', r1),   # random forest
    ('Gb', r2),   # gradient boosting
    ('Et', r3),   # extra tree
    ('Xg', r4),   # XGBoost
    ('Bag', r5),  # bagging
])

#### Estimation de l'erreur

In [10]:
# Error estimate — currently disabled. `y_pred_lat` / `y_pred_lng` are not
# defined anywhere in the visible notebook, so the two lines below cannot run
# as they stand.
# NOTE(review): presumably these were out-of-fold predictions (cross_val_predict
# is imported at the top of the notebook but never called) — confirm before
# re-enabling. The second line would report the 80th percentile of the errors.
#err_vec = Eval_geoloc(y_full.lat , y_full.lng, y_pred_lat, y_pred_lng)
#np.percentile(err_vec, 80)

# Prédiction sur les données test

#### Entraînement des modèles finaux pour la latitude et la longitude

In [11]:
# Fit the latitude ensemble on the full (filtered) training feature matrix.
# The fitted estimator is the cell's last expression, so its repr is displayed.
Vr_lat.fit(X=df_feat, y=y_full.lat)

VotingRegressor(estimators=[('Et',
                             RandomForestRegressor(criterion='mae',
                                                   max_depth=10,
                                                   n_estimators=25)),
                            ('Rf',
                             GradientBoostingRegressor(max_depth=4,
                                                       subsample=0.8)),
                            ('Gb',
                             ExtraTreeRegressor(criterion='friedman_mse',
                                                max_depth=8, splitter='best')),
                            ('Xg',
                             XGBRegressor(base_score=None, booster='gbtree',
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_...
                                          learning_rate=0.1,
                                      

In [12]:
# Fit the longitude ensemble on the full (filtered) training feature matrix.
# The fitted estimator is the cell's last expression, so its repr is displayed.
Vr_lng.fit(X=df_feat, y=y_full.lng)

VotingRegressor(estimators=[('Et',
                             RandomForestRegressor(criterion='mae',
                                                   max_depth=10,
                                                   n_estimators=25)),
                            ('Rf',
                             GradientBoostingRegressor(learning_rate=0.2,
                                                       max_depth=4,
                                                       subsample=0.8)),
                            ('Gb',
                             ExtraTreeRegressor(criterion='friedman_mse',
                                                max_depth=8, splitter='best')),
                            ('Xg',
                             XGBRegressor(base_score=None, booster='gbtree',
                                          colsample_bylevel=None,
                                          colsample_byno...
                                          learning_rate=0.2,
                         

#### Construction de la matrice test

In [13]:
# Display the TRAINING feature matrix (built earlier from the filtered training
# messages) as a visual reference for the test matrix constructed in the next cells.
df_feat

Unnamed: 0_level_0,879,911,921,944,980,1012,1086,1092,1120,1131,...,bs_L_did_min,bs_L_did_max,mean_x,min_x,max_x,mean_y,min_y,max_y,BCW_lat,BCW_lng
messid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
573bf1d9864fce1a9af8c5c9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-104.954917,-104.952721,39.647522,39.617794,39.677251,-104.953819,-104.954917,-104.952721,39.636152,-104.954239
573bf3533e952e19126b256a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.008827,-105.008827,39.612745,39.612745,39.612745,-105.008827,-105.008827,-105.008827,39.612745,-105.008827
573c0cd0f0fe6e735a699b93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.073460,-104.956216,39.751055,39.723151,39.797969,-105.001109,-105.073460,-104.956216,39.731998,-104.973541
573c1272f0fe6e735a6cb8bd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.053109,-105.008827,39.616885,39.495225,39.704887,-105.030503,-105.053109,-105.008827,39.664931,-105.028516
573c8ea8864fce1a9a5fbf7a,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-105.165355,-104.891717,39.778865,39.612745,39.973995,-105.033121,-105.165355,-104.891717,39.759319,-105.041000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848551912f14360d786ede6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.012547,-104.976127,39.760135,39.734643,39.777690,-105.000386,-105.012547,-104.976127,39.760168,-105.002909
58485a25e541cd0e1329b8d6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.099323,-104.952721,39.678859,39.612745,39.706436,-105.024327,-105.099323,-104.952721,39.653966,-105.013210
58485bd412f14360d78bebdb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.062479,-105.001415,39.775471,39.759396,39.793585,-105.027557,-105.062479,-105.001415,39.786737,-105.021962
5848672e12f14360d7942374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.025753,-105.001415,39.747650,39.704887,39.777690,-105.011706,-105.025753,-105.001415,39.741485,-105.014010


In [14]:
# Re-apply the base-station correction to the raw test messages. X_test was
# already computed from the same input in the "Correction des bases" cell, so
# this call repeats that work (and its printed log).
# NOTE(review): looks redundant — confirm Correct_Bases has no side effect that
# makes the second call necessary before removing it.
X_test = Correct_Bases(df_mess_test)
# Build the test feature matrix with the same base-station list as the training
# matrix; the second return value is later written to the `messid` output column.
df_feat_test, id_list_test  = feat_mat_const(X_test, listOfBs)

Nous avons 23 bases outliers
Correction manuelle de la bsid 9949
il reste 0 base avec lat >60


In [15]:
# Display the test feature matrix. feat_mat_const already returns a DataFrame
# (its training result is displayed as one and exposes `.shape`), so wrapping it
# in pd.DataFrame(...) only built a redundant second frame object.
df_feat_test

Unnamed: 0_level_0,879,911,921,944,980,1012,1086,1092,1120,1131,...,bs_L_did_min,bs_L_did_max,mean_x,min_x,max_x,mean_y,min_y,max_y,BCW_lat,BCW_lng
messid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
573be2503e952e191262c351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.163032,-105.163032,39.728651,39.728651,39.728651,-105.163032,-105.163032,-105.163032,39.728651,-105.163032
573c05f83e952e1912758013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.165355,-105.053676,39.763474,39.728651,39.783211,-105.094321,-105.165355,-105.053676,39.782587,-105.088273
573c0796f0fe6e735a66deb3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.044371,-105.008827,39.666322,39.612745,39.704887,-105.028319,-105.044371,-105.008827,39.660943,-105.040357
573c08d2864fce1a9a0563bc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.072701,-105.072701,39.782113,39.782113,39.782113,-105.072701,-105.072701,-105.072701,39.782113,-105.072701
573c08ff864fce1a9a0579b0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.044371,-105.008827,39.666322,39.612745,39.704887,-105.028319,-105.044371,-105.008827,39.660387,-105.039809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58484ca812f14360d7808eb0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.005623,-104.941022,39.751030,39.732045,39.761988,-104.974106,-105.005623,-104.941022,39.745129,-104.977991
58484cb6e541cd0e131f862c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.077530,-104.939772,39.779460,39.666188,39.853418,-105.029898,-105.077530,-104.939772,39.782637,-105.044490
58484dc9cf554f422f94665b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.077530,-105.008827,39.656647,39.612745,39.692976,-105.043603,-105.077530,-105.008827,39.662458,-105.047702
58485da512f14360d78d5378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-105.077530,-105.043685,39.660435,39.654682,39.666188,-105.060608,-105.077530,-105.043685,39.655211,-105.045243


In [16]:
# Shape of the TRAINING feature matrix (rows, feature columns).
# NOTE(review): this repeats the shape already shown when df_feat was built;
# presumably the intent was to compare it with df_feat_test — confirm.
df_feat.shape

(4122, 273)

#### Construction des prédictions

In [17]:
# Predict the latitude of every test message with the fitted latitude ensemble.
latitude_predictions = Vr_lat.predict(X=df_feat_test)

In [18]:
# Predict the longitude of every test message with the fitted longitude ensemble.
longitude_predictions = Vr_lng.predict(X=df_feat_test)

In [19]:
# Quick visual check of the predicted latitudes (NumPy abbreviates the array).
latitude_predictions

array([39.70949963, 39.77557711, 39.68775989, ..., 39.68068915,
       39.67262791, 39.67406199])

In [20]:
# Quick visual check of the predicted longitudes (NumPy abbreviates the array).
longitude_predictions

array([-105.0615389 , -105.08073586, -105.01052474, ..., -105.02408113,
       -105.015891  , -105.01777576])

#### Export au format CSV

In [21]:
# Assemble the output table in the column order lat, lng, messid.
# `.assign` adds the remaining columns with the same semantics as item
# assignment, so the result matches a column-by-column build.
predictions = (
    pd.DataFrame({'lat': latitude_predictions})
    .assign(lng=longitude_predictions, messid=id_list_test)
)

# Preview the first rows before exporting.
predictions.head()

Unnamed: 0,lat,lng,messid
0,39.7095,-105.061539,573be2503e952e191262c351
1,39.775577,-105.080736,573c05f83e952e1912758013
2,39.68776,-105.010525,573c0796f0fe6e735a66deb3
3,39.781912,-105.074035,573c08d2864fce1a9a0563bc
4,39.687852,-105.00756,573c08ff864fce1a9a0579b0


In [22]:
# Write the predictions to the working directory; index=False keeps the
# DataFrame's row index out of the file so only lat, lng and messid are written.
predictions.to_csv('test_results.csv', index=False)