# QoS Prediction Challenge

## 0. Préalable

### Importation des modules

In [24]:
# utils (standard library)
import random
from datetime import datetime as dt
from pathlib import Path

# numerical calculation
import numpy as np
# dataframe
import pandas as pd
# vizualisation
import matplotlib.pyplot as plt
import seaborn as sns
# models
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# splitting
from sklearn.model_selection import train_test_split
# encoding
from sklearn.preprocessing import OrdinalEncoder
# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
# metrics
from sklearn.metrics import mean_squared_error
# saving model
import joblib

In [11]:
# Seed every random number generator used here so runs are reproducible
SEED = 12
np.random.seed(SEED)  # NumPy's global generator
random.seed(SEED)     # Python's built-in generator

### Chargement du jeu de données

In [12]:
# Data directory; the trailing slash matters because file names are appended directly
DATA_PATH = 'data/'
train = pd.read_csv(f'{DATA_PATH}Train.csv')  # training set
test = pd.read_csv(f'{DATA_PATH}Test.csv')  # test set
sample_submission = pd.read_csv(f'{DATA_PATH}SampleSubmission.csv')  # sample submission file

In [13]:
# Preview the first three rows of the training set
train.head(n=3)

Unnamed: 0,id,timestamp,device,PCell_RSRP_max,PCell_RSRQ_max,PCell_RSSI_max,PCell_SNR_max,PCell_Downlink_Num_RBs,PCell_Downlink_Average_MCS,PCell_Downlink_bandwidth_MHz,...,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility,Traffic Jam Factor,area,target
0,Id_qt2qn56050,1624367008,pc1,-84.49875,-14.586875,-49.855625,16.457,35076.0,23.0,20.0,...,13.86,0.62,1013.6,2.39,0.95,4.0,16.093,0.38979,Residential,51500000.0
1,Id_f1zf07cwb6,1624372465,pc3,-86.818125,-11.9825,-54.838125,16.674,42704.0,24.0,20.0,...,14.52,0.59,1013.5,2.29,0.82,3.0,16.093,3.78322,Residential,50300000.0
2,Id_uoxdz7di5b,1624371871,pc3,-101.91625,-13.2575,-71.24125,9.392,36017.0,22.0,20.0,...,14.71,0.61,1013.8,2.13,0.82,3.0,16.093,2.08425,Residential,16500000.0


Variables identifiant et objectif

In [7]:
# Names of the identifier column and of the column to predict
ID, TARGET = 'id', 'target'

## 1. Analyse exploratoire

### 1.1. Analyse de la forme

La variable cible (à prédire) de notre jeu de données est la colonne **target**.

Dimension du dataset

In [14]:
# Print the (rows, columns) shape of the train and test sets
for label, frame in (('Dimension du train set :', train),
                     ('Dimension du test set :', test)):
    print(label, frame.shape)

Dimension du train set : (34274, 41)
Dimension du test set : (18243, 40)


Types des variables

In [15]:
# Summarise the training set: column names, non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34274 entries, 0 to 34273
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            34274 non-null  object 
 1   timestamp                     34274 non-null  int64  
 2   device                        34274 non-null  object 
 3   PCell_RSRP_max                34274 non-null  float64
 4   PCell_RSRQ_max                34274 non-null  float64
 5   PCell_RSSI_max                34274 non-null  float64
 6   PCell_SNR_max                 34274 non-null  float64
 7   PCell_Downlink_Num_RBs        34274 non-null  float64
 8   PCell_Downlink_Average_MCS    34274 non-null  float64
 9   PCell_Downlink_bandwidth_MHz  33207 non-null  float64
 10  PCell_Cell_Identity           33207 non-null  float64
 11  PCell_freq_MHz                34274 non-null  float64
 12  SCell_RSRP_max                19012 non-null  float64
 13  S

Analyse des variables qualitatives

In [16]:
# Summary (count / unique / top / freq) of the object-typed columns only
train.describe(include=['object'])

Unnamed: 0,id,device,area
count,34274,34274,34274
unique,34274,2,5
top,Id_qt2qn56050,pc1,Park
freq,1,19505,13438


In [17]:
# Print the frequency table of each column in turn.
# NOTE(review): this walks over every column, including the all-unique `id`
# and the continuous measurements, which makes the output very long.
for colonne, valeurs in train.items():
    print(colonne)
    print(valeurs.value_counts(), '\n')

id
Id_qt2qn56050    1
Id_ojvi2rdfpw    1
Id_edb0fu4jap    1
Id_9ftaq07qu0    1
Id_65drpv9yul    1
                ..
Id_ge28z7rjds    1
Id_29npmuawod    1
Id_cgkdhn72cf    1
Id_ykdq0weq3r    1
Id_6ovc5xvg0o    1
Name: id, Length: 34274, dtype: int64 

timestamp
1624552037    2
1624548733    2
1624552672    2
1624348630    2
1624552476    2
             ..
1624356799    1
1624545345    1
1624348818    1
1624373680    1
1624376571    1
Name: timestamp, Length: 27373, dtype: int64 

device
pc1    19505
pc3    14769
Name: device, dtype: int64 

PCell_RSRP_max
-85.495000    7
-86.025000    7
-94.078125    6
-88.596875    6
-85.829375    5
             ..
-95.941250    1
-78.746875    1
-92.836875    1
-84.349375    1
-97.323125    1
Name: PCell_RSRP_max, Length: 27457, dtype: int64 

PCell_RSRQ_max
-12.220625    17
-12.361250    15
-12.253125    14
-12.194375    13
-12.323750    13
              ..
-17.967500     1
-18.084478     1
-12.055707     1
-18.013125     1
-17.122500     1
Name: PC

Analyse des variables quantitatives

In [18]:
# Descriptive statistics of the numeric columns only
train.describe(include=['number'])

Unnamed: 0,timestamp,PCell_RSRP_max,PCell_RSRQ_max,PCell_RSSI_max,PCell_SNR_max,PCell_Downlink_Num_RBs,PCell_Downlink_Average_MCS,PCell_Downlink_bandwidth_MHz,PCell_Cell_Identity,PCell_freq_MHz,...,apparentTemperature,dewPoint,humidity,pressure,windSpeed,cloudCover,uvIndex,visibility,Traffic Jam Factor,target
count,34274.0,34274.0,34274.0,34274.0,34274.0,34274.0,34274.0,33207.0,33207.0,34274.0,...,34274.0,34274.0,34274.0,34274.0,34274.0,34274.0,34274.0,34274.0,33946.0,34274.0
mean,1624447000.0,-87.085767,-13.061258,-54.435712,11.861843,64089.078835,19.973858,19.233746,18988010.0,1926.737469,...,20.076148,13.952753,0.682186,1015.18641,2.613565,0.938075,2.564072,16.093,3.328803,55503860.0
std,91763.44,11.719533,2.463729,10.765029,7.414607,22991.689616,5.054947,1.898843,13818580.0,264.412295,...,1.95112,0.462473,0.063657,2.377349,0.756239,0.078149,1.023344,3.552766e-15,2.202747,43193850.0
min,1624348000.0,-175.605625,-24.774375,-87.8525,-15.603,4.0,0.0,5.0,2567188.0,900.0,...,18.13,13.3,0.55,1011.9,1.96,0.76,1.0,16.093,0.0,48000.0
25%,1624367000.0,-95.201562,-14.79,-61.961719,6.43125,43724.0,17.0,20.0,3282957.0,1800.0,...,18.67,13.61,0.62,1013.5,2.16,0.94,2.0,16.093,2.17118,24000000.0
50%,1624377000.0,-86.845313,-12.681875,-53.9925,12.171,62785.0,21.0,20.0,26358780.0,1800.0,...,18.88,13.8,0.71,1013.8,2.29,0.97,3.0,16.093,3.00946,42400000.0
75%,1624548000.0,-79.246719,-11.378252,-46.845156,17.579,87828.0,24.0,20.0,26947070.0,2000.0,...,21.61,14.11,0.73,1017.7,2.43,0.99,3.0,16.093,3.45451,72000000.0
max,1624554000.0,-55.550625,-5.389375,-13.0325,28.622,100471.0,28.0,20.0,51842580.0,2600.0,...,23.88,14.92,0.77,1018.0,4.25,1.0,4.0,16.093,9.60259,271000000.0


Analyse des données manquantes

In [20]:
# Missing values per column, from the most complete column to the least
train.isna().sum().sort_values()

id                                  0
Latitude                            0
Longitude                           0
speed_kmh                           0
COG                                 0
precipIntensity                     0
precipProbability                   0
temperature                         0
apparentTemperature                 0
dewPoint                            0
humidity                            0
pressure                            0
windSpeed                           0
cloudCover                          0
uvIndex                             0
visibility                          0
operator                            0
area                                0
target                              0
timestamp                           0
device                              0
PCell_RSRP_max                      0
PCell_RSRQ_max                      0
PCell_RSSI_max                      0
PCell_freq_MHz                      0
PCell_SNR_max                       0
PCell_Downli

## Modelling

In [None]:
# Keyword arguments forwarded as-is to the CatBoost regressor
params = dict(
    n_estimators=3000,
    learning_rate=0.1,
    objective='RMSE',
    rsm=0.7,
    random_seed=SEED,  # reuse the notebook-wide seed
    early_stopping_rounds=200,
    use_best_model=True,
)

# Build the (not yet fitted) model from those settings
CB = CatBoostRegressor(**params)

In [25]:
def rmse(y_test, y_pred) -> float:
    '''Return the root mean squared error between targets and predictions.

    The square root is taken explicitly rather than through the
    ``squared=False`` keyword of ``mean_squared_error``: that keyword is
    deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
    ``TypeError``.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same shape as ``y_test``.

    Returns
    -------
    float
        The RMSE. For multi-output targets the RMSE of each output is
        computed first and the results are averaged uniformly, which is what
        ``squared=False`` used to return.
    '''
    # One MSE per output column (a length-1 array for the usual 1-D target)
    per_output_mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))

In [26]:
# Instantiating the scalers
MMS = MinMaxScaler()
SS = StandardScaler()
MAS = MaxAbsScaler()
RS = RobustScaler()

In [27]:
def preprocessing_data(data: pd.DataFrame, scaler=None, fit: bool = True) -> pd.DataFrame:
    '''Return a copy of ``data`` whose feature columns have been rescaled.

    The identifier (``ID``), ``'timestamp'`` and target (``TARGET``) columns
    are left untouched when present. Every other column is passed through the
    scaler, so those columns must already be numeric: no encoding is done here.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset to rescale. It is never modified in place.
    scaler : scikit-learn style transformer, optional
        Object exposing ``fit_transform`` / ``transform``. Defaults to the
        module-level ``MMS`` instance.
    fit : bool, default True
        When True the scaler is fitted on ``data`` before transforming it
        (the historical behaviour). Pass False to reuse a scaler that was
        already fitted on another frame, e.g. to scale the test set with the
        statistics learnt on the training set.

    Returns
    -------
    pd.DataFrame
        A rescaled copy of ``data`` with the same columns and index.
    '''
    if scaler is None:
        scaler = MMS

    processed = data.copy()

    # Select feature columns explicitly instead of relying on a bare `except`
    # to detect a missing target column (the test set has none).
    untouched = {ID, 'timestamp', TARGET}
    feature_cols = [col for col in processed.columns if col not in untouched]
    if not feature_cols:
        # Nothing to rescale; a scaler would reject an empty feature matrix
        return processed

    features = processed[feature_cols]
    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)
    processed[feature_cols] = scaled
    return processed


In [None]:
# Rescale the engineered train and test sets.
# NOTE(review): preprocessing_data fits its scaler on whatever frame it
# receives, so the test set is scaled with its own statistics rather than
# with those of the training set.
train_eng_pp = preprocessing_data(train_eng)
test_eng_pp = preprocessing_data(test_eng)

# Target vector, and feature matrix without identifier, 'location' and target
y = train_eng_pp[TARGET]
X = train_eng_pp.drop(columns=[ID, 'location', TARGET])

# Keep 30 % of the rows aside as a hold-out split (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

In [None]:
# Fit on the training split, passing the hold-out split as evaluation set.
# NOTE(review): the hold-out split is used both as `eval_set` during fitting
# and for the score printed below, so that score may be optimistic; confirm
# whether a separate validation split is wanted.
model = CB
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=100,
)

# Score the fitted model on the hold-out split
y_pred = model.predict(X_test)
print(f'RMSE Score: {rmse(y_test, y_pred)}')

0:	learn: 47908.4863770	test: 48335.9537961	best: 48335.9537961 (0)	total: 60.9ms	remaining: 5m 4s
100:	learn: 8861.7593231	test: 10273.0666825	best: 10273.0666825 (100)	total: 5.5s	remaining: 4m 26s
200:	learn: 5890.4995065	test: 7856.8710896	best: 7856.8710896 (200)	total: 11.1s	remaining: 4m 24s
300:	learn: 4852.7218190	test: 7265.9113872	best: 7265.9113872 (300)	total: 16.7s	remaining: 4m 20s
400:	learn: 4250.8162201	test: 7007.9337184	best: 7007.9337184 (400)	total: 24s	remaining: 4m 35s
500:	learn: 3772.7038820	test: 6756.5536708	best: 6756.5536708 (500)	total: 30.7s	remaining: 4m 35s
600:	learn: 3380.2989949	test: 6590.3603336	best: 6589.2489042 (599)	total: 37.7s	remaining: 4m 35s
700:	learn: 3120.4899850	test: 6508.1155713	best: 6508.1155713 (700)	total: 44.4s	remaining: 4m 32s
800:	learn: 2930.0946930	test: 6470.1397305	best: 6470.1397305 (800)	total: 51.1s	remaining: 4m 27s
900:	learn: 2717.5498204	test: 6422.2930136	best: 6421.5138080 (891)	total: 58.1s	remaining: 4m 24s
10

Analyse des prédictions

In [None]:
# Gather hold-out features, true target and prediction in a single frame
pred_errors: pd.DataFrame = X_test.copy()
pred_errors[TARGET] = y_test
pred_errors['prediction'] = y_pred
# Absolute error of each prediction
pred_errors['error'] = (pred_errors['prediction'] - pred_errors[TARGET]).abs()

# Keep a few context columns and rank rows from worst to best prediction.
# The sort is chained (no `inplace=True`) because sorting a column-selected
# frame in place can trigger pandas' SettingWithCopyWarning.
# NOTE(review): the raw train set shown above lists 'Latitude'/'Longitude'
# and no 'year'/'week_no' -- confirm these columns exist in X_test.
pred_errors = (
    pred_errors[['latitude', 'longitude', 'year', 'week_no',
                 TARGET, 'prediction', 'error']]
    .sort_values(by='error', ascending=False)
)
pred_errors

Unnamed: 0,latitude,longitude,year,week_no,emission,prediction,error
20319,0.784314,0.821782,0.000000,0.470588,1.181701e+06,908764.023182,272937.176818
20304,0.784314,0.821782,0.000000,0.176471,1.038997e+06,789738.120771,249258.939229
20345,0.784314,0.821782,0.000000,0.980392,1.002098e+06,781394.001911,220704.248089
20303,0.784314,0.821782,0.000000,0.156863,1.030984e+06,839834.807231,191148.942769
20410,0.784314,0.821782,0.666667,0.215686,9.417426e+05,763080.582675,178661.977325
...,...,...,...,...,...,...,...
37479,0.568627,0.039604,1.000000,0.254902,3.605458e+00,3.502072,0.103387
60550,0.392157,0.940594,0.333333,0.450980,1.958257e+02,195.742193,0.083507
72315,0.019608,0.435644,0.666667,1.000000,4.119779e+03,4119.855760,0.076460
42184,0.539216,0.039604,1.000000,0.058824,1.111047e+01,11.178302,0.067835


## Submission

In [None]:
# Make prediction on the test set
test_df = test_selected
predictions = model_selected.predict(test_df)

# Create a submission file
sub_file = pd.DataFrame({ID: test_eng[ID], TARGET: predictions})
sub_file.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,emission
0,ID_-23.53_27.47_2019_1,152.972145
1,ID_-23.53_27.47_2019_2,152.972145
2,ID_-23.53_27.47_2019_3,152.972145
3,ID_-23.53_27.47_2019_4,152.972145
4,ID_-23.53_27.47_2019_5,152.972145


In [None]:
# Create file
today = dt.now().strftime(format="%Y-%m-%d_%Hh%M")
sub_file.to_csv(f'submissions/submission_{today}.csv', index=False)

## Saving model

In [None]:
# Save the model as a pickle in a file
filename = 'catboost_private_9460.077124'
joblib.dump(model_selected, f'models/{filename}.pkl')

['models/catboost_private_9460.077124.pkl']