In [1]:
import gc
import os
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
##############################################
%matplotlib inline

In [2]:
DATA_PATH = "/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/"

In [3]:
for dirname, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/ID_Data_train.csv
/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/.DS_Store
/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/testkagglecompetition.zip
/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/sample_sub.csv
/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/ID_Data_test.csv
/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/01 - Data /Challenge Data/ID_Time_train.csv


In [4]:
ID_Data_train = pd.read_csv(DATA_PATH+"ID_Data_train.csv")
ID_Data_test = pd.read_csv(DATA_PATH+"ID_Data_test.csv")
ID_Time_train = pd.read_csv(DATA_PATH+"ID_time_train.csv")
sample_submission = pd.read_csv(DATA_PATH+"sample_sub.csv")

In [5]:
print(ID_Data_train.shape)
print(ID_Data_train.columns)
print(ID_Data_train['id'].nunique())
ID_Data_train.sample(3)

(310610, 7)
Index(['id_race', 'id', 'cap', 'latitude', 'longitude', 'nb_satellite',
       'direction_vent'],
      dtype='object')
249


Unnamed: 0,id_race,id,cap,latitude,longitude,nb_satellite,direction_vent
193599,R3,b20a755ee867c8defad8b4d01f985493,209.0,38.267807,21.704176,9.0,247.5
118138,R13,5f74204bb62cae3e17a726be45bbb174,74.0,45.803547,10.832287,7.0,
278346,R12,fbd780998f95e83297fd14ac3ca9c462,246.0,45.7927,10.82275,7.0,


In [6]:
print(ID_Time_train.shape)
print(ID_Time_train.columns)
print(ID_Time_train['id'].nunique())
ID_Time_train.sample(3)

(249, 4)
Index(['id', 'id_race', 'temps', 'rang'], dtype='object')
249


Unnamed: 0,id,id_race,temps,rang
235,6df38ea2cdf2fd987243b55ab801cbce,R3,3873.0,16.0
178,7aa8a1636a445f8f8992752233ad2884,R12,3954.0,46.0
229,2456f6ce9944073a22257add107e4faa,R3,3636.0,2.0


## Création des features

In [7]:
def calc_diff_angle(data):
    data.loc[:,'Diff_angle'] = data.loc[:,'direction_vent'] - data.loc[:,'cap']
    return data

In [8]:
# ID_Time_trainV1.head()

## Features

In [11]:
def creer_features(input_data, time_id):
    """
    input_data : DF comprenant les courses des bateaux (séries temporelles des variables considérées)
    time_id : DF lié à input_data qui comprend l'ID, le temps, la course, et le rang 
    """
    X_model = pd.DataFrame()
    IDs = time_id['id'].values
    data = input_data.copy()
    for i in tqdm_notebook(IDs): 
        data_id = data[data['id']==i]
        data_id = calc_diff_angle(data_id)
        X_model.loc[i, 'lat_mean'] = data_id['latitude'].mean()
        X_model.loc[i, 'long_std'] = data_id['longitude'].std()
        
       
    X_model = X_model.fillna(0)
    return X_model

In [12]:
X_train = creer_features(ID_Data_train, ID_Time_train)

HBox(children=(IntProgress(value=0, max=249), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s





In [13]:
y_train = X_train.merge(ID_Time_train, left_index=True, right_on='id', how='left')['temps']
print(len(y_train), X_train.shape)


249 (249, 2)


In [14]:
X_test = creer_features(ID_Data_test, sample_submission)
print(X_test.shape)

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))


(48, 2)


# Modèles de Regression 

## Métrique d'évaluation 

In [15]:
def ecart_classement(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

def rmse(y, y_pred):
    return np.sqrt(np.mean(np.square(y - y_pred)))

def calc_rank_from_time(data_pred, id_time_train): 
    """ 
    Calcul le rang à partir d'un fichier de prédiction de temps 
    Il faut que la prédiction soit un dataframe avec en indice les id bateau hash, en colonne le temps prédit
    et une autre colonne avec la course.
    Cela permet de reconstruire le classement à partir des données prédites et des données présentes 
    dans le jeu d'entrainement
    """
    IDs = data_pred['ID'].tolist()
    id_time_all = pd.concat([data_pred, id_time_train], axis=0)
    id_time_all['Rank'] = id_time_all.groupby('id_race')['Time'].rank(ascending=True)
    data_pred_rank = id_time_all[id_time_all['ID'].isin(IDs)]
    return data_pred_rank 

def score_innovation_cup(sub_true, sub_pred):
    """
    Il faut fournir une soumission sous format dataframe avec en index les id_bateau_hash, 
    une première colonne Time, et une seconde colonne rang, calculée par l'étudiant
    Il est conseillé d'inclure la course en colonne également mais ce n'est pas obligatoire
    """
    true_time, true_rank = sub_true['temps'], sub_true['rang']
    pred_time, pred_rank = sub_pred['temps'], sub_pred['rang']
    
    score = ecart_classement(true_rank, pred_rank) + (rmse(true_time, pred_time) / np.sqrt(np.mean(true_time)))
    return score



## Regression Linéaire simple

In [16]:
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)

In [17]:
submission_final= sample_submission.copy()
submission_final['temps'] = y_pred
submission_final = calc_rank_from_time(submission_final, ID_Time_train)

KeyError: 'ID'

In [None]:
submission_final.head(7)

In [None]:
submission_final.to_csv("/Users/lemeillefrancois/OneDrive - Capgemini/03 - Projet /03 - Innovation Cup/04 - Evaluation/soumission_finale.csv", index=False)