In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): %pylab injects numpy and matplotlib names into the interactive
# namespace (see the message it prints). Confirm that no cell relies on those bare
# names before narrowing this to `%matplotlib inline`.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Observed data for training a model

In [3]:
# Labeled training set; SMOTE oversampling was applied beforehand using WEKA.
data_file = "./data/restaurants_feat_label_SMOTE_v1.csv"
data = pd.read_csv(filepath_or_buffer=data_file)
# Preview the first rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,assaison,attend,brul,canap,...,sourd,tomb,traversent,ventr,vom,écrev,épic,éton,rev_cnt,inspection_note
0,3.0,5.0,2.0,?,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,3.0,C
1,4.5,4.883333,0.25,0.000278,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,1.0,C
2,4.5,4.883333,0.25,0.000278,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,5.0,C
3,3.666667,4.95,1.555556,0,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,3.0,B
4,4.0,4.180556,0.571429,0.696867,0,0,0.0,0.0,0,0,...,0,0.0,0,0,0,0,0,0,7.0,C


In [4]:
# Features
def _coerce_text_columns_to_numeric(frame):
    """Return a copy of ``frame`` with number-like text columns parsed as numbers.

    Stands in for the deprecated ``DataFrame.convert_objects(convert_numeric=True)``.
    Every non-numeric column is passed through ``pd.to_numeric(errors="coerce")``,
    so cells that cannot be parsed (such as the "?" placeholders visible in the
    loaded CSV preview) become NaN. A column in which nothing parses is left
    untouched, which keeps purely textual columns intact.
    """
    converted = frame.copy()
    for column in converted.select_dtypes(exclude=[np.number]).columns:
        as_numbers = pd.to_numeric(converted[column], errors="coerce")
        # An all-NaN result means the column is genuinely textual: keep it as is.
        if as_numbers.notnull().any():
            converted[column] = as_numbers
    return converted


data = _coerce_text_columns_to_numeric(data)
# Last expression of the cell so the shape is actually displayed.
data.shape

  app.launch_new_instance()


In [5]:
# Fill missing values
# Every NaN left in the frame is replaced by 0. The result is rebound rather than
# mutated in place, so re-running the cell never alters a frame object that another
# name may still reference.
data = data.fillna(0)

In [6]:
# Name of the target column; every other column is treated as a feature.
label = "inspection_note"
y = data[label]
X = data.drop(label, axis=1)
# Keep the feature names as a plain list so the same columns can be selected
# from other frames later on.
feature_cols = X.columns.tolist()
print(X.shape, y.shape)

(253, 63) (253,)


In [7]:
# Rows are samples, columns are features.
n_samples = len(X.index)
n_features = len(X.columns)
print("Number of samples: ", n_samples)
print("Number of features: ", n_features)

Number of samples:  253
Number of features:  63


In [8]:
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import logistic
from sklearn.cross_validation import cross_val_score

In [9]:
# Seeded NumPy random generator (not referenced by the other statements in this cell).
random_state = np.random.RandomState(seed=0)

###############################################################################
# Classification setup: cross-validation splitter and candidate models

# Stratified 3-fold splitter built directly from the labels.
# NOTE(review): this is the `sklearn.cross_validation` constructor signature
# (labels passed in, `n_folds`); confirm the pinned scikit-learn version still
# ships that module.
cv = StratifiedKFold(y, n_folds=3)
# Two tree ensembles, both seeded with 0 and limited to 30 estimators.
rf = RandomForestClassifier(n_estimators=30, max_features="sqrt", random_state=0)
gbt = GradientBoostingClassifier(n_estimators=30, random_state=0)

In [10]:
# Score the random forest with the stratified splitter `cv` built earlier.
# Passing it explicitly pins the fold scheme instead of relying on whatever
# default `cross_val_score` applies when `cv` is omitted.
scores = cross_val_score(rf, X, y, cv=cv)
print("Cross-validation scores: ", scores)
print("Mean of cross-validation scores: ", scores.mean())


Cross-validation scores:  [ 0.82352941  0.80952381  0.82142857]
Mean of cross-validation scores:  0.818160597572


In [11]:
# Unobserved data set: the rows the trained model is applied to further down.
unlabeled_file = "./data/base_DFG_note_feat_pp_loc.csv"
unlabeled = pd.read_csv(filepath_or_buffer=unlabeled_file)

# Preview the first rows.
unlabeled.head(n=5)

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,traversent,ventr,vom,écrev,épic,éton,rev_cnt,latitude,longitude,coords
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979"
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,48.874388,2.358907,"48.8743883,2.3589068"
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,48.826235,2.371878,"48.82623539999999,2.371877500000001"
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,48.81953,2.359702,"48.8195302,2.3597021"
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979"


In [12]:
# Remove NaNs
# Drop every row that has a missing value in any column, then renumber the rows
# from 0. reset_index() is called without drop=True on purpose: it stores the old
# row labels in an "index" column, which a later cell removes by name after
# concatenating the predictions. The result is rebound rather than mutated in place.
unlabeled = unlabeled.dropna(axis=0).reset_index()
# Create features DataFrame
# Select exactly the columns the model was trained on, in the same order.
X_unlabeled = unlabeled[feature_cols]
X_unlabeled.shape

(8442, 63)

In [13]:
# Fit the random forest on the complete observed (labeled) data set.
rf.fit(X=X, y=y)
# Score the unobserved rows: per-class probabilities and the hard class label.
predict_proba = rf.predict_proba(X=X_unlabeled)
predict = rf.predict(X=X_unlabeled)

In [17]:
# Append predictions to data set
# Both prediction frames take unlabeled's index explicitly, so the column-wise
# concat below lines up row for row (and fails loudly if the lengths differ).
predictions_df = pd.DataFrame(data=predict, index=unlabeled.index, columns=["predict"])
# One probability column per class returned by the model, rather than a hard-coded
# list of four names. The names stay "predict_proba_<i>"; per scikit-learn's
# predict_proba contract, column i follows the order of the classifier's `classes_`.
proba_cols = ["predict_proba_{}".format(i) for i in range(predict_proba.shape[1])]
predictions_proba_df = pd.DataFrame(data=predict_proba, index=unlabeled.index, columns=proba_cols)
data_predicted = pd.concat([unlabeled, predictions_df, predictions_proba_df], axis=1)
# "index" is the column left behind by the earlier reset_index() call; it is not
# wanted in the output.
data_predicted = data_predicted.drop("index", axis=1)

In [18]:
# Preview the first rows of the unlabeled data with the prediction columns appended.
data_predicted.head(n=5)

Unnamed: 0,Adresse,Code Postal,Resto,Ville,Note_resto,Note_hygiène_resto,Variance_note_resto,Variance_note_hygiène_resto,16,alimentair,...,éton,rev_cnt,latitude,longitude,coords,predict,predict_proba_0,predict_proba_1,predict_proba_2,predict_proba_3
0,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979",C,0.2,0.233333,0.566667,0.0
1,"128, rue du Faubourg Saint Martin",75010,0039 ristorante italiano,Paris,3.222222,4.846111,2.17284,0.020053,0.0,0.0,...,0.0,9.0,48.874388,2.358907,"48.8743883,2.3589068",C,0.0,0.466667,0.533333,0.0
2,60 rue Albert,75013,015 gang nam,Paris,4.333333,4.844444,0.222222,0.010617,0.0,0.0,...,0.0,6.0,48.826235,2.371878,"48.82623539999999,2.371877500000001",C,0.0,0.166667,0.833333,0.0
3,161 Avenue D'Italie,75013,1 pot,Paris,4.0,4.916667,0.666667,0.000278,0.0,0.0,...,0.0,6.0,48.81953,2.359702,"48.8195302,2.3597021",C,0.4,0.066667,0.533333,0.0
4,55 Boulevard Saint Marcel,75013,0 d'attente,Paris,3.9,3.67,0.29,2.2696,0.0,0.0,...,1.0,10.0,48.837656,2.355498,"48.8376561,2.3554979",C,0.2,0.233333,0.566667,0.0


In [19]:
# Save to file
# Written as a semicolon-separated CSV without the row index.
data_predicted_file = "./data/base_DFG_predicted_v1.csv"
data_predicted.to_csv(path_or_buf=data_predicted_file, sep=";", index=False)

In [24]:
# For plotting purposes only
# Export the coordinates, the predicted class and the first three probability columns.
# NOTE(review): predict_proba_3 is not part of this export; confirm that is intended.
data_predicted.loc[
    :, ["latitude", "longitude", "predict", "predict_proba_0", "predict_proba_1", "predict_proba_2"]
].to_csv(path_or_buf="./data/plot_predictions.csv", index=False)