In [98]:
from os import path

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler

In [99]:
df = pd.read_csv(path.join("..","data/raw","paris_threes_raw.csv"), sep=";")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200137 entries, 0 to 200136
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   200137 non-null  int64  
 1   type_emplacement     200137 non-null  object 
 2   domanialite          200136 non-null  object 
 3   arrondissement       200137 non-null  object 
 4   complement_addresse  30902 non-null   object 
 5   numero               0 non-null       float64
 6   lieu                 200137 non-null  object 
 7   id_emplacement       200137 non-null  object 
 8   libelle_francais     198640 non-null  object 
 9   genre                200121 non-null  object 
 10  espece               198385 non-null  object 
 11  variete              36777 non-null   object 
 12  circonference_cm     200137 non-null  int64  
 13  hauteur_m            200137 non-null  int64  
 14  stade_developpement  132932 non-null  object 
 15  remarquable      

In [100]:
dropped_cols = [col for col in df.columns if col not in ["circonference_cm", "hauteur_m", "remarquable"]]
df.dropna(subset=["remarquable"], inplace=True)
X, y = df.drop("remarquable", axis=1), df["remarquable"]
X.drop(dropped_cols, axis=1, inplace=True)

In [112]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state=42)
log_reg = LogisticRegression()
uder_sample = RandomUnderSampler()
X_train, y_train = uder_sample.fit_resample(X_train, y_train)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(X[y==1].count())
print(X[y==0].count())
print(X[y==1].count() / X[y==0].count())

circonference_cm    184
hauteur_m           184
dtype: int64
circonference_cm    136855
hauteur_m           136855
dtype: int64
circonference_cm    0.001344
hauteur_m           0.001344
dtype: float64


In [102]:
def print_scores(y_true, y_pred):
    acc_score = accuracy_score(y_true, y_pred)
    print("accuracy: ",acc_score)
    pre_score = precision_score(y_true, y_pred)
    print("precision: ",pre_score)
    rec_score = recall_score(y_true, y_pred)
    print("recall: ",rec_score)
    f_score = f1_score(y_true, y_pred)
    print("f1_score: ",f_score)


print_scores(y_test, y_pred)


accuracy:  0.8659272231951741
precision:  0.006324539212143115
recall:  0.7291666666666666
f1_score:  0.012540308133285561


# It does not work ... why ?

The prediction of remarquable three can't work because first of all because of the dataset size.  
it is a 13k instances dataset, with only 138 remarquable three. If we under sample it or over sample it, it will over predict threes (because we don't have a 99,9% correlation with our cols).  
We will assume that the 138 threes are the chosen one by the 200k instances of the datasets, and all the unlabelized are not remarquable.