In [6]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder



In [7]:
# Load the raw dataset into a DataFrame.
# The CSV location can be overridden through the DATA_CSV_PATH environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original local path is used, so existing runs behave as before.
DATA_CSV_PATH = os.environ.get(
    "DATA_CSV_PATH",
    r"C:\Users\jrala\Documents\jedha\Flask\linear_classification_with_flask\src\Data.csv",
)

print("Loading dataset...")
dataset = pd.read_csv(DATA_CSV_PATH)
print("...Done.")
print()

Loading dataset...
...Done.



In [8]:
# Summary statistics over every column (numeric and non-numeric alike),
# followed by the (rows, columns) shape of the frame.
data_desc = dataset.describe(include="all")
print(data_desc, dataset.shape, sep="\n")

       Country        Age        Salary Purchased
count       10   9.000000     10.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  64300.000000       NaN
std        NaN   7.693793  11681.419244       NaN
min        NaN  27.000000  48000.000000       NaN
25%        NaN  35.000000  55000.000000       NaN
50%        NaN  38.000000  64000.000000       NaN
75%        NaN  44.000000  71250.000000       NaN
max        NaN  50.000000  83000.000000       NaN
(10, 4)


- Variable cible : Y = 'Purchased'
- Variables explicatives : 'Country', 'Age', 'Salary'
- Nombre d'exemples : 10

- Imputation de valeurs manquantes : 'Age'
- Normalisation : 'Age', 'Salary'
- Encodage des variables catégorielles : 'Country'
- Encodage des labels : 'Purchased'

In [12]:
# Split the frame into the explanatory columns (X) and the column to predict (Y)
print("Separating labels from features...")
target_variable = "Purchased"
features_list = ["Country", "Age", "Salary"]

Y = dataset[target_variable]
X = dataset[features_list]

print("...Done.", end="\n\n")

# Preview the first rows of the target, a blank line, then the first rows of the features
print('Y : ', Y.head(), "", sep="\n")
print('X :', X.head(), sep="\n")

Separating labels from features...
...Done.

Y : 
0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object

X :
   Country   Age  Salary
0   France  44.0   72000
1    Spain  27.0   48000
2  Germany  30.0   54000
3    Spain  38.0   61000
4  Germany  40.0   69000


In [13]:
# Sort every feature column of X into a numeric or a categorical group based on
# its dtype, recording both the column name and its positional index in X.
# The positional indices are what the later array-based steps select columns with.
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []

# Series.iteritems() was removed in pandas 2.0; Series.items() is its replacement.
for position, (column, dtype) in enumerate(X.dtypes.items()):
    # Use the pandas dtype API rather than substring matching on the dtype name:
    # the name test misses nullable "Int64"/"Float64" and wrongly matches
    # "interval[int64]". Booleans are kept out of the numeric group, which is
    # where the original name-based test left them as well.
    is_numeric = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
    if is_numeric:
        numeric_features.append(column)
        numeric_indices.append(position)
    else:
        categorical_features.append(column)
        categorical_indices.append(position)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['Age', 'Salary']  at positions  [1, 2]
Found categorical features  ['Country']  at positions  [0]


In [14]:
# Hold out 20% of the rows as a test set; the split is stratified on Y and
# seeded so that it is reproducible.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)
print("...Done.", end="\n\n")

Dividing into train and test sets...
...Done.



In [15]:
# Drop the pandas wrappers: the following cells select columns by position on
# plain numpy arrays and pass plain lists of labels to scikit-learn.
print("Convert pandas DataFrames to numpy arrays...")
X_train, X_test = X_train.values, X_test.values
Y_train, Y_test = Y_train.tolist(), Y_test.tolist()
print("...Done")

# Preview the first rows of each feature array, then the matching labels
print(X_train[:5])
print(X_test[:2])
print()
print(Y_train[:5])
print(Y_test[:2])

Convert pandas DataFrames to numpy arrays...
...Done
[['France' 44.0 72000]
 ['Germany' 40.0 69000]
 ['Spain' nan 52000]
 ['France' 37.0 67000]
 ['Spain' 38.0 61000]]
[['Germany' 50.0 83000]
 ['France' 48.0 79000]]

['No', 'Yes', 'No', 'Yes', 'No']
['No', 'Yes']


## Training pipeline

In [16]:
# Replace missing numeric values with the column mean computed on the training rows
print("Imputing missing values...")
print(X_train[:5], end="\n\n")

imputer = SimpleImputer(strategy="mean")
train_numeric = X_train[:, numeric_indices]
# Fit on the numeric columns only, then write the imputed values back in place
X_train[:, numeric_indices] = imputer.fit_transform(train_numeric)

print("...Fini!")
print(X_train[:5], end="\n\n")

Imputing missing values...
[['France' 44.0 72000]
 ['Germany' 40.0 69000]
 ['Spain' nan 52000]
 ['France' 37.0 67000]
 ['Spain' 38.0 61000]]

...Fini!
[['France' 44.0 72000.0]
 ['Germany' 40.0 69000.0]
 ['Spain' 35.857142857142854 52000.0]
 ['France' 37.0 67000.0]
 ['Spain' 38.0 61000.0]]



In [17]:
# Build and fit the feature preprocessor on the training rows
print("Encoding categorical features and standardizing numerical features...", end="\n\n")
print(X_train[:5])

# One-hot encoding for categorical columns (first level dropped)
categorical_transformer = OneHotEncoder(drop='first')
# Standardization for numeric columns
numeric_transformer = StandardScaler()

# Order matters: the encoded categorical columns come first in the output,
# followed by the scaled numeric columns.
column_steps = [
    ('cat', categorical_transformer, categorical_indices),
    ('num', numeric_transformer, numeric_indices),
]
featureencoder = ColumnTransformer(transformers=column_steps)

X_train = featureencoder.fit_transform(X_train)
print("...Done")
print(X_train[:5, :])

Encoding categorical features and standardizing numerical features...

[['France' 44.0 72000.0]
 ['Germany' 40.0 69000.0]
 ['Spain' 35.857142857142854 52000.0]
 ['France' 37.0 67000.0]
 ['Spain' 38.0 61000.0]]
...Done
[[ 0.00000000e+00  0.00000000e+00  1.61706195e+00  1.46885753e+00]
 [ 1.00000000e+00  0.00000000e+00  8.22715727e-01  1.09777773e+00]
 [ 0.00000000e+00  1.00000000e+00 -1.41104234e-15 -1.00500778e+00]
 [ 0.00000000e+00  0.00000000e+00  2.26956063e-01  8.50391200e-01]
 [ 0.00000000e+00  1.00000000e+00  4.25542617e-01  1.08231607e-01]]


In [18]:
# Turn the string class labels of the training set into integer codes
print("Encoding labels...")
print(Y_train, end="\n\n")

labelencoder = LabelEncoder()
Y_train = labelencoder.fit_transform(Y_train)

print("...Done.")
print(Y_train, end="\n\n")

Encoding labels...
['No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes']

...Done.
[0 1 0 1 0 1 0 1]



In [19]:
# Fit a logistic regression (default settings) on the preprocessed training data
print("Train model...")
# fit() returns the estimator itself, so construction and fitting can be chained
classifier = LogisticRegression().fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.




In [20]:
# Predicted classes for the rows the model was trained on
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred, end="\n\n")

Predictions on training set...
...Done.
[0 0 0 1 0 1 1 1]



## Test pipeline

In [21]:
# Fill missing numeric values in the test rows using the imputer already fitted
# on the training set (transform only, no refit).
print("Imputing missing values...")
print(X_test[:5], end="\n\n")

test_numeric = X_test[:, numeric_indices]
X_test[:, numeric_indices] = imputer.transform(test_numeric)
print("...Done.")
print(X_test[:5], end="\n\n")

Imputing missing values...
[['Germany' 50.0 83000]
 ['France' 48.0 79000]]

...Done.
[['Germany' 50.0 83000.0]
 ['France' 48.0 79000.0]]



In [22]:
# Apply the already-fitted feature preprocessor to the test rows (transform only)
print("Encoding categorical features and standardizing numerical features...", end="\n\n")
print(X_test[:5])

X_test = featureencoder.transform(X_test)
print("...Done")
print(X_test[:5, :])

Encoding categorical features and standardizing numerical features...

[['Germany' 50.0 83000.0]
 ['France' 48.0 79000.0]]
...Done
[[1.         0.         2.80858127 2.82948345]
 [0.         0.         2.41140816 2.33471038]]


In [23]:
# Encode the test labels with the encoder fitted on the training labels
print("Encoding labels...")
print(Y_test, end="\n\n")

Y_test = labelencoder.transform(Y_test)
print("...Done.")
print(Y_test, end="\n\n")

Encoding labels...
['No', 'Yes']

...Done.
[0 1]



In [24]:
# Predicted classes for the held-out test rows
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred, end="\n\n")

Predictions on test set...
...Done.
[0 0]



In [26]:
# Persist every fitted pipeline stage (imputer, feature encoder, label encoder,
# classifier) to the working directory so they can be reloaded elsewhere.
# joblib is imported as a standalone package in the imports cell: the
# sklearn.externals.joblib alias was deprecated in scikit-learn 0.21 and
# removed in 0.23, so the old import fails on current versions.
joblib.dump(imputer, "imputer.pkl")
joblib.dump(featureencoder, "featureencoder.pkl")
joblib.dump(labelencoder, "labelencoder.pkl")
# Last expression of the cell: its return value (the list of written files) is displayed
joblib.dump(classifier, "classification_model.pkl")

['classification_model.pkl']

In [27]:
# Writes the fitted imputer to imputer.pkl again (the combined persistence cell
# already saves it); the returned list of file names is the cell's output.
joblib.dump(value=imputer, filename="imputer.pkl")

['imputer.pkl']

In [28]:
# Writes the fitted feature preprocessor to featureencoder.pkl again (the combined
# persistence cell already saves it); the returned list of file names is the cell's output.
joblib.dump(value=featureencoder, filename="featureencoder.pkl")

['featureencoder.pkl']

In [29]:
# Writes the fitted label encoder to labelencoder.pkl again (the combined
# persistence cell already saves it); the returned list of file names is the cell's output.
joblib.dump(value=labelencoder, filename="labelencoder.pkl")

['labelencoder.pkl']