# Model Fitting: Classification

In [16]:
%matplotlib inline

# Import Libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [17]:
df = pd.read_csv('Bike_train_test.csv')
df.head()

Unnamed: 0,online_order,order_status,list_price,standard_cost,gross_profit,margin_percentage,markup_percentage,product_class_label_1,product_class_label_2,product_class_label_3,product_size_label_1,product_size_label_2,product_size_label_3,high_profit_product
0,0,1,71.49,53.62,17.87,25.0,33.33,0,1,0,0,1,0,0
1,1,1,2091.47,388.92,1702.55,81.4,437.76,0,1,0,0,0,1,1
2,0,1,1793.43,248.82,1544.61,86.13,620.77,1,0,0,0,1,0,1
3,0,1,1198.46,381.1,817.36,68.2,214.47,0,1,0,0,1,0,1
4,1,1,1765.3,709.48,1055.82,59.81,148.82,0,1,0,0,0,1,1


In [27]:
df.head()

Unnamed: 0,online_order,order_status,list_price,standard_cost,gross_profit,margin_percentage,markup_percentage,product_class_label_1,product_class_label_2,product_class_label_3,product_size_label_1,product_size_label_2,product_size_label_3,high_profit_product
0,0,1,71.49,53.62,17.87,25.0,33.33,0,1,0,0,1,0,0
1,1,1,2091.47,388.92,1702.55,81.4,437.76,0,1,0,0,0,1,1
2,0,1,1793.43,248.82,1544.61,86.13,620.77,1,0,0,0,1,0,1
3,0,1,1198.46,381.1,817.36,68.2,214.47,0,1,0,0,1,0,1
4,1,1,1765.3,709.48,1055.82,59.81,148.82,0,1,0,0,0,1,1


In [19]:
X = df.drop(['high_profit_product'], axis=1)
y = df['high_profit_product']

In [20]:
y.head()

0    0
1    1
2    1
3    1
4    1
Name: high_profit_product, dtype: int64

In [21]:
# Split-out validation dataset
validation_size = 0.20
seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=validation_size, random_state=seed)

In [22]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
#models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('RandomForest', RandomForestClassifier(n_estimators=100)))
models.append(('neural_network', MLPClassifier()))
# evaluate each model in turn
training_results = []
testing_results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state=42)
    train_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    test_results = cross_val_score(model, X_test, y_test, cv=kfold, scoring='accuracy')
    training_results.append(train_results)
    testing_results.append(test_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, train_results.mean(),test_results.mean() )
    print(msg)

LR: 1.000000 (1.000000)
KNN: 1.000000 (1.000000)
CART: 1.000000 (1.000000)
NB: 0.960548 (0.953552)
SVM: 1.000000 (1.000000)
RandomForest: 1.000000 (1.000000)
neural_network: 1.000000 (1.000000)


### Fitting LR model and adding Regularization

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [24]:
logreg = LogisticRegression(solver='lbfgs', max_iter = 200, penalty="l2", C=100)
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_train)
y_pred_class1 = logreg.predict(X_test)
print('accuracy = %.4f' % accuracy_score(y_test, y_pred_class1))
print('accuracy = %.4f' % accuracy_score(y_train, y_pred_class))

accuracy = 1.0000
accuracy = 1.0000


In [25]:
#online_order	order_status	list_price	standard_cost	gross_profit	
#margin_percentage	markup_percentage	product_class_label_1	product_class_label_2	
#product_class_label_3	product_size_label_1	product_size_label_2	product_size_label_3
logreg.coef_

array([[-5.94849269e-03, -7.75841485e-03,  2.27714626e-02,
        -6.71548643e-01,  6.94320106e-01, -1.74703703e-01,
        -8.27515510e-02, -2.74272864e-05,  7.96347409e-03,
        -1.53250427e-02, -1.02940574e-05,  7.88135621e-03,
        -1.52600581e-02]])

In [32]:
df.high_profit_product.describe()

count    19803.000000
mean         0.347776
std          0.476276
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: high_profit_product, dtype: float64