# Logistic Regression Example

Glass identification dataset: https://archive.ics.uci.edu/ml/datasets/Glass+Identification

### Read Data

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
pd.set_option('display.max_columns', None)

In [2]:
# Load the glass data file; the column names are supplied explicitly via `names`.
df = pd.read_csv(
    'glass.data',
    names=['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'type_int'],
)

In [3]:
# Preview the first rows to check that the supplied column names line up with the values.
df.head()

Unnamed: 0,Id,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,type_int
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
# List the distinct integer class codes that actually occur in the data.
df['type_int'].unique()

array([1, 2, 3, 5, 6, 7], dtype=int64)

In [5]:
# Translate the integer class code in 'type_int' into a readable label,
# using a single code -> label lookup table.
df['type'] = df['type_int'].replace({
    1: 'building_windows_float_processed',
    2: 'building_windows_non_float_processed',
    3: 'vehicle_windows_float_processed',
    4: 'vehicle_windows_non_float_processed',
    5: 'containers',
    6: 'tableware',
    7: 'headlamps',
})

In [6]:
# Preview again to confirm that the new 'type' column was added.
df.head()

Unnamed: 0,Id,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,type_int,type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1,building_windows_float_processed
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1,building_windows_float_processed
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1,building_windows_float_processed
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1,building_windows_float_processed
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1,building_windows_float_processed


In [7]:
# Distinct class labels present after mapping the integer codes.
df['type'].unique()

array(['building_windows_float_processed',
       'building_windows_non_float_processed',
       'vehicle_windows_float_processed', 'containers', 'tableware',
       'headlamps'], dtype=object)

### Build X and y

In [8]:
# Feature matrix: label-based column slice from 'RI' through 'Fe' (both ends
# inclusive), which leaves out 'Id' and the two target columns.
X = df.loc[:,'RI':'Fe']
X

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0
...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0


In [9]:
# Target: the string class labels are used as-is. The fit/predict cells below
# work directly on these strings, so no label-encoding step is applied.
y = df['type']
y

0      building_windows_float_processed
1      building_windows_float_processed
2      building_windows_float_processed
3      building_windows_float_processed
4      building_windows_float_processed
                     ...               
209                           headlamps
210                           headlamps
211                           headlamps
212                           headlamps
213                           headlamps
Name: type, Length: 214, dtype: object

### Split Data

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# `stratify=y` makes this a stratified (not purely random) sample, so each class
# keeps roughly the same share in both subsets.
# A fixed `random_state` makes the split, and every metric computed from it,
# reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)

Passing `stratify=y` splits the data in a stratified fashion, which keeps the proportion of each class approximately the same in the train and test sets:

In [11]:
# Share of each class in the test set (relative frequencies sum to 1).
y_test.value_counts(normalize=True)

building_windows_non_float_processed    0.348837
building_windows_float_processed        0.325581
headlamps                               0.139535
vehicle_windows_float_processed         0.069767
containers                              0.069767
tableware                               0.046512
Name: type, dtype: float64

In [12]:
# Share of each class in the training set (relative frequencies sum to 1).
y_train.value_counts(normalize=True)

building_windows_non_float_processed    0.356725
building_windows_float_processed        0.327485
headlamps                               0.134503
vehicle_windows_float_processed         0.081871
containers                              0.058480
tableware                               0.040936
Name: type, dtype: float64

In [13]:
# Sanity check on the split: X and y must have matching row counts within each subset.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((171, 9), (43, 9), (171,), (43,))

### Apply Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Multiclass logistic regression on the raw features, with a very generous
# iteration budget for the solver.
reg = LogisticRegression(max_iter=100_000)
reg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Predict the held-out rows, then list each prediction beside its true label.
predicted = reg.predict(X_test)
[(guess, truth) for guess, truth in zip(predicted, y_test)]

[('building_windows_non_float_processed', 'building_windows_float_processed'),
 ('building_windows_non_float_processed', 'building_windows_float_processed'),
 ('headlamps', 'headlamps'),
 ('tableware', 'building_windows_non_float_processed'),
 ('building_windows_float_processed', 'building_windows_float_processed'),
 ('building_windows_non_float_processed',
  'building_windows_non_float_processed'),
 ('containers', 'containers'),
 ('building_windows_non_float_processed',
  'building_windows_non_float_processed'),
 ('building_windows_non_float_processed',
  'building_windows_non_float_processed'),
 ('building_windows_float_processed', 'building_windows_non_float_processed'),
 ('building_windows_non_float_processed', 'building_windows_float_processed'),
 ('building_windows_non_float_processed', 'containers'),
 ('building_windows_float_processed', 'building_windows_float_processed'),
 ('headlamps', 'headlamps'),
 ('tableware', 'tableware'),
 ('building_windows_non_float_processed', 'vehic

In [17]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label.
print("Test accuracy:")
accuracy_score(y_true=y_test, y_pred=predicted)

Test accuracy:


0.6511627906976745

In [18]:
# `accuracy_score` is already imported by the test-accuracy cell just above.
# Accuracy on the training rows, for comparison with the test accuracy.
print("Train accuracy:")
accuracy_score(y_train, reg.predict(X_train))

Train accuracy:


0.6783625730994152

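The accuracy is not very good. In the run shown above, the train and test accuracies are similarly low, which suggests that the model is too simple (it underfits) rather than overfitting. We need a more complex model, or maybe more data, or both!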