# IE7275 - Logistic Regression, Linear Discriminant Analysis, and SVM

Note this codes for demonstration purpose in the course IE7275. 

By Yilin Yin and Chun-An Chou

### We demonstrate how to apply logisitic regression for classification

In [53]:
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import io
from sklearn.metrics import confusion_matrix

In [54]:
# import data about Housing Conditions in Copenhagen online
response = requests.get('https://data.princeton.edu/wws509/datasets/copen.dat')
newdata = pd.read_csv(io.StringIO(response.text),delim_whitespace=True)

# data rearrangement 
newdata =pd.get_dummies(newdata, columns=["housing","influence","contact"])
mapping = {'low': 1, 'medium': 2,'high': 3}
newdata = newdata.replace({'satisfaction': mapping})

In [55]:
newdata

Unnamed: 0,satisfaction,n,housing_apartments,housing_atrium,housing_terraced,housing_tower,influence_high,influence_low,influence_medium,contact_high,contact_low
1,1,21,0,0,0,1,0,1,0,0,1
2,2,21,0,0,0,1,0,1,0,0,1
3,3,28,0,0,0,1,0,1,0,0,1
4,1,14,0,0,0,1,0,1,0,1,0
5,2,19,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
68,2,5,0,0,1,0,1,0,0,0,1
69,3,11,0,0,1,0,1,0,0,0,1
70,1,5,0,0,1,0,1,0,0,1,0
71,2,6,0,0,1,0,1,0,0,1,0


In [56]:
# Impliment LR 
X = newdata[newdata.columns[1:13]]
Y = newdata.iloc[:,0]

X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.30, random_state=42)

model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

clf = model.fit(X_train.iloc[:,1:11], y_train, sample_weight=X_train['n'])

y_pred = clf.predict(X_test.iloc[:,1:11])

print(confusion_matrix(y_test, y_pred))

[[1 2 5]
 [1 0 7]
 [1 2 3]]


### We demonstrate how to apply LDA for classification

In [57]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np

In [58]:
mydata = pd.read_csv('RidingMowers.csv')
X = mydata[mydata.columns[0:2]]
y = mydata.iloc[:,2]

# Use LDA library 
clf = LinearDiscriminantAnalysis().fit(X, y)

clf.predict_proba(X)

array([[0.78203155, 0.21796845],
       [0.49449211, 0.50550789],
       [0.15236751, 0.84763249],
       [0.31924493, 0.68075507],
       [0.00402325, 0.99597675],
       [0.0124668 , 0.9875332 ],
       [0.05188913, 0.94811087],
       [0.01554353, 0.98445647],
       [0.29300716, 0.70699284],
       [0.01956031, 0.98043969],
       [0.3436552 , 0.6563448 ],
       [0.11070233, 0.88929767],
       [0.2371929 , 0.7628071 ],
       [0.52865847, 0.47134153],
       [0.8505169 , 0.1494831 ],
       [0.80075893, 0.19924107],
       [0.37757951, 0.62242049],
       [0.95203727, 0.04796273],
       [0.96165849, 0.03834151],
       [0.66288177, 0.33711823],
       [0.98387005, 0.01612995],
       [0.97514892, 0.02485108],
       [0.99644001, 0.00355999],
       [0.97819391, 0.02180609]])

In [59]:
# manual LDA
nonowndata = mydata[mydata.Ownership == 'Nonowner']
owndata = mydata[mydata.Ownership == 'Owner']

# compute sample means and variances for individual classes. 
mean_own = owndata.iloc[:,0:2].mean().to_numpy()
mean_nonown = nonowndata.iloc[:,0:2].mean().to_numpy()

cov_own = owndata.iloc[:,0:2].cov().to_numpy()
cov_nonown = nonowndata.iloc[:,0:2].cov().to_numpy()

# compute the projected space
B = np.outer((mean_own - mean_nonown),(mean_own - mean_nonown).T)

S = cov_own+cov_nonown

inv_S = np.linalg.inv(S)

S_B = np.multiply(inv_S,B)

values, W = np.linalg.eig(S_B)

W = W[:,0]
W


array([0.77200473, 0.63561679])

In [60]:
proj_mean_own = np.multiply(W, mean_own)
proj_mean_nonown = np.multiply(W, mean_nonown)

# compute bias
W0 = np.mean([proj_mean_own,proj_mean_nonown])
W0

32.439505834266114

In [61]:
def disc(W,x,W0):
    Y = W[0] * x.iloc[:,0] + W[1] * x.iloc[:,1]
    Pred_Y = np.where(Y >= W0, 'Owner','Nonowner')
    return Pred_Y

Pred_Y = disc(W,mydata.iloc[:,0:2],W0)

In [62]:
print(Pred_Y)

['Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner'
 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner'
 'Owner' 'Owner' 'Owner' 'Owner' 'Owner' 'Owner']


### We demonstrate how to apply SVM for classification

In [72]:
from sklearn import svm

In [73]:
# The data used in the lecture note

d = {'x1': [3.5,4,4,4.5,4.9,5,5.5,5.5,0.5,1,1.25,1.5,2,2.5], 'x2': [4.25,3,4,1.75,4.5,4,2.5,3.5,1.5,2.5,0.5,1.5,2,0.75],
     'y' : [1,1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1]}
mydata = pd.DataFrame(d)
mydata

Unnamed: 0,x1,x2,y
0,3.5,4.25,1
1,4.0,3.0,1
2,4.0,4.0,1
3,4.5,1.75,1
4,4.9,4.5,1
5,5.0,4.0,1
6,5.5,2.5,1
7,5.5,3.5,1
8,0.5,1.5,-1
9,1.0,2.5,-1


In [74]:
d2 = {'x1': [4,2,3,5], 'x2': [2,3,2,3], 'y' : [1,1,-1,-1]}
testdata = pd.DataFrame(d2)
testdata

Unnamed: 0,x1,x2,y
0,4,2,1
1,2,3,1
2,3,2,-1
3,5,3,-1


In [75]:
X = mydata.drop(['y'],axis= 1)
y = mydata['y']

In [76]:
clf = svm.SVC(kernel="linear").fit(X, y)
sv = clf.support_vectors_

In [77]:
print(sv)

[[2.   2.  ]
 [2.5  0.75]
 [4.5  1.75]]


In [78]:
print(clf.predict(testdata.iloc[:,0:2]))

[ 1 -1 -1  1]


In [83]:
# Another example of large data
mydata = pd.read_csv('UniversalBank.csv')
cols = [0,4]
mydata = mydata.drop(mydata.columns[cols], axis=1)
X = mydata.drop(['Personal Loan'],axis= 1)
y = mydata['Personal Loan']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

clf = svm.SVC(kernel="linear").fit(X_train, y_train)
sv = clf.support_vectors_