## Data preparation

In [16]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split


np.set_printoptions(precision =2)

from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression')
X_R1, y_R1 = make_regression(n_samples = 100, n_features = 1, n_informative = 1, bias = 150.0,
                            noise = 30, random_state = 0)
plt.scatter(X_R1, y_R1, marker='o', s= 50)
plt.show()

<IPython.core.display.Javascript object>

In [11]:
# classification
fruits = pd.read_table('fruit_data_with_colors.txt')

X_fruits = fruits[['height', 'width','mass','color_score']]
y_fruits = fruits['fruit_label']

X_fruits_2d = fruits[['height','width']]
y_fruits_2d = fruits['fruit_label']

# cancer dataset, evaluation
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)

In [12]:
X_fruits.head()

Unnamed: 0,height,width,mass,color_score
0,7.3,8.4,192,0.55
1,6.8,8.0,180,0.59
2,7.2,7.4,176,0.6
3,4.7,6.2,86,0.8
4,4.6,6.0,84,0.79


In [13]:
X_cancer

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [14]:
y_cancer

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

## Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression

# train_test_split : training 70% test 30 %로 나눠줌
X_train, X_test, y_train, y_test = train_test_split(X_R1,y_R1, random_state = 0)
linreg = LinearRegression().fit(X_train,y_train)

# 기울기
# linreg= 리니어 리그레션 결과,  coef_ = 언더바는 미리 정의돼있는 배리어블
print('linear model coeff(w): {}', format(linreg.coef_))

# y 절편
print('linear model intercept(b): {:.2f}'.format(linreg.intercept_))

# score (트레이닝 셋과 테스트 셋의 정확도)
print('R=squared score(training): {:.3f}'.format(linreg.score(X_train,y_train)))
print('R=squared score(training): {:.3f}'.format(linreg.score(X_test,y_test)))

linear model coeff(w): {} [45.71]
linear model intercept(b): 148.45
R=squared score(training): 0.679
R=squared score(training): 0.492


In [24]:
plt.figure(figsize=(5,4))
plt.scatter(X_R1,y_R1, marker = 'o', s = 50, alpha = 0.8)
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value(x)')
plt.ylabel('Target value(y)')
plt.show()

<IPython.core.display.Javascript object>

## Logistic Regresstion

In [29]:
from sklearn.linear_model import LogisticRegression

y_fruits_apple = y_fruits_2d == 1
X_train,X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple,random_state = 0)

clf = LogisticRegression().fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 0.75
Accuracy of Logistic regression classifier on test set: 0.67


In [30]:
# 예측
clf.predict([[6,8]])

array([ True])

In [36]:

X_train,X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,random_state = 0)

clf = LogisticRegression().fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 0.95
Accuracy of Logistic regression classifier on test set: 0.94


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Support Vector Machine

In [35]:
from sklearn.svm import SVC

X_train,X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple,random_state = 0)

clf = SVC(kernel = 'linear').fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 0.84
Accuracy of Logistic regression classifier on test set: 0.67


In [34]:

X_train,X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,random_state = 0)

clf = SVC(kernel = 'linear').fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 0.97
Accuracy of Logistic regression classifier on test set: 0.96


## Decision Tree

In [39]:
from sklearn.tree import DecisionTreeClassifier


X_train,X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple,random_state = 0)

clf = DecisionTreeClassifier().fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 1.00
Accuracy of Logistic regression classifier on test set: 0.67


In [45]:

X_train,X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,random_state = 0)

clf = DecisionTreeClassifier().fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 1.00
Accuracy of Logistic regression classifier on test set: 0.87


## Random Forest

In [51]:
# Dicision tree를 합친다(bec, 디시젼 트리는 overfitting이 많이 일어나기 때문에 여러 트리를 합침으로써 문제 해결)
# Random Forest Split

from sklearn.ensemble import RandomForestClassifier


X_train,X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_apple,random_state = 0)

clf = RandomForestClassifier(n_estimators = 10, random_state = 0).fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 1.00
Accuracy of Logistic regression classifier on test set: 0.67


In [53]:

X_train,X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,random_state = 0)

clf = RandomForestClassifier(max_features = 8, n_estimators = 10, random_state = 0).fit(X_train, y_train)

print("Accuracy of Logistic regression classifier on training set: {:.2f}".format(clf.score(X_train,y_train)))
print("Accuracy of Logistic regression classifier on test set: {:.2f}".format(clf.score(X_test,y_test)))

Accuracy of Logistic regression classifier on training set: 1.00
Accuracy of Logistic regression classifier on test set: 0.99
