In [1]:
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures # 다항식 회귀 tool
from sklearn.model_selection import train_test_split # 데이터 분할
from sklearn.linear_model import LinearRegression  # 선형 회귀
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
# Load the Boston house-prices dataset bundled with scikit-learn, then show
# its description text followed by the shape of the feature matrix.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this notebook needs an older scikit-learn release to run.
boston_dataset = datasets.load_boston()
print(boston_dataset.DESCR, boston_dataset.data.shape, sep="\n")


.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
# List the column labels of the feature matrix (the Bunch returned by the
# loader supports key access as well as attribute access).
print(boston_dataset["feature_names"])

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [10]:
# Wrap the raw feature matrix in a DataFrame labelled with the feature names.
x = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
print(x)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       15.3  396.90   4.98  
1       17.8  396.90   9.14  
2       1

**단순회귀분석**

In [5]:
# Simple regression uses a single predictor: keep only the NOX column.
# A list of labels is passed so the result stays a one-column DataFrame.
# Note that this rebinds `x`, so the other feature columns are no longer in it.
x = x.loc[:, ['NOX']]
print(x)

       NOX
0    0.538
1    0.469
2    0.469
3    0.458
4    0.458
..     ...
501  0.573
502  0.573
503  0.573
504  0.573
505  0.573

[506 rows x 1 columns]


In [6]:
# Put the regression target into a one-column DataFrame named MEDV.
y = pd.DataFrame(
    data=boston_dataset.target,
    columns=['MEDV'],
)
print(y)

     MEDV
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
..    ...
501  22.4
502  20.6
503  23.9
504  22.0
505  11.9

[506 rows x 1 columns]


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
print(x_train, x_test, y_train, y_test)

       NOX
33   0.538
283  0.401
418  0.679
502  0.573
402  0.693
..     ...
486  0.583
189  0.437
495  0.585
206  0.489
355  0.413

[404 rows x 1 columns]        NOX
226  0.504
292  0.411
90   0.489
373  0.668
273  0.464
..     ...
349  0.429
212  0.489
156  0.871
480  0.532
248  0.431

[102 rows x 1 columns]      MEDV
33   13.1
283  50.0
418   8.8
502  20.6
402  12.1
..    ...
486  19.1
189  34.9
495  23.1
206  24.4
355  20.6

[404 rows x 1 columns]      MEDV
226  37.6
292  27.9
90   22.6
373  13.8
273  35.2
..    ...
349  26.6
212  22.4
156  13.1
480  23.0
248  24.5

[102 rows x 1 columns]


In [8]:
# Fit ordinary least squares on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained), then
# predict the target for the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_test_prediction = model.predict(x_test)
print(y_test_prediction)

[[24.28982638]
 [27.5015958 ]
 [24.8078537 ]
 [18.62606094]
 [25.67123258]
 [18.24617423]
 [21.90690067]
 [24.18622091]
 [22.80481471]
 [21.66515459]
 [26.32740053]
 [23.90993967]
 [22.90842017]
 [25.39495134]
 [25.67123258]
 [26.43100599]
 [21.07805695]
 [11.61542445]
 [21.49247881]
 [21.07805695]
 [20.66363509]
 [20.14560776]
 [24.28982638]
 [22.90842017]
 [27.88148251]
 [16.13952977]
 [24.66971308]
 [22.80481471]
 [23.90993967]
 [27.5015958 ]
 [21.63061943]
 [20.66363509]
 [17.76268206]
 [17.07197896]
 [26.74182239]
 [20.14560776]
 [24.18622091]
 [18.24617423]
 [19.3512992 ]
 [26.91449816]
 [17.76268206]
 [20.14560776]
 [24.18622091]
 [17.07197896]
 [26.60368177]
 [27.88148251]
 [17.76268206]
 [21.63061943]
 [11.61542445]
 [20.14560776]
 [23.73726389]
 [24.66971308]
 [26.60368177]
 [23.1156311 ]
 [11.61542445]
 [27.53613096]
 [26.32740053]
 [17.07197896]
 [26.60368177]
 [26.53461146]
 [21.49247881]
 [24.08261545]
 [20.14560776]
 [20.80177571]
 [15.10347512]
 [17.76268206]
 [23.11563

**다중회귀분석**

In [11]:
# Multiple regression uses two predictors, NOX and AGE.
# Build the selection from the dataset itself rather than from `x`: the
# simple-regression section earlier narrowed `x` to the NOX column only, so
# indexing `x` for AGE raises a KeyError when the notebook is run top to
# bottom on a fresh kernel.
x = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)[['NOX', 'AGE']]
print(x)

       NOX   AGE
0    0.538  65.2
1    0.469  78.9
2    0.469  61.1
3    0.458  45.8
4    0.458  54.2
..     ...   ...
501  0.573  69.1
502  0.573  76.7
503  0.573  91.0
504  0.573  89.3
505  0.573  80.8

[506 rows x 2 columns]


In [12]:
# Re-split with the same seed, fit a linear model on the current predictors,
# and show the learned coefficients followed by the intercept.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
model = LinearRegression().fit(x_train, y_train)
print(model.coef_, model.intercept_, sep="\n")

[[-25.89408203  -0.04866708]]
[40.22513315]


In [13]:
# Predict on the held-out rows, then print the predictions followed by the
# root-mean-squared error (square root of the MSE).
y_test_prediction = model.predict(x_test)
print(
    y_test_prediction,
    mean_squared_error(y_test, y_test_prediction) ** 0.5,
    sep="\n",
)

[[22.96481372]
 [28.44385585]
 [24.3460333 ]
 [18.06117874]
 [25.68932454]
 [18.30681497]
 [20.95912022]
 [23.860473  ]
 [22.08983687]
 [21.55653486]
 [25.66057439]
 [24.68019565]
 [22.40598778]
 [26.0029096 ]
 [25.35352172]
 [26.23466081]
 [19.89965856]
 [12.80468009]
 [21.52926531]
 [19.89965856]
 [19.68626373]
 [19.30271921]
 [23.36388374]
 [22.86832501]
 [28.85522515]
 [16.36227289]
 [24.81672847]
 [22.53270726]
 [25.0354653 ]
 [27.8355174 ]
 [20.4599651 ]
 [19.67166361]
 [17.67176219]
 [16.930012  ]
 [28.11752143]
 [19.31245262]
 [23.37380223]
 [18.00021239]
 [19.01862461]
 [28.37352624]
 [17.77396305]
 [19.20051835]
 [23.16453381]
 [17.48481667]
 [26.42739842]
 [27.56068093]
 [17.41382669]
 [20.65950012]
 [12.93608119]
 [19.51198764]
 [22.61377561]
 [25.33259948]
 [26.71940087]
 [21.57827734]
 [12.8776807 ]
 [27.8711449 ]
 [26.90645153]
 [17.55295058]
 [27.39587323]
 [26.63354576]
 [21.19832919]
 [24.72206532]
 [19.39518665]
 [19.81904031]
 [16.22785584]
 [17.60849499]
 [21.69994

**다항회귀분석**

In [14]:
# Degree-2 polynomial expansion of all features; for a degree-10 expansion
# pass 10 instead.
polynomial_transformer = PolynomialFeatures(degree=2)
polynomial_data = polynomial_transformer.fit_transform(boston_dataset.data)
print(polynomial_data.shape)

(506, 105)


In [15]:
# Names of the expanded columns (e.g. 'CRIM^2', 'CRIM ZN').
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed release provides it and
# fall back to the old method otherwise. list() keeps the result a plain
# list of strings, as the old method returned.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    polynomial_feature_names = list(
        polynomial_transformer.get_feature_names_out(boston_dataset.feature_names)
    )
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(boston_dataset.feature_names)
print(polynomial_feature_names)


['1', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'CRIM^2', 'CRIM ZN', 'CRIM INDUS', 'CRIM CHAS', 'CRIM NOX', 'CRIM RM', 'CRIM AGE', 'CRIM DIS', 'CRIM RAD', 'CRIM TAX', 'CRIM PTRATIO', 'CRIM B', 'CRIM LSTAT', 'ZN^2', 'ZN INDUS', 'ZN CHAS', 'ZN NOX', 'ZN RM', 'ZN AGE', 'ZN DIS', 'ZN RAD', 'ZN TAX', 'ZN PTRATIO', 'ZN B', 'ZN LSTAT', 'INDUS^2', 'INDUS CHAS', 'INDUS NOX', 'INDUS RM', 'INDUS AGE', 'INDUS DIS', 'INDUS RAD', 'INDUS TAX', 'INDUS PTRATIO', 'INDUS B', 'INDUS LSTAT', 'CHAS^2', 'CHAS NOX', 'CHAS RM', 'CHAS AGE', 'CHAS DIS', 'CHAS RAD', 'CHAS TAX', 'CHAS PTRATIO', 'CHAS B', 'CHAS LSTAT', 'NOX^2', 'NOX RM', 'NOX AGE', 'NOX DIS', 'NOX RAD', 'NOX TAX', 'NOX PTRATIO', 'NOX B', 'NOX LSTAT', 'RM^2', 'RM AGE', 'RM DIS', 'RM RAD', 'RM TAX', 'RM PTRATIO', 'RM B', 'RM LSTAT', 'AGE^2', 'AGE DIS', 'AGE RAD', 'AGE TAX', 'AGE PTRATIO', 'AGE B', 'AGE LSTAT', 'DIS^2', 'DIS RAD', 'DIS TAX', 'DIS PTRATIO', 'DIS B', 'DIS LSTAT', 'RAD^2', 'RAD TAX',

In [16]:
# Label the expanded feature matrix with the generated polynomial names.
x = pd.DataFrame(
    data=polynomial_data,
    columns=polynomial_feature_names,
)
print(x)

       1     CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  ...  \
0    1.0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  ...   
1    1.0  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  ...   
2    1.0  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  ...   
3    1.0  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  ...   
4    1.0  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  ...   
..   ...      ...   ...    ...   ...    ...    ...   ...     ...  ...  ...   
501  1.0  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  ...   
502  1.0  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  ...   
503  1.0  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  ...   
504  1.0  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  ...   
505  1.0  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  ...   

       TAX^2  TAX PTRATIO      TAX B  TAX LSTAT  PTRATIO^2  PTR

In [17]:
# Same 80/20 split and seed as before, now on the polynomial features.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
# Linear regression on polynomial features = polynomial regression.
model = LinearRegression().fit(x_train, y_train)
y_test_prediction = model.predict(x_test)
print(y_test_prediction)

[[42.00919669]
 [28.22372855]
 [25.52688254]
 [ 9.05454749]
 [33.5903981 ]
 [10.49656653]
 [23.0597188 ]
 [30.35330832]
 [24.2246103 ]
 [22.30153137]
 [33.11791368]
 [20.74987646]
 [20.19664018]
 [32.37012974]
 [27.35813074]
 [20.46264672]
 [13.68533393]
 [12.51890372]
 [15.88730519]
 [12.47719001]
 [ 3.72827179]
 [20.49819423]
 [44.09350121]
 [23.31624023]
 [33.2791559 ]
 [ 9.43400657]
 [24.71325022]
 [21.79459245]
 [24.06481669]
 [27.42603119]
 [15.3289399 ]
 [ 6.80742072]
 [16.76243454]
 [13.13446141]
 [25.10746984]
 [22.92666538]
 [29.58310464]
 [10.66362649]
 [47.75889196]
 [35.24353036]
 [19.90943076]
 [15.25028015]
 [28.15702648]
 [14.02415348]
 [26.11171521]
 [28.60093424]
 [15.67989876]
 [18.96653866]
 [20.2094799 ]
 [14.03412949]
 [21.76275324]
 [22.26202924]
 [22.01832275]
 [15.44624876]
 [15.75821427]
 [26.30451802]
 [40.97147443]
 [14.50276205]
 [29.38387137]
 [21.7344614 ]
 [18.72108633]
 [24.58475299]
 [14.30971895]
 [36.2957301 ]
 [26.41019983]
 [10.60479846]
 [16.95708

In [18]:
# Root-mean-squared error of the polynomial model on the held-out rows
# (square root of the mean squared error).
print(
    mean_squared_error(y_test, y_test_prediction) ** 0.5
)

3.196527651079712


In [19]:
# Degree-2 polynomial regression on the diabetes dataset: expand the
# features, split 80/20 with a fixed seed, fit, and report the test RMSE.
diabetes_dataset = datasets.load_diabetes()

polynomial_transformer = PolynomialFeatures(2)
polynomial_data = polynomial_transformer.fit_transform(diabetes_dataset.data)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed release provides it and
# fall back to the old method otherwise.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    polynomial_feature_names = list(
        polynomial_transformer.get_feature_names_out(diabetes_dataset.feature_names)
    )
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(diabetes_dataset.feature_names)
x = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

y = pd.DataFrame(diabetes_dataset.target, columns=['diabetes'])


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)
model = LinearRegression()
model.fit(x_train, y_train)
y_test_predict = model.predict(x_test)

# The square root of the MSE is the RMSE, so the variable is named for what
# it actually holds.
rmse = mean_squared_error(y_test, y_test_predict) ** 0.5
print(rmse)

57.87704902724908


In [20]:
# Load the iris dataset and show its description text.
iris_data = datasets.load_iris()
print(iris_data["DESCR"])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [21]:
# Feature matrix for classification, labelled with the iris feature names.
x = pd.DataFrame(
    data=iris_data.data,
    columns=iris_data.feature_names,
)
print(x)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


In [22]:
# Class labels in a one-column DataFrame named 'class'.
y = pd.DataFrame(
    data=iris_data.target,
    columns=['class'],
)
print(y)

     class
0        0
1        0
2        0
3        0
4        0
..     ...
145      2
146      2
147      2
148      2
149      2

[150 rows x 1 columns]


In [23]:
# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
# Flatten the one-column training target to a 1-D array of labels before
# it is handed to the classifier.
y_train = y_train.to_numpy().ravel()

In [24]:
# Multiclass logistic regression on the iris features.
# The 'saga' solver shuffles the data internally, so fix random_state to make
# the fitted model (and therefore the printed predictions and accuracy)
# reproducible across runs; 5 matches the seed used for the splits.
model = LogisticRegression(solver='saga', max_iter=3500, random_state=5)
model.fit(x_train, y_train)
# Predicted class for each held-out row, then the mean accuracy on the test split.
print(model.predict(x_test))
print(model.score(x_test, y_test))

[1 2 2 0 2 1 0 2 0 1 1 2 2 2 0 0 2 2 0 0 1 2 0 1 1 2 1 1 1 2]
0.9666666666666667
