## Machine Learning 실습

### Regression With Python

In [1]:
# IPython magic: turn off Jedi-based tab completion for this session.
%config Completer.use_jedi = False

In [38]:
# 1. Import packages
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the salary dataset; the path is relative to the current working directory.
sr_data = pd.read_csv(filepath_or_buffer='../202106_MachineLearningClass/01SR_Data.csv')
# Report the library versions this notebook was run with.
print('pandas 버젼 : ', pd.__version__)
print('numpy 버젼 : ', np.__version__)

pandas 버젼 :  1.2.4
numpy 버젼 :  1.19.5


In [3]:
# 2. The data as a pandas DataFrame
# A bare expression on the last line of a cell makes the notebook display it.
sr_data

Unnamed: 0,Country,Age,Year,Salary
0,Spain,27.0,3.0,48000
1,Spain,,6.0,52000
2,Germany,30.0,2.0,54000
3,France,35.0,,58000
4,Spain,38.0,,61000
5,Germany,40.0,10.0,61000
6,France,37.0,7.0,67000
7,France,44.0,15.0,72000
8,France,48.0,,79000
9,Germany,50.0,21.0,83000


In [4]:
# 3-1. Take a look at the data: head() shows the first rows (5 by default).
sr_data.head()

Unnamed: 0,Country,Age,Year,Salary
0,Spain,27.0,3.0,48000
1,Spain,,6.0,52000
2,Germany,30.0,2.0,54000
3,France,35.0,,58000
4,Spain,38.0,,61000


In [5]:
# 3-2. Check the data info: per-column dtype and non-null count.
sr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  10 non-null     object 
 1   Age      9 non-null      float64
 2   Year     7 non-null      float64
 3   Salary   10 non-null     int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 448.0+ bytes


In [6]:
# 3-3. Summary statistics; include='all' also summarizes the non-numeric
# Country column (unique / top / freq) alongside the numeric ones.
sr_data.describe(include='all')

Unnamed: 0,Country,Age,Year,Salary
count,10,9.0,7.0,10.0
unique,3,,,
top,France,,,
freq,4,,,
mean,,38.777778,9.142857,63500.0
std,,7.693793,6.817345,11597.413505
min,,27.0,2.0,48000.0
25%,,35.0,4.5,55000.0
50%,,38.0,7.0,61000.0
75%,,44.0,12.5,70750.0


In [7]:
# 4. Split into features and label
# Features: every column except the target 'Salary'.
feature = sr_data.drop(columns='Salary')
# Label: the list selector keeps the target as a one-column DataFrame
# rather than a Series.
label = sr_data.loc[:, ['Salary']]

# Show both pieces, separated by a 77-character rule.
print(feature, '-' * 77, label, sep='\n')

   Country   Age  Year
0    Spain  27.0   3.0
1    Spain   NaN   6.0
2  Germany  30.0   2.0
3   France  35.0   NaN
4    Spain  38.0   NaN
5  Germany  40.0  10.0
6   France  37.0   7.0
7   France  44.0  15.0
8   France  48.0   NaN
9  Germany  50.0  21.0
-----------------------------------------------------------------------------
   Salary
0   48000
1   52000
2   54000
3   58000
4   61000
5   61000
6   67000
7   72000
8   79000
9   83000


In [8]:
# 5. Fill in missing values (mean)
# First count the missing entries in each column.
sr_data.isnull().sum()

Country    0
Age        1
Year       3
Salary     0
dtype: int64

In [9]:
# Replace NaNs in the numeric columns (everything after column 0, Country)
# with the mean of the respective column.
mean_imp = SimpleImputer(strategy='mean')
feature.iloc[:, 1:] = mean_imp.fit_transform(feature.iloc[:, 1:])

# Confirm that no missing values remain in the features.
feature.isnull().sum()

Country    0
Age        0
Year       0
dtype: int64

In [10]:
# 6. One-hot encoding
# Encode column 0 (Country); all remaining columns are passed through as-is.
ct = ColumnTransformer(
    transformers=[('one_hot_encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
feature = ct.fit_transform(feature)

print(feature)

[[ 0.          0.          1.         27.          3.        ]
 [ 0.          0.          1.         38.77777778  6.        ]
 [ 0.          1.          0.         30.          2.        ]
 [ 1.          0.          0.         35.          9.14285714]
 [ 0.          0.          1.         38.          9.14285714]
 [ 0.          1.          0.         40.         10.        ]
 [ 1.          0.          0.         37.          7.        ]
 [ 1.          0.          0.         44.         15.        ]
 [ 1.          0.          0.         48.          9.14285714]
 [ 0.          1.          0.         50.         21.        ]]


In [46]:
# 7. Split data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    label,
    test_size=0.2,
    random_state=172,
)

# Show the training features and labels, separated by a 77-character rule.
print(X_train, '-' * 77, y_train, sep='\n')

[[ 1.          0.          0.         35.          9.14285714]
 [ 1.          0.          0.         48.          9.14285714]
 [ 0.          0.          1.         38.          9.14285714]
 [ 0.          1.          0.         40.         10.        ]
 [ 1.          0.          0.         37.          7.        ]
 [ 1.          0.          0.         44.         15.        ]
 [ 0.          1.          0.         30.          2.        ]
 [ 0.          1.          0.         50.         21.        ]]
-----------------------------------------------------------------------------
   Salary
3   58000
8   79000
4   61000
5   61000
6   67000
7   72000
2   54000
9   83000


In [47]:
# 8. Train
# Fit a linear regression model on the training split.

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression()

In [48]:
# 8-1. Train a second model: a decision tree regressor on the same training split.
# NOTE(review): no random_state is passed here, so the fitted tree may differ
# between runs — confirm whether reproducibility matters.

tree_model = DecisionTreeRegressor().fit(X_train, y_train)

In [49]:
# 9. Score
# Predict on the held-out rows and print the predictions above the true values.
y_pred = linear_model.predict(X_test)
print(y_pred, y_test, sep='\n')

[[62333.30604545]
 [45188.74009869]]
   Salary
1   52000
0   48000


In [50]:
# Predictions of the decision tree model on the same held-out rows.
y_pred_tree = tree_model.predict(X_test)

In [51]:
# 10. Evaluate
# Metrics for the linear regression predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print('mae = ', mae)
print('mse = ', mse)
print('r2 = ', r2)

mae =  6572.282973379846
mse =  57340198.03082107
r2 =  -13.335049507705268


In [55]:
# Metrics for the decision tree predictions.
# mean_squared_error is called WITHOUT squared=False so that `mset` is a true
# MSE, directly comparable with the linear model's `mse`. With squared=False the
# function returns the RMSE instead, which was being stored and printed as "mse".
maet = mean_absolute_error(y_test, y_pred_tree)
mset = mean_squared_error(y_test, y_pred_tree)
r2t = r2_score(y_test, y_pred_tree)

print('mae = ',maet)
print('mse = ',mset)
print('r2 = ',r2t)

mae =  10500.0
mse =  130500000.0
r2 =  -31.625
