In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression,SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import  make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import matplotlib
# Plot styling: select a font family able to render Korean text in labels, and
# draw minus signs as an ASCII hyphen instead of the Unicode minus glyph.
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows —
# confirm, or choose a platform-appropriate font when running elsewhere.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [2]:
# Toy dataset: one numeric feature, one categorical (string) feature and a
# binary column. The last column is split off as the target in the next cell.
data = dict([
    ('수치형_특징', [10, 20, 30, 40, 50]),
    ('범주형_특징', ['A', 'B', 'A', 'C', 'B']),
    ('그대로_유지', [1, 0, 1, 0, 1]),
])
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,수치형_특징,범주형_특징,그대로_유지
0,10,A,1
1,20,B,0
2,30,A,1
3,40,C,0
4,50,B,1


In [3]:
# Features = every column except the last; target = the last column.
# Both are converted to NumPy arrays.
x_data, y_data = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [4]:
# Baseline pipeline: ordinal-encode *every* feature column (the numeric one
# included), then fit a logistic-regression classifier on the encoded matrix.
# make_pipeline(OrdinalEncoder(), LogisticRegression()) would be the auto-named
# shorthand; explicit step names are used so a step can be looked up by name.
model_pipe = Pipeline(
    steps=[
        ('encode', OrdinalEncoder()),
        ('logi', LogisticRegression(max_iter=500)),
    ]
)
model_pipe.fit(x_data, y_data)

0,1,2
,steps,"[('encode', ...), ('logi', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [5]:
# Predict the class of a single new row: numeric feature 10, category 'A'.
new_row = [[10, 'A']]
model_pipe.predict(new_row)

array([1])

In [6]:
# Inspect the categories the fitted encoder learned for each input column.
# The encoder was applied to all feature columns at once, so the numeric
# column appears as a list of categories as well.
enc = model_pipe['encode']
enc.categories_

[array([10, 20, 30, 40, 50], dtype=object),
 array(['A', 'B', 'C'], dtype=object)]

### ColumnTransformer 이용

In [7]:
# Rebuild the same toy dataset for the ColumnTransformer examples:
# a numeric feature, a categorical feature and a binary last column.
data = {}
data['수치형_특징'] = [10, 20, 30, 40, 50]
data['범주형_특징'] = ['A', 'B', 'A', 'C', 'B']
data['그대로_유지'] = [1, 0, 1, 0, 1]

df = pd.DataFrame(data=data)
df

Unnamed: 0,수치형_특징,범주형_특징,그대로_유지
0,10,A,1
1,20,B,0
2,30,A,1
3,40,C,0
4,50,B,1


### 특성 데이터가 데이터프레임인 경우

In [8]:
# Keep pandas objects this time (no conversion to NumPy) so the column names
# stay available for name-based column selection.
target_column = df.columns[-1]
x_data = df.drop(columns=target_column)
y_data = df[target_column]

In [9]:
# Ordinal-encode only the categorical column, selected by its column name;
# all remaining columns are passed through unchanged.
column_preprocessor = ColumnTransformer(
    transformers=[('enc', OrdinalEncoder(), ['범주형_특징'])],
    remainder='passthrough',
)

# Preprocessing followed by a logistic-regression classifier.
model_cpipe = Pipeline(
    steps=[
        ('ct', column_preprocessor),
        ('logi', LogisticRegression(max_iter=500)),
    ]
)

model_cpipe.fit(x_data, y_data)

0,1,2
,steps,"[('ct', ...), ('logi', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('enc', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [10]:
# A one-row DataFrame carrying the same column names used for fitting, so the
# name-based column selection in the transformer can find the categorical column.
xd = pd.DataFrame([[10, 'A']], columns=['수치형_특징', '범주형_특징'])
model_cpipe.predict(xd)

array([1])

### 특성 데이터가 넘파이 배열인 경우

In [11]:
# Same split as before, but converted to NumPy arrays: column names are lost,
# so columns must be addressed by integer position from here on.
x_data = df.iloc[:, :-1].to_numpy()
y_data = df.iloc[:, -1].to_numpy()

In [12]:
# Display the feature array; mixing the int and str columns gives dtype=object.
x_data

array([[10, 'A'],
       [20, 'B'],
       [30, 'A'],
       [40, 'C'],
       [50, 'B']], dtype=object)

In [13]:
# With NumPy input the categorical column is selected by integer position
# (index 1); every other column is passed through unchanged.
column_preprocessor = ColumnTransformer(
    transformers=[('enc', OrdinalEncoder(), [1])],
    remainder='passthrough',
)

# Preprocessing followed by a logistic-regression classifier.
model_cpipe = Pipeline(
    steps=[
        ('ct', column_preprocessor),
        ('logi', LogisticRegression(max_iter=500)),
    ]
)

model_cpipe.fit(x_data, y_data)

0,1,2
,steps,"[('ct', ...), ('logi', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('enc', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [14]:
# Predict for one row given in positional order: [numeric feature, category].
new_row = [[10, 'A']]
model_cpipe.predict(new_row)

array([1])

In [None]:
# Load the 'train' and 'test' sheets of the workbook with a single read_excel
# call so the file is opened and parsed only once. Passing a list of sheet
# names returns a dict of DataFrames keyed by sheet name.
file_path = '../data/hyundaiCar.xlsx'
sheets = pd.read_excel(file_path, sheet_name=['train', 'test'])
df_train = sheets['train']
df_test = sheets['test']


In [35]:
# In both sheets the first column is the regression target and the remaining
# columns are the features; everything is converted to NumPy arrays.
x_train, y_train = df_train.iloc[:, 1:].values, df_train.iloc[:, 0].values
x_test, y_test = df_test.iloc[:, 1:].values, df_test.iloc[:, 0].values

# Show the training targets.
y_train

array([ 1885,  2190,  1135,  1645,  1960,  3277,  3065,  1459,  2695,
        1492,  2080,  2595,  2250,  1915,  1104,  1960,  1960,  4650,
        3373,  5710,  1915,  2366,  1610,  2150,  2745,  3195,  3585,
        1270,  1270,  1845,  3024,  3990,  1542,  1850,  1560,  1630,
        1410,  1850,  2054,  1519,  1149,  1430,  1895,  3361,  2160,
        1955,  1104,  5463,  2340,  1149,  2845,  2495,  4190,  2430,
        1885,  1895,  3091,  2110,  2150,  4058,  2250,  3450,  1270,
        3838, 11150,  1164,  3802,  1270,  2430,  2870,  3254])

In [37]:
# Regression pipeline for the car data:
#   1. ordinal-encode the categorical feature columns, pass the rest through
#   2. standardize all resulting columns
#   3. fit a linear model with stochastic gradient descent

# Positions (in the feature array) of the columns to ordinal-encode.
# NOTE(review): judging by the sample row predicted later in the notebook,
# these hold string categories such as '준중형', '가솔린', '자동' — confirm
# against the spreadsheet.
categorical_columns = [1, 5, 9]

# One encoder per categorical column, named 'car1', 'car2', 'car3'.
transformers = [
    (f'car{position}', OrdinalEncoder(), [column])
    for position, column in enumerate(categorical_columns, start=1)
]

column_preprocessor = ColumnTransformer(transformers, remainder='passthrough')

model = Pipeline([
    ('ct', column_preprocessor),
    ('scale', StandardScaler()),
    # The step keeps its original name 'logi' for backward compatibility even
    # though it holds a regressor, not a logistic regression.
    # random_state pins the SGD shuffling so that the fitted coefficients, the
    # prediction and the score are reproducible across runs.
    ('logi', SGDRegressor(max_iter=500, random_state=42)),
])

In [38]:
# Fit the whole pipeline (encoding, scaling, regressor) on the training split.
model.fit(X=x_train, y=y_train)

0,1,2
,steps,"[('ct', ...), ('scale', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('car1', ...), ('car2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'squared_error'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,500
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [39]:
# Predict the target for one car; values are given in the same positional
# order as the feature columns used for training.
new_car = [[2015, '준중형', 11.8, 172, 21.0, '가솔린', 0, 1999, 1300, '자동']]
model.predict(new_car)

array([2370.89095465])

In [40]:
# Evaluate on the held-out test split; for a regressor the pipeline's score is
# the coefficient of determination (R^2).
model.score(X=x_test, y=y_test)

0.7586980825464115