# 자동차 연료 소비 데이터

## 로드 및 전처리

In [1]:
import pandas as pd
import numpy as np

In [3]:
x_test = pd.read_csv("data/mpg_X_test.csv")
x_train = pd.read_csv("data//mpg_X_train.csv")
y_train = pd.read_csv("data//mpg_y_train.csv")

In [4]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          278 non-null    object 
 1   mpg           278 non-null    float64
 2   cylinders     278 non-null    int64  
 3   displacement  278 non-null    float64
 4   horsepower    274 non-null    float64
 5   weight        278 non-null    int64  
 6   acceleration  278 non-null    float64
 7   model_year    278 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 17.5+ KB


In [5]:
x_train.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,278.0,278.0,278.0,274.0,278.0,278.0,278.0
mean,23.732734,5.374101,189.994604,103.383212,2948.464029,15.580216,76.057554
std,7.647295,1.677084,105.471423,38.977911,862.949746,2.745907,3.605591
min,10.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,18.0,4.0,98.0,75.0,2206.25,14.0,73.0
50%,23.0,4.0,140.5,90.0,2737.5,15.5,76.0
75%,29.0,6.0,258.0,120.0,3560.0,17.0,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
x_train[['horsepower']] = imputer.fit_transform(x_train[['horsepower']])
x_test[['horsepower']] = imputer.fit_transform(x_test[['horsepower']])

In [9]:
x_train.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,278.0,278.0,278.0,278.0,278.0,278.0,278.0
mean,23.732734,5.374101,189.994604,103.383212,2948.464029,15.580216,76.057554
std,7.647295,1.677084,105.471423,38.695458,862.949746,2.745907,3.605591
min,10.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,18.0,4.0,98.0,75.0,2206.25,14.0,73.0
50%,23.0,4.0,140.5,90.5,2737.5,15.5,76.0
75%,29.0,6.0,258.0,118.75,3560.0,17.0,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [10]:
COL_DEL = ['name']
COL_NUM = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
COL_CAT = []
COL_Y = ['isUSA']

x_train = x_train.iloc[:, 1:]
x_test = x_test.iloc[:, 1:]

In [11]:
x_train

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,31.0,4,112.0,85.0,2575,16.2,82
1,13.0,8,400.0,175.0,5140,12.0,71
2,37.0,4,91.0,68.0,2025,18.2,82
3,12.0,8,350.0,180.0,4499,12.5,73
4,19.0,4,120.0,88.0,3270,21.9,76
...,...,...,...,...,...,...,...
273,36.1,4,91.0,60.0,1800,16.4,78
274,30.0,4,97.0,67.0,1985,16.4,77
275,33.5,4,98.0,83.0,2075,15.9,77
276,18.0,6,171.0,97.0,2984,14.5,75


## 모형 구축 및 평가

In [12]:
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=42)

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_tr)

In [14]:
x_tr = scaler.transform(x_tr)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

In [17]:
from sklearn.neighbors import KNeighborsClassifier

modelKNN = KNeighborsClassifier(n_neighbors = 5, metric='euclidean')
modelKNN.fit(x_tr, y_tr)

  return self._fit(X, y)


In [18]:
from sklearn.tree import DecisionTreeClassifier
modelDT = DecisionTreeClassifier(max_depth=10)
modelDT.fit(x_tr, y_tr)

In [19]:
y_val_pred = modelKNN.predict(x_val)

y_val_pred_probaKNN = modelKNN.predict_proba(x_val)
y_val_pred_probaDT = modelDT.predict_proba(x_val)

In [26]:
from sklearn.metrics import roc_auc_score

scoreKNN = roc_auc_score(y_val, y_val_pred_probaKNN[:,1])
scoreDT = roc_auc_score(y_val, y_val_pred_probaDT[:,1])

print(scoreKNN, scoreDT)

0.9346064814814815 0.8159722222222223


In [28]:
best_model = None
best_score = 0 

for i in range(2,20):
    model = KNeighborsClassifier(n_neighbors = i, metric='euclidean')
    model.fit(x_tr, y_tr)
    y_val_pred_proba = model.predict_proba(x_val)
    score = roc_auc_score(y_val, y_val_pred_proba[:,1])
    print(i,"개의 이웃 : ", score)
    if best_score <= score:
        best_model = model

2 개의 이웃 :  0.8634259259259259
3 개의 이웃 :  0.8900462962962963
4 개의 이웃 :  0.9221643518518519
5 개의 이웃 :  0.9346064814814815
6 개의 이웃 :  0.933449074074074
7 개의 이웃 :  0.9293981481481481
8 개의 이웃 :  0.9302662037037037
9 개의 이웃 :  0.9337384259259259
10 개의 이웃 :  0.9320023148148148
11 개의 이웃 :  0.9409722222222222
12 개의 이웃 :  0.9406828703703702
13 개의 이웃 :  0.939236111111111
14 개의 이웃 :  0.937210648148148
15 개의 이웃 :  0.9432870370370369
16 개의 이웃 :  0.9441550925925926
17 개의 이웃 :  0.9412615740740741
18 개의 이웃 :  0.9357638888888888
19 개의 이웃 :  0.9302662037037036


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [29]:
pred = best_model.predict_proba(x_test)[:,1]
pred

array([0.31578947, 0.68421053, 1.        , 0.94736842, 0.26315789,
       0.31578947, 0.52631579, 0.36842105, 0.31578947, 0.52631579,
       0.89473684, 0.52631579, 1.        , 0.57894737, 0.36842105,
       0.36842105, 1.        , 1.        , 0.31578947, 1.        ,
       1.        , 0.89473684, 1.        , 1.        , 1.        ,
       0.42105263, 0.26315789, 1.        , 1.        , 0.42105263,
       0.26315789, 0.21052632, 0.52631579, 0.73684211, 0.94736842,
       0.57894737, 0.68421053, 0.94736842, 0.05263158, 0.73684211,
       1.        , 0.26315789, 0.15789474, 0.47368421, 0.36842105,
       0.47368421, 0.52631579, 1.        , 0.36842105, 1.        ,
       0.15789474, 0.21052632, 0.89473684, 0.26315789, 1.        ,
       1.        , 0.94736842, 0.63157895, 0.94736842, 0.47368421,
       0.78947368, 0.78947368, 1.        , 0.94736842, 0.63157895,
       0.89473684, 1.        , 0.31578947, 0.73684211, 0.21052632,
       0.94736842, 1.        , 0.63157895, 0.26315789, 1.     

In [30]:
pd.DataFrame({'isUSA':pred}).to_csv('submit/004000000.csv', index = False)

# 펭귄

## 데이터 로드 및 전처리

In [32]:
import pandas as pd

x_test = pd.read_csv('data/penguin_X_test.csv')
x_train = pd.read_csv('data/penguin_X_train.csv')
y_train = pd.read_csv('data/penguin_y_train.csv')

In [33]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB


In [34]:
train = pd.concat([x_train, y_train], axis = 1)
train = train.dropna()
train.reset_index(drop = True, inplace=True)
train.head()

Unnamed: 0,species,island,sex,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Gentoo,Biscoe,FEMALE,43.5,15.2,213.0,4650.0
1,Adelie,Torgersen,MALE,42.8,18.5,195.0,4250.0
2,Chinstrap,Dream,MALE,53.5,19.9,205.0,4500.0
3,Gentoo,Biscoe,MALE,50.2,14.3,218.0,5700.0
4,Adelie,Dream,FEMALE,36.5,18.0,182.0,3150.0


In [35]:
x_train = train[['species','island', 'sex','bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']]
y_train = train[['body_mass_g']]

In [36]:
x_train.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm
count,232.0,232.0,232.0
mean,43.990948,17.226293,200.681034
std,5.50976,1.964677,14.064231
min,32.1,13.2,172.0
25%,39.2,15.7,190.0
50%,44.95,17.35,197.0
75%,48.775,18.725,212.25
max,58.0,21.5,231.0


In [37]:
COL_DEL = []
COL_NUM = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']
COL_CAT = ['species','island', 'sex']
COL_Y = ['body_mass_g']

In [44]:
x = pd.concat([x_train, x_test])

Unnamed: 0,species,island,sex
0,Gentoo,Biscoe,FEMALE
1,Adelie,Torgersen,MALE
2,Chinstrap,Dream,MALE
3,Gentoo,Biscoe,MALE
4,Adelie,Dream,FEMALE
...,...,...,...
96,Gentoo,Biscoe,MALE
97,Adelie,Torgersen,FEMALE
98,Gentoo,Biscoe,FEMALE
99,Gentoo,Biscoe,MALE


In [46]:
x = pd.concat([x_train, x_test])

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(x[COL_CAT])

x_train_res = ohe.transform(x_train[COL_CAT])
x_test_res = ohe.transform(x_test[COL_CAT])

In [49]:
x_train_ohe = pd.DataFrame(x_train_res.todense(), columns = ohe.get_feature_names_out())
x_test_ohe = pd.DataFrame(x_test_res.todense(), columns = ohe.get_feature_names_out())
print(x_train_ohe)

x_train_fin = pd.concat([x_train[COL_NUM], x_train_ohe], axis = 1)
x_test_fin = pd.concat([x_test[COL_NUM], x_test_ohe], axis = 1)

     species_Adelie  species_Chinstrap  species_Gentoo  island_Biscoe  \
0               0.0                0.0             1.0            1.0   
1               1.0                0.0             0.0            0.0   
2               0.0                1.0             0.0            0.0   
3               0.0                0.0             1.0            1.0   
4               1.0                0.0             0.0            0.0   
..              ...                ...             ...            ...   
227             0.0                1.0             0.0            0.0   
228             0.0                0.0             1.0            1.0   
229             1.0                0.0             0.0            0.0   
230             0.0                1.0             0.0            0.0   
231             0.0                0.0             1.0            1.0   

     island_Dream  island_Torgersen  sex_FEMALE  sex_MALE  
0             0.0               0.0         1.0       0.0  
1  

## 모델링

In [51]:
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(x_train_fin, y_train, test_size = 0.3)

In [52]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_tr[COL_NUM])
x_tr[COL_NUM]=scaler.transform(x_tr[COL_NUM])
x_val[COL_NUM]=scaler.transform(x_val[COL_NUM])
x_test_fin[COL_NUM]=scaler.transform(x_test_fin[COL_NUM])

from sklearn.linear_model import LinearRegression

modelLR = LinearRegression()
modelLR.fit(x_tr, y_tr)

y_val_pred = modelLR.predict(x_val)
print(y_val_pred)

[[4168.69498094]
 [4147.21395049]
 [4680.27379749]
 [3335.56424088]
 [4683.58336208]
 [4673.82263396]
 [5260.67001749]
 [3384.01897063]
 [4050.62306994]
 [4313.23496717]
 [4134.85592836]
 [4002.25384923]
 [4644.45899304]
 [4064.35035116]
 [3687.02105921]
 [5195.48888568]
 [4095.37900167]
 [3295.8296245 ]
 [4152.72260231]
 [4777.1856231 ]
 [5241.34436101]
 [5714.43002972]
 [3545.38113597]
 [3498.16371526]
 [3315.06186609]
 [4728.26608458]
 [4004.72882672]
 [4243.10670914]
 [4071.49852499]
 [4257.46703823]
 [3594.22876609]
 [5159.00686543]
 [4256.56329439]
 [4132.23646829]
 [3117.25082103]
 [5576.33651946]
 [5499.50767569]
 [4829.06622107]
 [5741.52137418]
 [4664.75695574]
 [3113.03421067]
 [3801.53711443]
 [4327.41010992]
 [4747.49698463]
 [3689.96351234]
 [4187.06286542]
 [3999.28784441]
 [3372.97745293]
 [4005.8711907 ]
 [4797.45379129]
 [3296.92916811]
 [4922.62775382]
 [4795.6419771 ]
 [3402.02014879]
 [3312.50691929]
 [4608.00048039]
 [4077.50408041]
 [3576.32667029]
 [3478.024126 

In [53]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_val, y_val_pred)
rmse = mean_squared_error(y_val, y_val_pred, squared = False)

print(mse, rmse)

90483.84672404586 300.8053302786469


In [58]:
y_pred = modelLR.predict(x_test_fin)
pd.DataFrame({'body_mass_g':y_pred[:,0]}).to_csv('submit/004000001.csv', index = False)

# Census income

## 데이터 로드 및 전처리

In [59]:
import pandas as pd
import numpy as np

In [60]:
x_train = pd.read_csv('data/census_X_train.csv')
y_train = pd.read_csv('data/census_y_train.csv')
x_test = pd.read_csv('data/census_X_test.csv')

In [61]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   education_num   30162 non-null  int64 
 3   marital_status  30162 non-null  object
 4   occupation      30162 non-null  object
 5   relationship    30162 non-null  object
 6   race            30162 non-null  object
 7   sex             30162 non-null  object
 8   capital_gain    30162 non-null  int64 
 9   capital_loss    30162 non-null  int64 
 10  hours_per_week  30162 non-null  int64 
 11  native_country  30162 non-null  object
dtypes: int64(5), object(7)
memory usage: 2.8+ MB


In [62]:
x_train.describe()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
count,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,10.121312,1092.007858,88.372489,40.931238
std,13.134665,2.549995,7406.346497,404.29837,11.979984
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [63]:
x_train.isnull().sum()

age               0
workclass         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
dtype: int64

In [64]:
x_train['capital_gain'].quantile([q/20 for q in range(15,21)])

0.75        0.0
0.80        0.0
0.85        0.0
0.90        0.0
0.95     5013.0
1.00    99999.0
Name: capital_gain, dtype: float64

In [65]:
x_train['capital_loss'].quantile([q/20 for q in range(15,21)])

0.75       0.0
0.80       0.0
0.85       0.0
0.90       0.0
0.95       0.0
1.00    4356.0
Name: capital_loss, dtype: float64

In [66]:
x_train.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [67]:
x_train['capital_gain_yn'] = np.where(x_train['capital_gain']>0,1,0)
x_train['capital_loss_yn'] = np.where(x_train['capital_loss']>0,1,0)

x_test['capital_gain_yn'] = np.where(x_test['capital_gain']>0, 1, 0)
x_test['capital_loss_yn'] = np.where(x_test['capital_loss']>0, 1, 0)

In [68]:
COL_DEL = []
COL_NUM = ['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss']
COL_CAT = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'capital_gain_yn', 'capital_loss_yn']
COL_Y = ['target']

In [69]:
from sklearn.preprocessing import LabelEncoder

x = pd.concat([x_train, x_test])

for col in COL_CAT:
    le = LabelEncoder()
    le.fit(x[col])
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

In [70]:
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.3, stratify = y_train)

In [72]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_tr[COL_NUM] = scaler.fit_transform(x_tr[COL_NUM])
x_val[COL_NUM] = scaler.fit_transform(x_val[COL_NUM])
x_test[COL_NUM] = scaler.fit_transform(x_test[COL_NUM])

In [73]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(x_tr, y_tr)

  model_rf.fit(x_tr, y_tr)


In [76]:
from xgboost import XGBClassifier

model_xgb1 = XGBClassifier(n_estimators = 1000, learning_rate = 0.01, max_depth = 10)
model_xgb1.fit(x_tr,y_tr)

In [77]:
from sklearn.metrics import roc_auc_score

y_pred_rf = model_rf.predict_proba(x_val)
y_pred_xgb = model_xgb1.predict_proba(x_val)

score_rf = roc_auc_score(y_val, y_pred_rf[:,1])
score_xgb = roc_auc_score(y_val, y_pred_xgb[:,1])

print(score_rf, score_xgb)

0.8902789170648111 0.9130410880257224


In [79]:
pred = model_xgb1.predict(x_test)
print(pred)

[0 0 0 ... 1 0 1]


In [85]:
pd.DataFrame({'target':pred}).to_csv('submit/004000002.csv', index = False)