In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("mobiles.csv")
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


### Q1.

In [4]:
stat_mean = df["sales"].mean()
stat_std  = df["sales"].std()
stat_out  = stat_mean + 2 * stat_std
stat_out

146.55150129273218

In [None]:
df_q1 = df.loc[df["sales"] > stat_out, ].reset_index(drop = True)
df_q1.head(2)

In [7]:
len(df_q1)

16

In [8]:
df_q1["idx"] = (df_q1["ROM"] / 32) + (df_q1["RAM"] / 2) +\
(df_q1["num_front_camera"] + df_q1["num_rear_camera"]) + (df_q1["battery_capacity"] / 1000)

In [9]:
df_q1["idx"].mean()

11.01125

### Q2.

In [16]:
df["num_rear_camera"].unique()

array([1, 2, 3, 4], dtype=int64)

In [15]:
df_q2 = df.loc[df["num_rear_camera"] != 1, "battery_capacity":]
df_q2.head(2)

Unnamed: 0,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
1,2815,4.5,244,57149,0.04,1.39
4,2815,4.6,745,69149,0.02,5.15


In [22]:
df_q2.corr().abs().round(2)["sales"][:-1].max()

0.95

In [24]:
df_corr = df_q2.corr()
df_corr

Unnamed: 0,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
battery_capacity,1.0,-0.424129,0.034902,-0.503019,0.257373,0.02568
ratings,-0.424129,1.0,0.191655,0.151153,-0.118578,0.226075
num_of_ratings,0.034902,0.191655,1.0,-0.260279,0.212442,0.949114
sales_price,-0.503019,0.151153,-0.260279,1.0,-0.09864,-0.24776
discount_percent,0.257373,-0.118578,0.212442,-0.09864,1.0,0.223471
sales,0.02568,0.226075,0.949114,-0.24776,0.223471,1.0


In [26]:
df_corr.replace(1, np.nan).max()

battery_capacity    0.257373
ratings             0.226075
num_of_ratings      0.949114
sales_price         0.151153
discount_percent    0.257373
sales               0.949114
dtype: float64

In [30]:
df_corr_melt = df_corr.reset_index().melt(id_vars = "index")
df_corr_melt.loc[df_corr_melt["index"] != df_corr_melt["variable"], ].pivot_table(index = "index",
                                                                                  columns = "variable",
                                                                                  values = "value")

variable,battery_capacity,discount_percent,num_of_ratings,ratings,sales,sales_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
battery_capacity,,0.257373,0.034902,-0.424129,0.02568,-0.503019
discount_percent,0.257373,,0.212442,-0.118578,0.223471,-0.09864
num_of_ratings,0.034902,0.212442,,0.191655,0.949114,-0.260279
ratings,-0.424129,-0.118578,0.191655,,0.226075,0.151153
sales,0.02568,0.223471,0.949114,0.226075,,-0.24776
sales_price,-0.503019,-0.09864,-0.260279,0.151153,-0.24776,


In [31]:
print(round(0.949114, 2))

0.95


### Q3.

In [32]:
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


In [39]:
# df_dum = pd.get_dummies(df, columns = ["screen_size"]) # 시험버전
df_dum = pd.get_dummies(df, columns = ["screen_size"], dtype = "int") # 2.0.0 이상 버전
df_dum = df_dum.set_index("sales").reset_index()
df_dum.head(2)

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,127.52,64,2,1,1,1800,4.5,38645,32999,0.17,0,0,0,0,1
1,1.39,64,4,2,1,2815,4.5,244,57149,0.04,0,0,1,0,0


In [38]:
df_dum.columns.str.replace(" ", "_")

Index(['ROM', 'RAM', 'num_rear_camera', 'num_front_camera', 'battery_capacity',
       'ratings', 'num_of_ratings', 'sales_price', 'discount_percent', 'sales',
       'screen_size_Large', 'screen_size_Medium', 'screen_size_Small',
       'screen_size_Very_Large', 'screen_size_Very_Small'],
      dtype='object')

`pd.get_dummies()`는 다른 메서드/함수/클래스와 다르게 "columns" 인자에 단일 값을 할당하는 경우에도 반드시 리스트 객체를 사용하여 할당해야 한다. 단순 문자열을 할당할 경우 에러가 난다.

그리고 원핫인코딩을 실시할 때 변수명에 띄어쓰기가 있을 수 있는데 `statsmodels` 라이브러리 기반 모델링을 하면서 formula 를 사용하는 경우 변수명에 띄어쓰기를 제거하지 않은 채로 formula를 작성하면 반드시 에러가 발생함. 그리고 이 이슈는 이전 시험에서 응시자가 어려움을 겪은 사례가 있음.  
※ 다음의 코드 결과에서는 "screen_size_Very Large"

In [40]:
df_dum.shape

(430, 15)

In [41]:
df_train, df_test = train_test_split(df_dum, train_size = 0.8, random_state = 123)
len(df_train), len(df_test)

(344, 86)

In [None]:
model_nor = MinMaxScaler().fit(df_train)
arr_train_nor = model_nor.transform(df_train)
arr_test_nor  = model_nor.transform(df_test)

In [43]:
arr_train_nor[:1, ]

array([[0.00394753, 0.04761905, 0.18181818, 0.33333333, 0.        ,
        0.42307692, 0.625     , 0.00396262, 0.02607261, 0.09302326,
        0.        , 0.        , 1.        , 0.        , 0.        ]])

In [48]:
ls_k = [3, 5, 7, 9, 11]
k = ls_k[0]

model_knn = KNeighborsRegressor(n_neighbors = k)
model_knn.fit(X = arr_train_nor[:, 1:],
              y = arr_train_nor[:, 0])
pred = model_knn.predict(arr_test_nor[:, 1:])

val_rmse = mean_squared_error(y_true = arr_test_nor[:, 0], y_pred = pred) ** 0.5
val_rmse

0.08186677375964535

In [49]:
ls_k = [3, 5, 7, 9, 11]
k = ls_k[0]

ls_rmse = []
for k in ls_k:
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = arr_train_nor[:, 1:],
                  y = arr_train_nor[:, 0])
    pred = model_knn.predict(arr_test_nor[:, 1:])

    val_rmse = mean_squared_error(y_true = arr_test_nor[:, 0], y_pred = pred) ** 0.5
    ls_rmse = ls_rmse + [val_rmse]

In [51]:
ser_rmse = pd.Series(ls_rmse, index = ls_k)
ser_rmse

3     0.081867
5     0.098791
7     0.107670
9     0.112321
11    0.113690
dtype: float64

In [53]:
best_k = ser_rmse.idxmin()
best_k

3

### Q3. 추가 지시사항
다음은 저번달에 신규 출시된 경쟁사의 스마트폰 정보이다. 해당 스마트폰의 판매지수는 얼마로 예상되는가?  
※ 정규화 되지 않은 값으로 반올림하여 소수점 첫째 자리까지 출력하시오  
※ KNN 모델을 사용하며 이웃 개수는 직전에 최적이라고 판단한 k값을 사용하시오.  
※ 정답은 반올림하여 소수 둘째 자리까지 출력하시오.  
(정답 예시: 0.12)
* ROM: 256
* RAM: 6
* num_rear_camera: 4
* num_front_camera: 1
* battery_capacity: 4000
* ratings: 4.3
* num_of_ratings: 25000
* sales_price: 85000
* discount_percent: 0.05
* screen_size: "Large"


https://datadoctorblog.com/2023/07/30/Py-ML-One-Hot-Encoding/#sklearn-1

In [58]:
# df_t1 = pd.DataFrame(dict(ROM = 256, ~~~))
df_t1 = df_test.head(1).reset_index(drop = True)
df_t1["RAM"] = 6
df_t1["num_rear_camera"] = 4
df_t1["battery_capacity"] = 4000
df_t1["ratings"] = 4.3
df_t1["num_of_ratings"] = 25000
df_t1["sales_price"] = 85000
df_t1["discount_percent"] = 0.05
df_t1["screen_size_Large"] = 1
df_t1["screen_size_Medium"] = 0

In [59]:
df_t1

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,25000,85000,0.05,1,0,0,0,0


In [64]:
arr_t1_nor = model_nor.transform(df_t1)
arr_t1_nor

array([[0.0119438 , 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.05308122, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [54]:
model_knn.kneighbors

<bound method KNeighborsMixin.kneighbors of KNeighborsRegressor(n_neighbors=11)>

In [65]:
model_knn_b = KNeighborsRegressor(n_neighbors = best_k)
model_knn_b.fit(X = arr_train_nor[:, 1:],
                y = arr_train_nor[:, 0])
pred_t1 = model_knn_b.predict(arr_t1_nor[:, 1:])
pred_t1

array([0.00132259])

In [69]:
# model_nor.inverse_transform(pred_t1) # Error!!!
arr_t1_nor[0, 0] = pred_t1
arr_t1_nor

array([[0.00132259, 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.05308122, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [70]:
arr_t1_inv = model_nor.inverse_transform(arr_t1_nor)
arr_t1_inv

array([[6.53333333e-01, 2.56000000e+02, 6.00000000e+00, 4.00000000e+00,
        1.00000000e+00, 4.00000000e+03, 4.30000000e+00, 2.50000000e+04,
        8.50000000e+04, 5.00000000e-02, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [71]:
df_t1_inv = pd.DataFrame(arr_t1_inv, columns = df_t1.columns)
df_t1_inv

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,0.653333,256.0,6.0,4.0,1.0,4000.0,4.3,25000.0,85000.0,0.05,1.0,0.0,0.0,0.0,0.0


In [76]:
round(df_t1_inv["sales"][0], 2)

0.65