In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw dataset from a path relative to the working directory and
# preview the first two rows. `df` is the shared source frame for Q1-Q3.
df = pd.read_csv("mobiles.csv")
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


### Q1.

In [3]:
# Q1 works on its own copy so the shared raw frame `df` stays untouched.
df_q1 = df.copy()

In [5]:
# Outlier cut-off for Q1: mean + 2 * sample standard deviation (ddof = 1,
# the pandas default) of `sales`. Both statistics come from one aggregation.
stat_mean, stat_std = df_q1["sales"].agg(["mean", "std"])
stat_out = stat_mean + 2 * stat_std
stat_out

146.55150129273218

In [6]:
# Keep only the rows whose sales exceed the outlier cut-off; the index is
# rebuilt so the subset is numbered from 0.
is_outlier = df_q1["sales"] > stat_out
df_q1_sub = df_q1[is_outlier].reset_index(drop = True)
len(df_q1_sub)

16

In [8]:
# Sanity check on the standard-deviation convention: per the output below,
# pandas `.std()` equals `np.std(..., ddof = 1)` (sample std), while plain
# `np.std(...)` gives a slightly smaller (population, ddof = 0) value.
df_q1["sales"].std(), np.std(df_q1["sales"]), np.std(df_q1["sales"], ddof = 1)

(58.39958785566842, 58.3316418316656, 58.39958785566842)

In [9]:
# Preview the high-sales subset.
df_q1_sub.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Medium,128,6,2,1,4000,4.6,122001,18999,0.09,231.79
1,Large,128,6,4,2,4500,4.5,267028,15999,0.2,427.22


In [10]:
# Composite spec index per phone: storage and memory are divided by 32 and 2,
# the camera counts are added as-is, and the battery capacity is divided by
# 1000 before everything is summed.
storage_term = df_q1_sub["ROM"] / 32 + df_q1_sub["RAM"] / 2
camera_term  = df_q1_sub["num_front_camera"] + df_q1_sub["num_rear_camera"]
battery_term = df_q1_sub["battery_capacity"] / 1000
df_q1_sub["idx"] = storage_term + camera_term + battery_term

In [11]:
# Q1 answer: mean spec index of the high-sales phones, to two decimals.
round(df_q1_sub["idx"].mean(), 2)

11.01

### Q2.

In [16]:
# Q2 frame: phones whose rear-camera count is not 1, restricted to the
# numeric columns used in the correlation analysis, with a fresh 0..n-1 index.
cols_q2 = ["sales", "battery_capacity", "ratings", "num_of_ratings",
           "sales_price", "discount_percent"]
df_q2 = (
    df.loc[df["num_rear_camera"] != 1, cols_q2]
      .reset_index(drop = True)
)
df_q2.head(2)

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
0,1.39,2815,4.5,244,57149,0.04
1,5.15,2815,4.6,745,69149,0.02


In [17]:
# Pairwise correlation matrix of the Q2 columns, rounded for readability.
df_q2.corr().round(2)

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
sales,1.0,0.03,0.23,0.95,-0.25,0.22
battery_capacity,0.03,1.0,-0.42,0.03,-0.5,0.26
ratings,0.23,-0.42,1.0,0.19,0.15,-0.12
num_of_ratings,0.95,0.03,0.19,1.0,-0.26,0.21
sales_price,-0.25,-0.5,0.15,-0.26,1.0,-0.1
discount_percent,0.22,0.26,-0.12,0.21,-0.1,1.0


In [19]:
# Naive column-wise maximum: every column reports 1.0 because the diagonal
# (each variable against itself) is included, so this is not informative.
df_q2.corr().round(2).max()

sales               1.0
battery_capacity    1.0
ratings             1.0
num_of_ratings      1.0
sales_price         1.0
discount_percent    1.0
dtype: float64

In [21]:
# Largest correlation of each column with any *other* column.
# The diagonal is masked by position rather than by value: replacing every
# 1 with NaN would also hide a genuine off-diagonal correlation that rounds
# to 1.00.
corr_q2 = df_q2.corr().round(2)
corr_q2.mask(np.eye(len(corr_q2), dtype = bool)).max()

sales               0.95
battery_capacity    0.26
ratings             0.23
num_of_ratings      0.95
sales_price         0.15
discount_percent    0.26
dtype: float64

In [24]:
# Reshape the (unrounded) correlation matrix to long format: one row per
# (index, variable) pair with its correlation in `value`.
df_corr = df_q2.corr().reset_index().melt(id_vars = "index")
df_corr

Unnamed: 0,index,variable,value
0,sales,sales,1.0
1,battery_capacity,sales,0.02568
2,ratings,sales,0.226075
3,num_of_ratings,sales,0.949114
4,sales_price,sales,-0.24776
5,discount_percent,sales,0.223471
6,sales,battery_capacity,0.02568
7,battery_capacity,battery_capacity,1.0
8,ratings,battery_capacity,-0.424129
9,num_of_ratings,battery_capacity,0.034902


In [25]:
# Drop the self-pairs (a variable correlated with itself), leaving only
# correlations between two different variables.
df_corr.loc[df_corr["index"] != df_corr["variable"], ]

Unnamed: 0,index,variable,value
1,battery_capacity,sales,0.02568
2,ratings,sales,0.226075
3,num_of_ratings,sales,0.949114
4,sales_price,sales,-0.24776
5,discount_percent,sales,0.223471
6,sales,battery_capacity,0.02568
8,ratings,battery_capacity,-0.424129
9,num_of_ratings,battery_capacity,0.034902
10,sales_price,battery_capacity,-0.503019
11,discount_percent,battery_capacity,0.257373


### Q3.

In [26]:
# Q3 works on its own copy of the raw frame.
df_q3 = df.copy()

In [27]:
# Preview the Q3 input before encoding.
df_q3.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


In [28]:
# One-hot encode the categorical column(s). Per the output below,
# `screen_size` becomes five `screen_size_*` indicator columns appended
# after the numeric columns.
df_q3_dum = pd.get_dummies(df_q3)
df_q3_dum.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,64,2,1,1,1800,4.5,38645,32999,0.17,127.52,0,0,0,0,1
1,64,4,2,1,2815,4.5,244,57149,0.04,1.39,0,0,1,0,0


In [29]:
# Confirm the encoded frame's size (rows, columns).
df_q3_dum.shape

(430, 15)

In [30]:
# Move the target column `sales` to position 0 so the scaled arrays can be
# split as column 0 = target, columns 1: = features. All other columns keep
# their relative order.
cols_sales_first = ["sales"] + [c for c in df_q3_dum.columns if c != "sales"]
df_q3_dum = df_q3_dum[cols_sales_first]

In [32]:
# 80/20 train/test split; the fixed random_state makes the split reproducible.
df_train, df_test = train_test_split(df_q3_dum, train_size = 0.8, random_state = 123)
len(df_train), len(df_test)

(344, 86)

In [33]:
# Min-max scaling learned from the training rows only, then applied to both
# splits. The whole frame is scaled, so the target `sales` (column 0) is
# normalised together with the features.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (
    model_nor.transform(part) for part in (df_train, df_test)
)

In [34]:
# Peek at the first scaled training row (column 0 is the scaled `sales`).
arr_train_nor[:1, ]

array([[0.00394753, 0.04761905, 0.18181818, 0.33333333, 0.        ,
        0.42307692, 0.625     , 0.00396262, 0.02607261, 0.09302326,
        0.        , 0.        , 1.        , 0.        , 0.        ]])

In [41]:
# Dry run of the KNN pipeline with the first candidate k, before looping
# over every candidate.
ls_k = [3, 5, 7, 9, 11]
k = ls_k[0]

# Column 0 of the scaled arrays is the target; the remaining columns are features.
X_train, y_train = arr_train_nor[:, 1:], arr_train_nor[:, 0]
X_test,  y_test  = arr_test_nor[:, 1:],  arr_test_nor[:, 0]

model_knn = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
pred = model_knn.predict(X_test)

# RMSE on the *scaled* target (not in original sales units).
val_rmse = mean_squared_error(y_test, pred) ** 0.5
val_rmse

0.08186677375964535

In [42]:
# Evaluate every candidate k and collect the test RMSE (on the scaled target)
# in the same order as `ls_k`.
ls_k = [3, 5, 7, 9, 11]

# Column 0 of the scaled arrays is the target; the remaining columns are features.
X_train, y_train = arr_train_nor[:, 1:], arr_train_nor[:, 0]
X_test,  y_test  = arr_test_nor[:, 1:],  arr_test_nor[:, 0]

ls_rmse = []
# NOTE: after the loop `k` and `model_knn` stay bound to the LAST candidate
# (k = 11), not to the best one.
for k in ls_k:
    model_knn = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    pred = model_knn.predict(X_test)
    val_rmse = mean_squared_error(y_test, pred) ** 0.5
    ls_rmse.append(val_rmse)

In [59]:
# `k` is the loop variable left over from the search loop: it holds the last
# candidate tried, not the best k.
k

11

In [44]:
# RMSE per candidate k, in the order of `ls_k`.
ls_rmse

In [46]:
# Label each RMSE with its k so the best k can be looked up by index.
ser_k = pd.Series(ls_rmse, index = ls_k)
ser_k.idxmin() # index label (k) of the minimum RMSE

3

In [47]:
# Full RMSE-by-k table; per the output below, the error grows as k increases.
ser_k

3     0.081867
5     0.098791
7     0.107670
9     0.112321
11    0.113690
dtype: float64

다음은 지난달에 신규 출시된 경쟁사의 스마트폰 정보이다. 해당 스마트폰의 판매지수는 얼마로 예상되는가?  
※ 정규화되지 않은 값을 소수점 첫째 자리까지 반올림하여 출력하시오.  
※ KNN 모델을 사용하며 이웃 개수는 직전에 최적이라고 판단한 k값을 사용하시오.  
* ROM: 256
* RAM: 6
* num_rear_camera: 4
* num_front_camera: 1
* battery_capacity: 4000
* ratings: 4.3
* sales_price: 85000
* discount_percent: 0.05
* screen_size: "Large"

In [50]:
# Scratch attempt at building the new phone's row by hand.
# The all-scalar form below is kept commented out: pandas cannot infer an
# index when every value is a scalar.
# df_test1 = pd.DataFrame(dict(ROM = 256,
#                              RAM = 6))
# Wrapping each value in a one-element list yields a single-row frame.
# This partial frame is superseded by the full row built further down.
df_test1 = pd.DataFrame(dict(ROM = [256],
                             RAM = [6]))
df_test1 

Unnamed: 0,ROM,RAM
0,256,6


In [56]:
from sklearn.preprocessing import OneHotEncoder

In [57]:
# Build the single-row frame for the new competitor phone.
# One real test row is used purely as a template so the result has exactly
# the column order the fitted scaler expects.
new_phone = {
    "ROM": 256,
    "RAM": 6,
    "num_rear_camera": 4,
    "num_front_camera": 1,
    "battery_capacity": 4000,
    "ratings": 4.3,
    "sales_price": 85000,
    "discount_percent": 0.05,
    # screen_size == "Large": set EVERY dummy explicitly so nothing is
    # silently inherited from whichever row happens to be first in df_test.
    "screen_size_Large": 1,
    "screen_size_Medium": 0,
    "screen_size_Small": 0,
    "screen_size_Very Large": 0,
    "screen_size_Very Small": 0,
}
# `sales` is a placeholder here (it is the value being predicted).
# NOTE(review): the task statement gives no `num_of_ratings`, so the template
# row's value is still fed to the model — confirm whether this feature
# should be excluded from training instead.
df_test1 = df_test.head(1).reset_index(drop = True).assign(**new_phone)
df_test1

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,745,85000,0.05,1,0,0,0,0


In [58]:
# Scale the new phone with the scaler fitted on the training data.
# Column 0 (`sales`) is only a placeholder at this point.
arr_test1_nor = model_nor.transform(df_test1)
arr_test1_nor

array([[0.0119438 , 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.00157358, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [61]:
# Caution: `model_knn` here is whatever the search loop left behind, i.e. the
# model for the last candidate, not the best one.
model_knn.predict(arr_test1_nor[:, 1:]) # k = 11

array([0.00429167])

In [62]:
# Refit with the k that gave the lowest test RMSE. It is taken from `ser_k`
# rather than hard-coded, so it stays correct if the data, split, or
# candidate list change.
best_k = int(ser_k.idxmin())
model_knn = KNeighborsRegressor(n_neighbors = best_k)
model_knn.fit(X = arr_train_nor[:, 1:],
              y = arr_train_nor[:, 0 ])
# Prediction is on the scaled target; it is converted back to sales units later.
pred = model_knn.predict(arr_test1_nor[:, 1:])
pred

array([0.00132259])

In [65]:
# The scaler was fitted on the full 15-column frame, so it cannot
# inverse-transform the bare prediction. Instead the scaled prediction is
# written into the target slot (column 0) of the scaled row, and the whole
# row is inverse-transformed afterwards.
# `pred` is a 1-element array: index it explicitly, because assigning an
# ndim > 0 array to a scalar slot is deprecated in NumPy >= 1.25.
arr_test1_nor[0, 0] = pred[0]
arr_test1_nor

array([[0.00132259, 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.00157358, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [67]:
# Undo the scaling for the whole row and read back column 0: the predicted
# sales index in original units, rounded to one decimal as the task requires.
round(model_nor.inverse_transform(arr_test1_nor)[0, 0], 1)

0.7

In [68]:
# Scratch: inspect the bound method (no call is made here).
model_nor.inverse_transform

<bound method MinMaxScaler.inverse_transform of MinMaxScaler()>

In [69]:
# Record the scikit-learn version used for this run.
import sklearn
sklearn.__version__

'1.2.1'

In [70]:
# Scratch: `np.r_` with a slice builds the integer range 0..14 (one value per
# column of the 15-column frame).
np.r_[:15]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])