In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the mobiles dataset that the Q1-Q3 cells below all start from.
df = pd.read_csv(filepath_or_buffer="02_Test_Data_Set/mobiles.csv")
# Preview the first two rows to check the column layout.
df.head(n=2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


## Q1.

In [8]:
# Q1 works on the sales column plus the hardware-spec columns only.
q1_columns = [
    "sales",
    "ROM",
    "RAM",
    "num_rear_camera",
    "num_front_camera",
    "battery_capacity",
]
df_q1 = df[q1_columns]
# Check dtypes and non-null counts of the selected columns.
df_q1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sales             430 non-null    float64
 1   ROM               430 non-null    int64  
 2   RAM               430 non-null    int64  
 3   num_rear_camera   430 non-null    int64  
 4   num_front_camera  430 non-null    int64  
 5   battery_capacity  430 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 20.3 KB


In [6]:
# Outlier cut-off for sales: mean plus two standard deviations
# (Series.std uses pandas' default ddof=1).
q1_sales = df_q1["sales"]
th = q1_sales.mean() + 2 * q1_sales.std()
th

146.5515012927322

In [7]:
# Keep only the rows whose sales exceed the outlier threshold `th`.
# .copy() makes df_outlier an independent frame, so the "performance" column
# that is added to it later is written to df_outlier itself and does not
# trigger pandas' SettingWithCopyWarning.
df_outlier = df_q1.loc[df_q1["sales"] > th].copy()
df_outlier.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 98 to 418
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sales             16 non-null     float64
 1   ROM               16 non-null     int64  
 2   RAM               16 non-null     int64  
 3   num_rear_camera   16 non-null     int64  
 4   num_front_camera  16 non-null     int64  
 5   battery_capacity  16 non-null     int64  
dtypes: float64(1), int64(5)
memory usage: 896.0 bytes


In [48]:
# Parentheses let one expression span several lines
# (ending each line with a backslash would work as well).
df_outlier["performance"] = (
    df_outlier["ROM"] / 32
    + df_outlier["RAM"] / 2
    + df_outlier["num_front_camera"]
    + df_outlier["num_rear_camera"]
    + df_outlier["battery_capacity"] / 1000
)
df_outlier["performance"]

98     14.00
110    17.50
158    10.00
159    10.00
193     8.50
198     9.00
243     8.50
381    12.00
382    12.00
395     8.60
396    10.00
402    11.00
404    12.00
407    12.00
412    11.00
418    10.08
Name: performance, dtype: float64

In [10]:
# Average performance score of the outlier rows, rounded to two decimals.
performance_mean = df_outlier["performance"].mean()
round(performance_mean, 2)

11.01

## Q2.

In [84]:
# Exclude phones that have exactly one rear camera and keep only the Q2 columns.
q2_columns = [
    "sales",
    "battery_capacity",
    "ratings",
    "num_of_ratings",
    "sales_price",
    "discount_percent",
]
df_q2 = df.loc[df["num_rear_camera"].ne(1), q2_columns]
df_q2.head(2)

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
1,1.39,2815,4.5,244,57149,0.04
4,5.15,2815,4.6,745,69149,0.02


In [81]:
# Pairwise Pearson correlation matrix of the Q2 columns.
df_q2.corr(method="pearson")

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
sales,1.0,0.02568,0.226075,0.949114,-0.24776,0.223471
battery_capacity,0.02568,1.0,-0.424129,0.034902,-0.503019,0.257373
ratings,0.226075,-0.424129,1.0,0.191655,0.151153,-0.118578
num_of_ratings,0.949114,0.034902,0.191655,1.0,-0.260279,0.212442
sales_price,-0.24776,-0.503019,0.151153,-0.260279,1.0,-0.09864
discount_percent,0.223471,0.257373,-0.118578,0.212442,-0.09864,1.0


In [82]:
# Correlation between sales and num_of_ratings, rounded to two decimals.
# It is computed from df_q2 rather than re-typed from the displayed table, so
# the value cannot go stale when the data or the row filter changes.
round(float(df_q2["sales"].corr(df_q2["num_of_ratings"])), 2)

0.95

## Q3.

In [20]:
# List the distinct screen_size categories, in order of first appearance.
pd.unique(df["screen_size"])

array(['Very Small', 'Small', 'Medium', 'Large', 'Very Large'],
      dtype=object)

In [41]:
# One-hot encode screen_size; every other column is passed through unchanged.
categorical_columns = ["screen_size"]
df_q3 = pd.get_dummies(df, columns=categorical_columns)
df_q3.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,64,2,1,1,1800,4.5,38645,32999,0.17,127.52,0,0,0,0,1
1,64,4,2,1,2815,4.5,244,57149,0.04,1.39,0,0,1,0,0


In [45]:
# Move the target column ("sales") to the front so that the features are
# exactly the columns from position 1 onward.
feature_columns = [col for col in df_q3.columns if col != "sales"]
df_q3 = df_q3[["sales"] + feature_columns]
df_q3.head(2)

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,127.52,64,2,1,1,1800,4.5,38645,32999,0.17,0,0,0,0,1
1,1.39,64,4,2,1,2815,4.5,244,57149,0.04,0,0,1,0,0


In [47]:
# Number of columns after one-hot encoding (target included).
df_q3.shape[1]

15

In [50]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
q3_split = train_test_split(df_q3, train_size=0.8, random_state=123)
df_q3_tr, df_q3_t = q3_split

In [64]:
# Fit the min-max scaler on the training rows only, then apply the same
# scaling to both splits. All columns are scaled, including the "sales" target.
model_nor = MinMaxScaler()
model_nor.fit(df_q3_tr)

# transform() returns a plain array, so the frames are rebuilt with the original
# column names and get a fresh 0..n-1 index.
df_q3_tr_nor = pd.DataFrame(
    data=model_nor.transform(df_q3_tr),
    columns=df_q3_tr.columns,
)
df_q3_t_nor = pd.DataFrame(
    data=model_nor.transform(df_q3_t),
    columns=df_q3_t.columns,
)

In [67]:
# Candidate neighbour counts: the odd values 3, 5, 7, 9, 11.
k_list = [*range(3, 12, 2)]
# Collects one RMSE per candidate k, in the same order as k_list.
rmse = list()

In [68]:
# Start from an empty list so that re-running this cell does not append
# duplicate entries to results left over from an earlier run.
rmse = []

# Features and target are the same for every k, so select them once.
# Selecting by name keeps this correct even if the column order changes.
X_tr = df_q3_tr_nor.drop(columns="sales")
y_tr = df_q3_tr_nor["sales"]
X_t = df_q3_t_nor.drop(columns="sales")
y_t = df_q3_t_nor["sales"]

for k in k_list:
    model = KNeighborsRegressor(n_neighbors=k).fit(X=X_tr, y=y_tr)
    pred = model.predict(X_t)
    # The target was min-max scaled too, so this RMSE is on the scaled sales scale.
    rmse.append(mean_squared_error(y_true=y_t, y_pred=pred) ** 0.5)
rmse

[0.08186677375964535,
 0.09879109824384892,
 0.107669855645971,
 0.11232111394853059,
 0.1136902366621185]

In [79]:
# The feature columns can also be selected with the drop method; the
# element-wise comparison is True wherever both selections agree.
df_q3_tr_nor.iloc[:, 1:].eq(df_q3_tr_nor.drop(columns="sales"))

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,True,True,True,True,True,True,True,True,True,True,True,True,True,True
340,True,True,True,True,True,True,True,True,True,True,True,True,True,True
341,True,True,True,True,True,True,True,True,True,True,True,True,True,True
342,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [78]:
# Pick the k whose RMSE is smallest (the first one wins on ties).
best_position = min(range(len(rmse)), key=rmse.__getitem__)
k_list[best_position]

3