In [211]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
gsm = pd.read_csv("gsm.csv", index_col =0).reset_index()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
gsm.head()

Unnamed: 0,oem,model,network_technology,network_2g_bands,network_gprs,network_edge,launch_announced,launch_status,body_dimensions,body_weight,...,main_camera_dual_or_triple,battery_music_play,selfie_camera_triple,main_camera_v1,selfie_camera,camera,main_camera,network,battery_talk_time,battery_stand.by
0,Benefon,Vega,GSM,GSM 900,No,No,1999,Discontinued,145 x 56 x 23 mm (5.71 x 2.20 x 0.91 in),190 g (6.70 oz),...,,,,,,No,,,4 - 10 h,3 - 6 days
1,Garmin-Asus,nuvifone M10,GSM / HSPA,GSM 900 / 1800 / 1900,,,"2010, January. Released 2010, March",Discontinued,-,-,...,,,,,V2,,,GSM 850 / 1800 / 1900 - US version,Up to 8 h,Up to 600 h (2G) / Up to 600 h (3G)
2,Gigabyte,GSmart G1305 Boston,GSM / HSPA,GSM 850 / 900 / 1800 / 1900,,,"2010, April. Released 2010, April",Discontinued,116 x 56.8 x 12.4 mm (4.57 x 2.24 x 0.49 in),118 g (4.16 oz),...,,,,,V2,,,,Up to 7 h 10 min,Up to 410 h
3,Gigabyte,GSmart,GSM / HSPA,GSM 900 / 1800,,,Not officially announced yet,Cancelled,103 x 54 x 13.4 mm (4.06 x 2.13 x 0.53 in),-,...,,,,,V2,,,,,
4,Google,Pixel 4 XL,GSM / CDMA / HSPA / EVDO / LTE,GSM 850 / 900 / 1800 / 1900,,,"2019, October 15","Available. Released 2019, October 22",160.4 x 75.1 x 8.2 mm (6.31 x 2.96 x 0.32 in),193 g (6.81 oz),...,,,,,,,,CDMA 800 / 1900,,


In [4]:
gsm.shape

(10679, 86)

In [5]:
gsm.columns

Index(['oem', 'model', 'network_technology', 'network_2g_bands',
       'network_gprs', 'network_edge', 'launch_announced', 'launch_status',
       'body_dimensions', 'body_weight', 'body_sim', 'display_type',
       'display_size', 'display_resolution', 'display', 'memory_card_slot',
       'memory_phonebook', 'memory_call_records', 'sound_loudspeaker',
       'sound_alert_types', 'sound_3.5mm_jack', 'comms_wlan',
       'comms_bluetooth', 'comms_gps', 'comms_radio', 'comms_usb',
       'features_sensors', 'features_messaging', 'features_browser',
       'features_clock', 'features_alarm', 'features_games', 'features_java',
       'features', 'misc_colors', 'network_3g_bands', 'network_speed',
       'platform_os', 'platform_chipset', 'platform_cpu', 'platform_gpu',
       'memory_internal', 'main_camera_single', 'main_camera_video',
       'misc_price', 'main_camera_features', 'body', 'network_4g_bands',
       'body_build', 'display_protection', 'memory', 'main_camera_dual',
       

In [6]:
gsm["battery_talk_time"].value_counts()

Up to 5 h                                         558
Up to 4 h                                         546
Up to 3 h                                         514
Up to 6 h                                         337
Up to 7 h                                         250
                                                 ... 
Up to 50 h (2G) / Up to 18 h (3G)                   1
Up to 6 h 40 min (2G) / Up to 5 h 40 min (3G)       1
Up to 11 h 30 min (2G) / Up to 6 h 50 min (3G)      1
Up to 4 h 30 min (2G) / Up to 4 h (3G)              1
Up to 25 h (2G) / Up to 16 h 30 min (3G)            1
Name: battery_talk_time, Length: 1225, dtype: int64

In [7]:
gsm["main_camera_dual_or_triple"].isnull().sum()

10676

## Important variables
### One-hot
1. oem
2. network_technology
3. platform_os
4. main_camera_video

### Flag
1. network_gprs (yes vs no)
2. network_edge (yes vs no)
3. launch_status (discontinued vs avialable)
4. body_sim (dual vs single)
5. sound_loudspeaker (yes vs no)
6. comms_wlan (yes vs no)
7. comms_bluetooth (yes vs no)
8. comms_radio (yes vs no)
9. comms_gps (yes vs no)
10. network_2g_bands (value vs null)
11. network_3g_bands (value vs null)
12. network_4g_bands (value vs null)
13. network_5g_bands (value vs null)
14. main_camera_five/quad/triple (yes vs no)

### Numeric
1. launch_announced
2. body_dimensions
3. body_weight
4. display_resolution
5. memory_internal (drop)
6. main_camera_single (drop)
7. battery (drop)



#### Target var 
- misc_price


### Flag variables

In [8]:
gsm1 = gsm.copy()

In [9]:
flag_v = pd.DataFrame()

In [10]:
gsm1["network_gprs"] = gsm1["network_gprs"].fillna("No")
flag_v["network_gprs"] = [0 if i == "No" else 1 for i in gsm1["network_gprs"] ]

gsm1["network_edge"] = gsm1["network_edge"].fillna("No")
flag_v["network_edge"] = [0 if i == "No" else 1 for i in gsm1["network_edge"] ]

gsm1["launch_status"] = gsm1["launch_status"].fillna("Discontinued")
flag_v["launch_status"] = [0 if i == "Discontinued" or i == "Cancelled" else 1 for i in gsm1["launch_status"] ]

gsm1["body_sim"] = gsm1["body_sim"].fillna("Micro-SIM")
flag_v["body_sim"] = [1 if "Dual" in i or "dual" in i else 0 for i in gsm1["body_sim"] ]

gsm1["sound_loudspeaker"] = gsm1["sound_loudspeaker"].fillna("No")
flag_v["sound_loudspeaker"] = [0 if i == "No" else 1 for i in gsm1["sound_loudspeaker"] ]

gsm1["comms_wlan"] = gsm1["comms_wlan"].fillna("No")
gsm1["comms_bluetooth"] = gsm1["comms_bluetooth"].fillna("No")
gsm1["comms_radio"] = gsm1["comms_radio"].fillna("No")
gsm1["comms_gps"] = gsm1["comms_gps"].fillna("No")
flag_v["comms_wlan"] = [0 if i == "No" else 1 for i in gsm1["comms_wlan"] ]
flag_v["comms_bluetooth"] = [0 if i == "No" else 1 for i in gsm1["comms_bluetooth"] ]
flag_v["comms_radio"] = [0 if i == "No" else 1 for i in gsm1["comms_radio"] ]
flag_v["comms_gps"] = [0 if i == "No" else 1 for i in gsm1["comms_gps"] ]

gsm1["network_2g_bands"] = gsm1["network_2g_bands"].fillna("No")
gsm1["network_3g_bands"] = gsm1["network_3g_bands"].fillna("No")
gsm1["network_4g_bands"] = gsm1["network_4g_bands"].fillna("No")
gsm1["network_5g_bands"] = gsm1["network_5g_bands"].fillna("No")
flag_v["network_2g_bands"] = [0 if i == "No" else 1 for i in gsm1["network_2g_bands"] ]
flag_v["network_3g_bands"] = [0 if i == "No" else 1 for i in gsm1["network_3g_bands"] ]
flag_v["network_4g_bands"] = [0 if i == "No" else 1 for i in gsm1["network_4g_bands"] ]
flag_v["network_5g_bands"] = [0 if i == "No" else 1 for i in gsm1["network_5g_bands"] ]

gsm1["main_camera_five"] = gsm1["main_camera_five"].fillna("No")
gsm1["main_camera_quad"] = gsm1["main_camera_quad"].fillna("No")
gsm1["main_camera_triple"] = gsm1["main_camera_triple"].fillna("No")
flag_v["main_camera_five"] = [0 if i == "No" else 1 for i in gsm1["main_camera_five"] ]
flag_v["main_camera_quad"] = [0 if i == "No" else 1 for i in gsm1["main_camera_quad"] ]
flag_v["main_camera_triple"] = [0 if i == "No" else 1 for i in gsm1["main_camera_triple"] ]
flag_v["two_plus_cameras"] = flag_v["main_camera_five"] + flag_v["main_camera_quad"] + flag_v["main_camera_triple"]
flag_v = flag_v.drop(["main_camera_five", "main_camera_quad", "main_camera_triple"], axis=1)

In [11]:
flag_v

Unnamed: 0,network_gprs,network_edge,launch_status,body_sim,sound_loudspeaker,comms_wlan,comms_bluetooth,comms_radio,comms_gps,network_2g_bands,network_3g_bands,network_4g_bands,network_5g_bands,two_plus_cameras
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,1,1,0,1,1,1,0,0,0
2,0,0,0,0,1,1,1,0,1,1,1,0,0,0
3,0,0,0,0,1,1,1,0,1,1,1,0,0,0
4,0,0,1,0,1,1,1,0,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10674,1,1,0,0,1,0,1,1,0,1,1,0,0,0
10675,1,1,0,0,1,0,1,1,0,1,1,0,0,0
10676,1,1,0,0,1,0,1,1,0,1,1,0,0,0
10677,1,1,0,0,1,0,1,1,0,1,1,0,0,0


### One hot variables

In [150]:
ohe = pd.DataFrame()

In [151]:
gsm1["network_technology"] = gsm1["network_technology"].fillna("No cellular connectivity")
ohe["nt_split"] = [i.split(" / ") for i in gsm1["network_technology"]]

In [152]:
nt_set = set()
for i in ohe["nt_split"]:
    nt_set = nt_set.union(set(i))

In [153]:
for i in list(nt_set):
    t = []
    for j in ohe["nt_split"]:
        if i in j:
            t.append(1)
        else:
            t.append(0)
    ohe[i] = t

In [154]:
ohe = ohe.drop("nt_split", axis=1)

In [155]:
gsm1["platform_os"] = gsm1["platform_os"].fillna("No")
os = []
for i in gsm1["platform_os"]:
    if "Android" in i:
        os.append("Android")
    elif "Microsoft" in i or "Windows" in i:
        os.append("Windows")
    elif "iOS " in i:
        os.append("iOS")
    else:
        os.append("Others_os")
ohe["os"] = os

In [156]:
gsm1["main_camera_video"] = gsm1["main_camera_video"].fillna("No")
video = []
for i in gsm1["main_camera_video"]:
    if "No" in i:
        video.append("None")
    elif "8K" in i:
        video.append("8K")
    elif "4K" in i:
        video.append("4K")
    elif "2160p" in i:
        video.append("2160p")
    elif "1080p" in i:
        video.append("1080p")
    elif "720p" in i:
        video.append("720p")
    elif "480p" in i:
        video.append("480p")
    elif "320p" in i:
        video.append("320p")
    else:
        video.append("Others_vid")
ohe["video"] = video

In [157]:
ohe["video"].value_counts()

Others_vid    3529
1080p         2522
None          1991
720p          1021
480p           484
2160p          471
4K             371
320p           270
8K              20
Name: video, dtype: int64

### Numeric variables

In [158]:
num = pd.DataFrame()

In [159]:
la = []
s = 0
c = 0
for i in gsm["launch_announced"]:
    try:
        n = 2020 - int(i[:4])
        la.append(n)
        s += n
        c += 1
    except:
        la.append("NA")

In [160]:
m = s/c
la = list(pd.Series(la).replace("NA", np.round(m,0)))

In [161]:
num["launch"] = la

In [162]:
d1 = []
d2 = []
d3 = []
s1 = 0
s2 = 0
s3 = 0
c1 = 0
c2 = 0
c3 = 0
for i in gsm1["body_dimensions"]:
    t = i.split()
    if len(t) >= 5 and t[1] == "x" and t[3] == "x":
        try:
            d1.append(float(t[0]))
            s1 += float(t[0])
            c1 += 1
        except:
            d1.append("NA")
        try:
            d2.append(float(t[2]))
            s2 += float(t[2])
            c2 += 1
        except:
            d2.append("NA")
        try:
            d3.append(float(t[4]))
            s3 += float(t[4])
            c3 += 1
        except:
            d3.append("NA")
    else:
        d1.append("NA")
        d2.append("NA")
        d3.append("NA")

In [163]:
m1 = s1/c1
d1 = list(pd.Series(d1).replace("NA", np.round(m1,0)))
m2 = s2/c2
d2 = list(pd.Series(d2).replace("NA", np.round(m2,0)))
m3 = s3/c3
d3 = list(pd.Series(d3).replace("NA", np.round(m3,0)))

In [164]:
num["d1"] = d1
num["d2"] = d2
num["d3"] = d3

In [165]:
w1 = []
s1 = 0
s2 = 0
for i in gsm1["body_weight"]:
    t = str(i).split()
    if len(t) > 1 and t[1] == "g":
        try:
            w1.append(float(t[0]))
            s1 += float(t[0])
            c1 += 1
        except:
            w1.append("NA")
    else:
        w1.append("NA")

In [166]:
m1 = s1/c1
w1 = list(pd.Series(w1).replace("NA", np.round(m1,0)))

In [167]:
num["w1"] = w1

In [168]:
num

Unnamed: 0,launch,d1,d2,d3,w1
0,21.0,145.0,56.0,23.0,190.0
1,10.0,129.0,66.0,13.0,71.0
2,10.0,116.0,56.8,12.4,118.0
3,8.0,103.0,54.0,13.4,71.0
4,1.0,160.4,75.1,8.2,193.0
...,...,...,...,...,...
10674,11.0,103.0,48.0,13.5,90.0
10675,11.0,105.0,46.0,12.6,90.0
10676,11.0,105.0,46.0,12.6,90.0
10677,11.0,105.0,46.0,12.6,90.0


In [169]:
gsm1["display_resolution"].value_counts()

720 x 1280 pixels, 16:9 ratio (~294 ppi density)            547
480 x 800 pixels, 5:3 ratio (~233 ppi density)              414
240 x 320 pixels, 4:3 ratio (~167 ppi density)              342
128 x 160 pixels (~114 ppi density)                         275
720 x 1280 pixels, 16:9 ratio (~267 ppi density)            241
                                                           ... 
176 x 220 pixels, 7 lines (~128 ppi density)                  1
1536 x 2560 pixels, 5:3 ratio (~546 ppi density)              1
480 x 854 pixels, 16:9 ratio (~251 ppi density)               1
1620 x 2160 pixels, 4:3 ratio (~264 ppi density)              1
240 x 320 pixels, 2.0 inch, 4:3 ratio (~200 ppi density)      1
Name: display_resolution, Length: 1136, dtype: int64

In [170]:
r1 = []
r2 = []
s1 = 0
s2 = 0
c1 = 0
c2 = 0
for i in gsm1["display_resolution"]:
    t = str(i).split()
    if len(t) >= 3 and t[1] == "x":
        try:
            r1.append(float(t[0]))
            s1 += float(t[0])
            c1 += 1
        except:
            r1.append("NA")
        try:
            r2.append(float(t[2]))
            s2 += float(t[2])
            c2 += 1
        except:
            r2.append("NA")
    else:
        r1.append("NA")
        r2.append("NA")

In [171]:
m1 = s1/c1
r1 = list(pd.Series(r1).replace("NA", np.round(m1,0)))
m2 = s2/c2
r2 = list(pd.Series(r2).replace("NA", np.round(m2,0)))

In [172]:
num["r1"] = r1
num["r2"] = r2

### Target Var

In [173]:
p_uni = gsm1["misc_price"].value_counts()[gsm1["misc_price"].value_counts() >= 3].index.tolist()

In [174]:
p_uni_2 = list(set(pd.Series(p_uni).replace("$<e2><80><89>199.99", "About 200 USD")))

### Main df

In [175]:
flag_v.shape

(10679, 14)

In [176]:
flag_v.columns

Index(['network_gprs', 'network_edge', 'launch_status', 'body_sim',
       'sound_loudspeaker', 'comms_wlan', 'comms_bluetooth', 'comms_radio',
       'comms_gps', 'network_2g_bands', 'network_3g_bands', 'network_4g_bands',
       'network_5g_bands', 'two_plus_cameras'],
      dtype='object')

In [177]:
flag_v.head()

Unnamed: 0,network_gprs,network_edge,launch_status,body_sim,sound_loudspeaker,comms_wlan,comms_bluetooth,comms_radio,comms_gps,network_2g_bands,network_3g_bands,network_4g_bands,network_5g_bands,two_plus_cameras
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,1,1,0,1,1,1,0,0,0
2,0,0,0,0,1,1,1,0,1,1,1,0,0,0
3,0,0,0,0,1,1,1,0,1,1,1,0,0,0
4,0,0,1,0,1,1,1,0,1,1,1,1,0,0


In [178]:
ohe.shape

(10679, 11)

In [179]:
ohe.columns

Index(['GSM', 'HSPA', 'CDMA', '5G', 'LTE', 'UMTS', 'CDMA2000',
       'No cellular connectivity', 'EVDO', 'os', 'video'],
      dtype='object')

In [180]:
ohe.head()

Unnamed: 0,GSM,HSPA,CDMA,5G,LTE,UMTS,CDMA2000,No cellular connectivity,EVDO,os,video
0,1,0,0,0,0,0,0,0,0,Others_os,
1,1,1,0,0,0,0,0,0,0,Windows,480p
2,1,1,0,0,0,0,0,0,0,Android,Others_vid
3,1,1,0,0,0,0,0,0,0,Android,Others_vid
4,1,1,1,0,1,0,0,0,1,Android,4K


In [181]:
o = OneHotEncoder()
o.fit(ohe[["os", "video"]])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [182]:
ohe_c = o.categories_[0].tolist() + o.categories_[1].tolist()
ohe_2 = pd.DataFrame(o.transform(ohe[["os", "video"]]).toarray(), columns = ohe_c)

In [183]:
ohe = ohe.drop(["os", "video"], axis = 1)
ohe_2 = ohe_2.drop(["None"], axis = 1)

In [184]:
num.shape

(10679, 7)

In [185]:
num.columns

Index(['launch', 'd1', 'd2', 'd3', 'w1', 'r1', 'r2'], dtype='object')

In [186]:
num.head()

Unnamed: 0,launch,d1,d2,d3,w1,r1,r2
0,21.0,145.0,56.0,23.0,190.0,515.0,859.0
1,10.0,129.0,66.0,13.0,71.0,480.0,800.0
2,10.0,116.0,56.8,12.4,118.0,320.0,480.0
3,8.0,103.0,54.0,13.4,71.0,240.0,320.0
4,1.0,160.4,75.1,8.2,193.0,1440.0,3040.0


In [187]:
df = pd.concat([flag_v, ohe, ohe_2, num, gsm1["misc_price"]], axis = 1)

In [188]:
df

Unnamed: 0,network_gprs,network_edge,launch_status,body_sim,sound_loudspeaker,comms_wlan,comms_bluetooth,comms_radio,comms_gps,network_2g_bands,...,8K,Others_vid,launch,d1,d2,d3,w1,r1,r2,misc_price
0,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,21.0,145.0,56.0,23.0,190.0,515.0,859.0,
1,0,0,0,0,1,1,1,0,1,1,...,0.0,0.0,10.0,129.0,66.0,13.0,71.0,480.0,800.0,About 310 EUR
2,0,0,0,0,1,1,1,0,1,1,...,0.0,1.0,10.0,116.0,56.8,12.4,118.0,320.0,480.0,About 110 EUR
3,0,0,0,0,1,1,1,0,1,1,...,0.0,1.0,8.0,103.0,54.0,13.4,71.0,240.0,320.0,
4,0,0,1,0,1,1,1,0,1,1,...,0.0,0.0,1.0,160.4,75.1,8.2,193.0,1440.0,3040.0,<c2><a3><e2><80><89>679.00 / <e2><82><ac><e2><...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10674,1,1,0,0,1,0,1,1,0,1,...,0.0,1.0,11.0,103.0,48.0,13.5,90.0,240.0,320.0,
10675,1,1,0,0,1,0,1,1,0,1,...,0.0,1.0,11.0,105.0,46.0,12.6,90.0,176.0,220.0,
10676,1,1,0,0,1,0,1,1,0,1,...,0.0,1.0,11.0,105.0,46.0,12.6,90.0,176.0,220.0,
10677,1,1,0,0,1,0,1,1,0,1,...,0.0,1.0,11.0,105.0,46.0,12.6,90.0,176.0,220.0,


In [189]:
df_2 = df[df["misc_price"].isin(p_uni_2)]

In [190]:
df_2["misc_price"].unique()

array(['About 310 EUR', 'About 110 EUR', 'About 250 EUR', 'About 360 EUR',
       'About 280 EUR', 'About 140 EUR', 'About 160 EUR', 'About 90 EUR',
       'About 180 EUR', 'About 200 EUR', 'About 120 EUR', 'About 500 EUR',
       'About 220 EUR', 'About 130 EUR', 'About 240 EUR', 'About 150 EUR',
       'About 420 EUR', 'About 260 EUR', 'About 100 EUR', 'About 320 EUR',
       'About 550 EUR', 'About 290 EUR', 'About 450 EUR', 'About 650 EUR',
       'About 190 EUR', 'About 300 EUR', 'About 350 EUR', 'About 390 EUR',
       'About 50 EUR', 'About 7000 INR', 'About 80 EUR', 'About 60 EUR',
       'About 400 EUR', 'About 270 EUR', 'About 700 EUR', 'About 340 EUR',
       'About 1100 EUR', 'About 530 EUR', 'About 40 EUR', 'About 70 EUR',
       'About 1200 EUR', 'About 230 EUR', 'About 30 EUR', 'About 20 EUR',
       'About 10 EUR', 'About 800 EUR', 'About 470 EUR', 'About 370 EUR',
       'About 520 EUR', 'About 440 EUR', 'About 330 EUR', 'About 880 EUR',
       'About 900 EUR', 'About 

In [191]:
y = [float(i.split()[1])*(0.014) if i.split()[2] == "INR" else float(i.split()[1])*(1.12) if i.split()[2] == "EUR" else float(i.split()[1]) for i in df_2["misc_price"]]

In [192]:
X = df_2.drop("misc_price", axis=1)

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [194]:
X_train.shape

(4556, 42)

In [195]:
len(y_train)

4556

In [196]:
X_test.shape

(1519, 42)

In [197]:
len(y_test)

1519

In [198]:
xgb = XGBRegressor(n_estimators = 50, eta = 0.1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [199]:
y_pred

array([142.88258, 147.32964, 129.54192, ..., 363.94012, 117.63385,
       342.99844], dtype=float32)

In [210]:
pd.DataFrame({"feature_name" : X_train.columns.tolist(), "importance" : xgb.feature_importances_.tolist()}).sort_values(by = "importance", ascending=False)

Unnamed: 0,feature_name,importance
40,r1,0.290668
12,network_5g_bands,0.067943
10,network_3g_bands,0.052109
22,EVDO,0.051809
28,2160p,0.051011
41,r2,0.045966
26,iOS,0.034975
27,1080p,0.033947
39,w1,0.029191
8,comms_gps,0.02721


In [212]:
mean_squared_error(y_test, y_pred)

6706.288120254423

In [217]:
np.min(y_test), np.max(y_test), len(y_test)

(11.200000000000001, 1344.0000000000002, 1519)

In [218]:
np.mean(y_test)

198.3938117182357

In [219]:
y_pred_2 = [200]*(len(y_test))

In [220]:
mean_squared_error(y_test, y_pred_2)

22012.30549045425