In [52]:
import pandas as pd 
import numpy as np

# read in all our data
laptop_train = pd.read_csv("datasets/laptops_train.csv")
laptop_test = pd.read_csv("datasets/laptops_test.csv")

# set seed for reproducibility
np.random.seed(0)

In [6]:
print(laptop_train.head())
print(laptop_test.head())

  Manufacturer   Model Name   Category Screen Size  \
0        Apple  MacBook Pro  Ultrabook       13.3"   
1        Apple  Macbook Air  Ultrabook       13.3"   
2           HP       250 G6   Notebook       15.6"   
3        Apple  MacBook Pro  Ultrabook       15.4"   
4        Apple  MacBook Pro  Ultrabook       13.3"   

                               Screen                         CPU   RAM  \
0  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB   
1                            1440x900        Intel Core i5 1.8GHz   8GB   
2                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB   
3  IPS Panel Retina Display 2880x1800        Intel Core i7 2.7GHz  16GB   
4  IPS Panel Retina Display 2560x1600        Intel Core i5 3.1GHz   8GB   

               Storage                           GPU Operating System  \
0            128GB SSD  Intel Iris Plus Graphics 640            macOS   
1  128GB Flash Storage        Intel HD Graphics 6000            macOS   
2    

In [53]:
frames = [laptop_train,laptop_test]
df = pd.concat(frames)

In [3]:
df.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [54]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()
missing_values_count

Manufacturer                  0
Model Name                    0
Category                      0
Screen Size                   0
Screen                        0
CPU                           0
RAM                           0
 Storage                      0
GPU                           0
Operating System              0
Operating System Version    170
Weight                        0
Price                         0
dtype: int64

In [15]:
df['Operating System Version'].value_counts()

10      1071
7         45
X          8
10 S       8
Name: Operating System Version, dtype: int64

In [18]:
df[df['Operating System Version'].isna()].groupby('Operating System').groups

{'Android': [50, 136], 'Chrome OS': [290, 317, 430, 437, 472, 504, 584, 619, 677, 690, 697, 745, 762, 817, 828, 838, 846, 888, 907, 949, 953, 959, 71, 110, 124, 213, 237], 'Linux': [36, 40, 42, 59, 69, 96, 97, 102, 180, 210, 220, 235, 236, 272, 276, 281, 296, 338, 340, 379, 394, 403, 415, 423, 427, 439, 446, 524, 543, 555, 579, 604, 612, 613, 614, 616, 651, 669, 711, 712, 728, 769, 832, 852, 876, 886, 896, 965, 112, 145, 204, 210, 220, 249, 264, 272, 285, 289, 299, 303, 313, 317], 'No OS': [2, 10, 11, 18, 22, 46, 62, 76, 122, 125, 136, 138, 142, 172, 184, 187, 192, 193, 205, 212, 213, 216, 242, 261, 267, 279, 289, 303, 305, 356, 364, 367, 381, 463, 509, 514, 527, 547, 594, 627, 643, 654, 704, 725, 752, 783, 850, 859, 863, 871, 932, 947, 14, 78, 83, 113, 150, 172, 174, 186, 195, 196, 218, 223, 239, 270], 'macOS': [0, 1, 3, 4, 7, 12, 14, 15, 17, 45, 81, 249, 270]}

In [21]:
grouped = df.groupby('Operating System')

for name,group in grouped:
    print(name)
    print(group['Operating System Version'].unique())

Android
[nan]
Chrome OS
[nan]
Linux
[nan]
Mac OS
['X']
No OS
[nan]
Windows
['10' '10 S' '7']
macOS
[nan]


In [63]:
#Drop the Operating System Version because the laptop can just be updated by the user
df = df.drop('Operating System Version', axis=1)

In [6]:
df.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System', 'Weight', 'Price'],
      dtype='object')

In [10]:
df[['Screen Size','Screen']]

Unnamed: 0,Screen Size,Screen
0,"13.3""",IPS Panel Retina Display 2560x1600
1,"13.3""",1440x900
2,"15.6""",Full HD 1920x1080
3,"15.4""",IPS Panel Retina Display 2880x1800
4,"13.3""",IPS Panel Retina Display 2560x1600
...,...,...
320,"14.0""",IPS Panel Full HD / Touchscreen 1920x1080
321,"13.3""",IPS Panel Quad HD+ / Touchscreen 3200x1800
322,"14.0""",1366x768
323,"15.6""",1366x768


Data Preprocessing

In [12]:
#Treat Manufacturer as an nominal variable
df['Manufacturer'].unique()

array(['Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI',
       'Microsoft', 'Toshiba', 'Huawei', 'Xiaomi', 'Vero', 'Razer',
       'Mediacom', 'Samsung', 'Google', 'Fujitsu', 'LG'], dtype=object)

In [15]:
# gives a tuple of column name and series
# for each column in the dataframe
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

Column Name :  Manufacturer
Column Contents :  ['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']
Column Name :  Model Name
Column Contents :  ['MacBook Pro' 'Macbook Air' '250 G6' 'Aspire 3' 'ZenBook UX430UN'
 'Swift 3' 'Inspiron 3567' 'MacBook 12"' 'IdeaPad 320-15IKB' 'XPS 13'
 'Vivobook E200HA' 'Legion Y520-15IKBN' '255 G6' 'Inspiron 5379'
 '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)' 'MacBook Air' 'Inspiron 5570'
 'Latitude 5590' 'ProBook 470' 'LapBook 15.6"'
 'E402WA-GA010T (E2-6110/2GB/32GB/W10)'
 '17-ak001nv (A6-9220/4GB/500GB/Radeon' 'IdeaPad 120S-14IAP'
 'Inspiron 5770' 'ProBook 450' 'X540UA-DM186 (i3-6006U/4GB/1TB/FHD/Linux)'
 'Inspiron 7577' 'X542UQ-GO005 (i5-7200U/8GB/1TB/GeForce'
 'Aspire A515-51G' 'Inspiron 7773' 'IdeaPad 320-15ISK' 'Rog Strix'
 'X751NV-TY001T (N4200/4GB/1TB/GeForce' 'Yoga Book' 'ProBook 430'
 'Inspiron 3576' '15-bs002nv (i3-6006U/4GB/128GB/FHD/W10)'

In [13]:
df.dtypes

Manufacturer         object
Model Name           object
Category             object
Screen Size          object
Screen               object
CPU                  object
RAM                  object
 Storage             object
GPU                  object
Operating System     object
Weight               object
Price               float64
dtype: object

In [19]:
model_name = df['Model Name'].unique()
model_name.sort()
model_name

array(['110-15ACL (A6-7310/4GB/500GB/W10)',
       '14-am079na (N3710/8GB/2TB/W10)',
       '15-AC110nv (i7-6500U/6GB/1TB/Radeon',
       '15-AY023na (N3710/8GB/2TB/W10)',
       '15-BA015wm (E2-7110/4GB/500GB/W10)',
       '15-BS026nv (i5-7200U/8GB/256GB/Radeon',
       '15-BS028nv (i3-6006U/4GB/1TB/Radeon',
       '15-BS078nr (i7-7500U/8GB/1TB/W10)',
       '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)',
       '15-BS103nv (i5-8250U/6GB/256GB/Radeon',
       '15-BW004nv (A9-9420/4GB/256GB/Radeon',
       '15-BW037na (A9-9420/4GB/1TB/Radeon',
       '15-BW091ND (A9-9420/6GB/1TB',
       '15-BW094nd (A6-9220/8GB/128GB/W10)',
       '15-ay047nv (i3-6006U/6GB/1TB/Radeon',
       '15-ba043na (A12-9700P/8GB/2TB/W10)',
       '15-bs002nv (i3-6006U/4GB/128GB/FHD/W10)',
       '15-bs005nv (i3-6006U/4GB/1TB',
       '15-bs011nv (i7-7500U/4GB/500GB/Radeon',
       '15-bs012nv (i7-7500U/8GB/1TB/Radeon',
       '15-bs015dx (i5-7200U/8GB/1TB/W10)',
       '15-bs017nv (i7-7500U/8GB/256GB/Radeon',
      

In [20]:
df[['Model Name','CPU']]

Unnamed: 0,Model Name,CPU
0,MacBook Pro,Intel Core i5 2.3GHz
1,Macbook Air,Intel Core i5 1.8GHz
2,250 G6,Intel Core i5 7200U 2.5GHz
3,MacBook Pro,Intel Core i7 2.7GHz
4,MacBook Pro,Intel Core i5 3.1GHz
...,...,...
320,Yoga 500-14ISK,Intel Core i7 6500U 2.5GHz
321,Yoga 900-13ISK,Intel Core i7 6500U 2.5GHz
322,IdeaPad 100S-14IBR,Intel Celeron Dual Core N3050 1.6GHz
323,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Intel Core i7 6500U 2.5GHz


In [30]:
df['Model Name'].str.contains('/')

0      False
1      False
2      False
3      False
4      False
       ...  
320    False
321    False
322    False
323     True
324     True
Name: Model Name, Length: 1302, dtype: bool

In [31]:
print('Yoga' in df['Model Name'].unique())

False


In [32]:
df[df['Model Name'].str.contains('/')]

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Weight,Price
24,HP,15-BS101nv (i7-8550U/8GB/256GB/FHD/W10),Ultrabook,"15.6""",Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows,1.91kg,5859828.00
31,Asus,E402WA-GA010T (E2-6110/2GB/32GB/W10),Notebook,"14.0""",1366x768,AMD E-Series E2-6110 1.5GHz,2GB,32GB Flash Storage,AMD Radeon R2,Windows,1.65kg,1769508.00
32,HP,17-ak001nv (A6-9220/4GB/500GB/Radeon,Notebook,"17.3""",Full HD 1920x1080,AMD A6-Series 9220 2.5GHz,4GB,500GB HDD,AMD Radeon 530,Windows,2.71kg,3903588.00
40,Asus,X540UA-DM186 (i3-6006U/4GB/1TB/FHD/Linux),Notebook,"15.6""",Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,1TB HDD,Intel HD Graphics 620,Linux,2kg,3458988.00
42,Asus,X542UQ-GO005 (i5-7200U/8GB/1TB/GeForce,Notebook,"15.6""",1366x768,Intel Core i5 7200U 2.5GHz,8GB,1TB HDD,Nvidia GeForce 940MX,Linux,2.3kg,4650427.08
...,...,...,...,...,...,...,...,...,...,...,...,...
309,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,"15.6""",1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows,2.19kg,6793488.00
310,Asus,X553SA-XX031T (N3050/4GB/500GB/W10),Notebook,"15.6""",1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,500GB HDD,Intel HD Graphics,Windows,2.2kg,3281148.00
319,Asus,X556UJ-XO044T (i7-6500U/4GB/500GB/GeForce,Notebook,"15.6""",1366x768,Intel Core i7 6500U 2.5GHz,4GB,500GB HDD,Nvidia GeForce 920M,Windows,2.2kg,6405085.44
323,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,"15.6""",1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows,2.19kg,6793488.00


In [33]:
category = df['Category'].unique()
category.sort()
category

array(['2 in 1 Convertible', 'Gaming', 'Netbook', 'Notebook', 'Ultrabook',
       'Workstation'], dtype=object)

In [36]:
# gives a tuple of column name and series
# for each column in the dataframe
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.nunique())

Column Name :  Manufacturer
Column Contents :  19
Column Name :  Model Name
Column Contents :  617
Column Name :  Category
Column Contents :  6
Column Name :  Screen Size
Column Contents :  18
Column Name :  Screen
Column Contents :  40
Column Name :  CPU
Column Contents :  118
Column Name :  RAM
Column Contents :  9
Column Name :   Storage
Column Contents :  38
Column Name :  GPU
Column Contents :  110
Column Name :  Operating System
Column Contents :  7
Column Name :  Weight
Column Contents :  179
Column Name :  Price
Column Contents :  791


In [37]:
screen_size = df['Screen Size'].unique()
screen_size.sort()
screen_size

array(['10.1"', '11.3"', '11.6"', '12.0"', '12.3"', '12.5"', '13.0"',
       '13.3"', '13.5"', '13.9"', '14.0"', '14.1"', '15.0"', '15.4"',
       '15.6"', '17.0"', '17.3"', '18.4"'], dtype=object)

In [39]:
screen = df['Screen'].unique()
screen.sort()
print(df['Screen'].value_counts())
screen

Full HD 1920x1080                                507
1366x768                                         281
IPS Panel Full HD 1920x1080                      230
IPS Panel Full HD / Touchscreen 1920x1080         53
Full HD / Touchscreen 1920x1080                   47
1600x900                                          23
Touchscreen 1366x768                              16
Quad HD+ / Touchscreen 3200x1800                  15
IPS Panel 4K Ultra HD 3840x2160                   12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
4K Ultra HD / Touchscreen 3840x2160                9
4K Ultra HD 3840x2160                              7
Touchscreen 2560x1440                              7
IPS Panel 1366x768                                 7
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
IPS Panel Retina Display 2560x1600                 6
IPS Panel Retina Display 2304x1440                 6
Touchscreen 2256x1504                              6
IPS Panel Touchscreen 2560x1440               

array(['1366x768', '1440x900', '1600x900', '1920x1080', '2560x1440',
       '4K Ultra HD / Touchscreen 3840x2160', '4K Ultra HD 3840x2160',
       'Full HD / Touchscreen 1920x1080', 'Full HD 1920x1080',
       'IPS Panel 1366x768', 'IPS Panel 2560x1440',
       'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel 4K Ultra HD 3840x2160',
       'IPS Panel Full HD / Touchscreen 1920x1080',
       'IPS Panel Full HD 1366x768', 'IPS Panel Full HD 1920x1080',
       'IPS Panel Full HD 1920x1200', 'IPS Panel Full HD 2160x1440',
       'IPS Panel Full HD 2560x1440',
       'IPS Panel Quad HD+ / Touchscreen 3200x1800',
       'IPS Panel Quad HD+ 2560x1440', 'IPS Panel Quad HD+ 3200x1800',
       'IPS Panel Retina Display 2304x1440',
       'IPS Panel Retina Display 2560x1600',
       'IPS Panel Retina Display 2736x1824',
       'IPS Panel Retina Display 2880x1800',
       'IPS Panel Touchscreen / 4K Ultra HD 3840x2160',
       'IPS Panel Touchscreen 1366x768',
       'IPS Panel T

In [40]:
CPU = df['CPU'].unique()
CPU.sort()
print(df['CPU'].value_counts())
CPU

Intel Core i5 7200U 2.5GHz       190
Intel Core i7 7700HQ 2.8GHz      146
Intel Core i7 7500U 2.7GHz       133
Intel Core i7 8550U 1.8GHz        73
Intel Core i5 8250U 1.6GHz        72
                                ... 
Intel Core M M3-6Y30 0.9GHz        1
AMD A9-Series 9420 2.9GHz          1
Intel Core i3 6006U 2.2GHz         1
AMD A6-Series 7310 2GHz            1
Intel Xeon E3-1535M v6 3.1GHz      1
Name: CPU, Length: 118, dtype: int64


array(['AMD A10-Series 9600P 2.4GHz', 'AMD A10-Series 9620P 2.5GHz',
       'AMD A10-Series A10-9620P 2.5GHz', 'AMD A12-Series 9700P 2.5GHz',
       'AMD A12-Series 9720P 2.7GHz', 'AMD A12-Series 9720P 3.6GHz',
       'AMD A4-Series 7210 2.2GHz', 'AMD A6-Series 7310 2GHz',
       'AMD A6-Series 9220 2.5GHz', 'AMD A6-Series 9220 2.9GHz',
       'AMD A6-Series A6-9220 2.5GHz', 'AMD A8-Series 7410 2.2GHz',
       'AMD A9-Series 9410 2.9GHz', 'AMD A9-Series 9420 2.9GHz',
       'AMD A9-Series 9420 3GHz', 'AMD A9-Series A9-9420 3GHz',
       'AMD E-Series 6110 1.5GHz', 'AMD E-Series 7110 1.8GHz',
       'AMD E-Series 9000 2.2GHz', 'AMD E-Series 9000e 1.5GHz',
       'AMD E-Series E2-6110 1.5GHz', 'AMD E-Series E2-9000 2.2GHz',
       'AMD E-Series E2-9000e 1.5GHz', 'AMD FX 8800P 2.1GHz',
       'AMD FX 9830P 3GHz', 'AMD Ryzen 1600 3.2GHz',
       'AMD Ryzen 1700 3GHz', 'Intel Atom X5-Z8350 1.44GHz',
       'Intel Atom Z8350 1.92GHz', 'Intel Atom x5-Z8300 1.44GHz',
       'Intel Atom x5-Z835

In [41]:
RAM = df['RAM'].unique()
RAM.sort()
print(df['RAM'].value_counts())
RAM

8GB     619
4GB     375
16GB    199
6GB      41
12GB     25
2GB      22
32GB     17
24GB      3
64GB      1
Name: RAM, dtype: int64


array(['12GB', '16GB', '24GB', '2GB', '32GB', '4GB', '64GB', '6GB', '8GB'],
      dtype=object)

In [45]:
storage = df[' Storage'].unique()
storage.sort()
print(df[' Storage'].value_counts())
storage

256GB SSD                        412
1TB HDD                          224
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1TB Hybrid                         9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
1GB SSD                            5
128GB Flash Storage                4
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
512GB SSD +  2TB HDD               2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD             1
5

array(['128GB Flash Storage', '128GB HDD', '128GB SSD',
       '128GB SSD +  1TB HDD', '128GB SSD +  2TB HDD',
       '16GB Flash Storage', '16GB SSD', '1GB SSD', '1TB HDD',
       '1TB HDD +  1TB HDD', '1TB Hybrid', '1TB SSD',
       '1TB SSD +  1TB HDD', '240GB SSD', '256GB Flash Storage',
       '256GB SSD', '256GB SSD +  1TB HDD', '256GB SSD +  1TB Hybrid',
       '256GB SSD +  256GB SSD', '256GB SSD +  2TB HDD',
       '256GB SSD +  500GB HDD', '2TB HDD', '32GB Flash Storage',
       '32GB HDD', '32GB SSD', '500GB HDD', '508GB Hybrid',
       '512GB Flash Storage', '512GB SSD', '512GB SSD +  1TB HDD',
       '512GB SSD +  1TB Hybrid', '512GB SSD +  256GB SSD',
       '512GB SSD +  2TB HDD', '512GB SSD +  512GB SSD',
       '64GB Flash Storage', '64GB Flash Storage +  1TB HDD', '64GB SSD',
       '8GB SSD'], dtype=object)

In [46]:
GPU = df['GPU'].unique()
GPU.sort()
print(df['GPU'].value_counts())
GPU

Intel HD Graphics 620      281
Intel HD Graphics 520      185
Intel UHD Graphics 620      68
Nvidia GeForce GTX 1050     66
Nvidia GeForce GTX 1060     48
                          ... 
AMD Radeon R5 520            1
AMD Radeon R7                1
Intel HD Graphics 540        1
AMD Radeon 540               1
ARM Mali T860 MP4            1
Name: GPU, Length: 110, dtype: int64


array(['AMD FirePro W4190M', 'AMD FirePro W4190M ', 'AMD FirePro W5130M',
       'AMD FirePro W6150M', 'AMD R17M-M1-70', 'AMD R4 Graphics',
       'AMD Radeon 520', 'AMD Radeon 530', 'AMD Radeon 540',
       'AMD Radeon Pro 455', 'AMD Radeon Pro 555', 'AMD Radeon Pro 560',
       'AMD Radeon R2', 'AMD Radeon R2 Graphics', 'AMD Radeon R3',
       'AMD Radeon R4', 'AMD Radeon R4 Graphics', 'AMD Radeon R5',
       'AMD Radeon R5 430', 'AMD Radeon R5 520', 'AMD Radeon R5 M315',
       'AMD Radeon R5 M330', 'AMD Radeon R5 M420', 'AMD Radeon R5 M420X',
       'AMD Radeon R5 M430', 'AMD Radeon R7', 'AMD Radeon R7 Graphics',
       'AMD Radeon R7 M360', 'AMD Radeon R7 M365X', 'AMD Radeon R7 M440',
       'AMD Radeon R7 M445', 'AMD Radeon R7 M460', 'AMD Radeon R7 M465',
       'AMD Radeon R9 M385', 'AMD Radeon RX 540', 'AMD Radeon RX 550',
       'AMD Radeon RX 560', 'AMD Radeon RX 580', 'ARM Mali T860 MP4',
       'Intel Graphics 620', 'Intel HD Graphics', 'Intel HD Graphics 400',
       'Inte

In [87]:
OS = df['Operating System'].unique()
OS.sort()
print(df['Operating System'].value_counts())
OS

Windows      1124
No OS          66
Linux          62
Chrome OS      27
Mac OS         21
Android         2
Name: Operating System, dtype: int64


array(['Android', 'Chrome OS', 'Linux', 'Mac OS', 'No OS', 'Windows'],
      dtype=object)

In [80]:
weight = df['Weight'].unique()
weight.sort()
print(df['Weight'].value_counts())
weight

2.2     121
2.1      58
2.4      44
2.3      40
2.5      38
       ... 
3.25      1
4.7       1
1.55      1
1.18      1
4.0       1
Name: Weight, Length: 179, dtype: int64


array(['0.69', '0.81', '0.91', '0.92', '0.920', '0.97', '0.98', '0.99',
       '1.05', '1.08', '1.09', '1.1', '1.10', '1.11', '1.12', '1.13',
       '1.14', '1.15', '1.16', '1.17', '1.18', '1.19', '1.2', '1.21',
       '1.22', '1.23', '1.24', '1.25', '1.252', '1.26', '1.27', '1.28',
       '1.29', '1.3', '1.31', '1.32', '1.34', '1.35', '1.36', '1.37',
       '1.38', '1.39', '1.4', '1.41', '1.42', '1.43', '1.44', '1.45',
       '1.47', '1.48', '1.49', '1.5', '1.54', '1.55', '1.56', '1.58',
       '1.59', '1.6', '1.62', '1.63', '1.64', '1.65', '1.68', '1.7',
       '1.70', '1.71', '1.74', '1.75', '1.76', '1.78', '1.79', '1.8',
       '1.83', '1.84', '1.85', '1.86', '1.87', '1.88', '1.89', '1.9',
       '1.90', '1.91', '1.93', '1.94', '1.95', '1.96', '1.98', '1.99',
       '2', '2.0', '2.02', '2.03', '2.04', '2.05', '2.06', '2.07', '2.08',
       '2.09', '2.1', '2.13', '2.14', '2.15', '2.16', '2.17', '2.18',
       '2.19', '2.191', '2.2', '2.20', '2.21', '2.23', '2.24', '2.25',
       '2.

In [55]:
price = df['Price'].unique()
price.sort()
print(df['Price'].value_counts())
price

13329108.0    14
9772308.0     14
15996708.0    13
7993908.0     11
10661508.0    11
              ..
18584280.0     1
2703168.0      1
6749028.0      1
16521336.0     1
23999508.0     1
Name: Price, Length: 791, dtype: int64


array([ 1547208.  ,  1706374.8 ,  1742832.  ,  1769508.  ,  1804186.8 ,
        1858428.  ,  1874433.6 ,  1991808.  ,  2036268.  ,  2125188.  ,
        2178451.08,  2178540.  ,  2213218.8 ,  2214108.  ,  2243985.12,
        2267460.  ,  2294136.  ,  2303028.  ,  2311920.  ,  2356380.  ,
        2391948.  ,  2406353.04,  2418624.  ,  2436408.  ,  2444410.8 ,
        2445211.08,  2445300.  ,  2471887.08,  2471976.  ,  2480868.  ,
        2543112.  ,  2560006.8 ,  2568898.8 ,  2569788.  ,  2596464.  ,
        2623140.  ,  2640924.  ,  2649816.  ,  2658708.  ,  2703168.  ,
        2707169.4 ,  2720952.  ,  2747628.  ,  2827656.  ,  2836548.  ,
        2863135.08,  2881008.  ,  2889900.  ,  2925468.  ,  2934360.  ,
        2961036.  ,  3014388.  ,  3023280.  ,  3058848.  ,  3067651.08,
        3076543.08,  3085524.  ,  3094416.  ,  3103308.  ,  3156660.  ,
        3192228.  ,  3201031.08,  3217125.6 ,  3232330.92,  3244690.8 ,
        3263364.  ,  3272256.  ,  3281148.  ,  3334500.  ,  3370

In [44]:
df.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System', 'Weight', 'Price'],
      dtype='object')

In [61]:
#1 INR = 0.012203 USD Conversion Rate as of May 10,2023
df['Price_USD'] = df.Price/81.9433
df.Price_USD

0      145375.198216
1       97547.627201
2       62395.583287
3      275348.996196
4      195715.954813
           ...      
320     69231.968935
321    162662.572779
322     24849.719257
323     82904.740229
324     40041.687362
Name: Price, Length: 1302, dtype: float64

In [62]:
df[['Price','Price_USD']]

Unnamed: 0,Price,Price_USD
0,11912523.48,145375.198216
1,7993374.48,97547.627201
2,5112900.00,62395.583287
3,22563005.40,275348.996196
4,16037611.20,195715.954813
...,...,...
320,5673096.00,69231.968935
321,13329108.00,162662.572779
322,2036268.00,24849.719257
323,6793488.00,82904.740229


In [None]:
#Target Encoding for Manfacturer


In [84]:
df['Weight_LBS'] = df.Weight*2.204623
df.Weight_LBS

0      3.020334
1      2.954195
2      4.100599
3      4.034460
4      3.020334
         ...   
320    3.968321
321    2.866010
322    3.306935
323    4.828124
324    4.850171
Name: Weight_LBS, Length: 1302, dtype: float64

In [69]:
df.Weight = df.Weight.str.replace('kg','')

In [83]:
df.Weight = df.Weight.astype(float)

In [81]:
#4s is a typo. Google search the weight of the laptop
df.Weight = df.Weight.replace('4s','4.04')

In [85]:
df[['Weight','Weight_LBS']]

Unnamed: 0,Weight,Weight_LBS
0,1.37,3.020334
1,1.34,2.954195
2,1.86,4.100599
3,1.83,4.034460
4,1.37,3.020334
...,...,...
320,1.80,3.968321
321,1.30,2.866010
322,1.50,3.306935
323,2.19,4.828124


In [86]:
#Operating System Typo
df['Operating System'] = df['Operating System'].replace('macOS','Mac OS')