In [234]:
import pandas as pd
import numpy as np
import ast

In [5]:
df1=pd.read_csv("pre_cleaning.csv")
df=df1.copy()

In [6]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## kind of projects we make with it
- Exploratory Data Analysis (EDA): Perform EDA to uncover insights about the laptop market, such as popular brands, common features, and price ranges.
- Price Prediction: Build a machine learning model to predict the price of a laptop based on its specifications, such as CPU, RAM, storage, graphics card, etc.
- Recommendation System: Create a recommendation system that suggests laptops to users based on their preferences, budget, and usage requirements.
- Classification: Build a classification model to categorize laptops into different classes, such as gaming laptops, business laptops, ultrabooks, etc., based on their attributes.
- Customer Segmentation: Segment customers based on their laptop preferences and behaviors to help manufacturers and retailers target specific market segments effectively.

In [20]:
with pd.ExcelWriter("temp.xlsx") as writer:
    df.to_excel(writer,sheet_name="laptop_dataset")

In [485]:
df.columns

Index(['price', 'Brand', 'num_votes', 'Ratings', 'utility', 'thickness',
       'weight', 'warranty', 'screen_size', 'resolution', 'ppi', 'processor',
       'threads', 'graphic_card', 'ram', 'hard_disk', 'battery1', 'hdmi',
       'wifi', 'usb', 'camera', 'others', 'antiglare1', 'aspect_ratio1',
       'touch_screen1', 'battery', 'cores', 'cores1', 'threads1',
       'battery_capacity', 'battery_cell', 'col1', 'col2', 'col3', 'col4',
       'col5', 'hdmi1', 'ethernet', 'multi_card_reader', 'thunderbolt',
       'display_port', 'vga'],
      dtype='object')

## Data Accessing
**Quality Issues**
1. price-There is no problem in price column.
2. Brand-There is no problem in brand column.
3. num_votes
    - there is "," between values
    - type of column is object
4. num_reviews
    - 766 null values i think we should delete this column
    - type is float
5. Ratings- There is no problem
6. os
    - In os column approx thousand columns have windows then i don't think that it is an important column. so i think i will delete it.
7. utility
    - contains values of thickness remove it and place null value.
8. thickness
    - thickness is given in mm so only keep the digit and remove everything 
    - any value that doesnot start with thickness remove it
9. weight
    - remove anything that is not weight
    - some values are in gm so convert it to kg 
    - convert its data type to float
10. warranty
    - remove anything that is not year
    - convert it to integer values are given in year 
11. screen_size
    - values are given in inches remove everything except int value
    - convert data type to integer
12. resolution
    - remove any other value
    - keep only resolution value
    - 528,777 move it one side right
13. ppi
    - 528,777 take value from resolution
    - covert it to int
14. antiglare
    - remove every value except antiglare
    - convert it to binary value where 0 means not available and 1 means available
15. touchscreen
    - remove every value except touchscreen
    - convert it to binary value where 0 means not available and 1 means available
16. processor
    - make three columns (processor_brand, processor_gen, processor model)
17. cores
    - in column 363 value of thread is given 
18. threads
    - in column 363 value of cache is given
19. cache
    - some values are null cache are given in mb
    - one value is smart cache
20. graphic_card
       - make three columns (graphics_brand, graphics_capacity, graphics_model)
21. ram
    - take only capacity value make it numerical
22. hard_disk
    - 99% hard disk are ssd so i think it to convert it all t ssd and store only capacity in gb
    - convert tb values also to gb
23. battery1
    - covert this column to list using avl library so that this column becomes clean
    - make two columns out of it cell, wh
24. hdmi
    - convert this column to list and make a new column for every category and store values in it
25. wifi
    - make two columns one for wifi and one for bluetooth and store values in it
    - at some places there is no wifi value
26. usb
    - make three columns usb2 ,  usb3 , type c and store number of ports in it
    
27. camera
    - only 37 values are given so i think i will remove this column
28. others 
    - 4 null values
    - make 3 new columns (backlight keyboard, inbuilt microphone , fingerprint sensor)
    
    
    


utility - contains four categories(everyday use, bussiness, performance, gaming)

In [56]:
#her i replace "," with nothing
df["num_votes"]=df["num_votes"].str.replace(",","")

In [66]:
#here we drop num_reviews column
df.drop("num_reviews",axis=1,inplace=True)

In [68]:
#here i drop os column
df.drop("os",axis=1,inplace=True)

In [148]:
# here i make a copy of df just in case any mistake happens
temp_df=df.copy()

In [149]:
#in this case we fetch all the values which contains "thickness" in "utility" column store it in thickness column and return null value
x=df[df["utility"].str.startswith("Thickness:")]
temp_df.loc[x.index,"thickness"]=x["utility"]
temp_df.loc[x.index,"utility"]=np.nan

In [151]:
#we fetch values of weight from warranty column store it in weight and return null values
x=df[~df["warranty"].str.contains("Year")]
temp_df.loc[x.index,"weight"]=x["warranty"]
temp_df.loc[x.index,"warranty"]=np.nan

In [152]:
#in this we replace any value which not contain thickness in thickness column with null value.
x=temp_df[~temp_df["thickness"].str.startswith("Thickness:")]
temp_df.loc[x.index,"thickness"]=np.nan

In [153]:
#we replace any value which not contain weight in "weight" column with null value
x=temp_df[temp_df["weight"].str.contains("|".join(["Thickness:","Utility:"]))]
temp_df.loc[x.index,"weight"]=np.nan

In [156]:
#here we again store temp_df to df
df=temp_df

In [178]:
#this step is to be done so that masking should be applied
df["antiglare"].fillna("False",inplace=True)
df["aspect_ratio"].fillna("False",inplace=True)
df["touchscreen"].fillna("False",inplace=True)

In [181]:

temp_df=df

In [185]:
#first shift 2 antiglare values in ppi to antiglare column
x=temp_df[temp_df["ppi"].str.contains("Anti Glare")]
temp_df.loc[x.index,"antiglare"]=x["ppi"]

In [189]:
#now do the same thing for the ppi column 
x=temp_df[temp_df["ppi"].str.contains("Anti Glare")]
temp_df.loc[x.index,"ppi"]=x["resolution"]

In [192]:
#here we set the values in resolution in which ppi is given to null
temp_df.loc[temp_df[temp_df["resolution"].str.contains("PPI")].index,"resolution"]=np.nan

In [210]:
#in this we make a new column "antiglare1" and store all antiglare values from antiglare touchscreen and aspect ratio column
x=temp_df[temp_df["antiglare"].str.contains("Anti Glare")]
y=temp_df[temp_df["aspect_ratio"].str.contains("Anti Glare")]
z=temp_df[temp_df["touchscreen"].str.contains("Anti Glare")]
temp_df["antiglare1"]=x["antiglare"]
temp_df.loc[y.index,"antiglare1"]=y["aspect_ratio"]
temp_df.loc[z.index,"antiglare1"]=z["touchscreen"]

In [211]:
#in this we make a new column "aspect_ratio1" and store all antiglare values from antiglare touchscreen and aspect ratio column
x=temp_df[temp_df["antiglare"].str.contains("Aspect Ratio")]
y=temp_df[temp_df["aspect_ratio"].str.contains("Aspect Ratio")]
z=temp_df[temp_df["touchscreen"].str.contains("Aspect Ratio")]
temp_df["aspect_ratio1"]=x["antiglare"]
temp_df.loc[y.index,"aspect_ratio1"]=y["aspect_ratio"]
temp_df.loc[z.index,"aspect_ratio1"]=z["touchscreen"]

In [215]:
#in this we make a new column "touch_screen1" and store all antiglare values from antiglare touchscreen and aspect ratio column
x=temp_df[temp_df["antiglare"].str.contains("Touch")]
y=temp_df[temp_df["aspect_ratio"].str.contains("Touch")]
z=temp_df[temp_df["touchscreen"].str.contains("Touch")]
temp_df["touch_screen1"]=x["antiglare"]
temp_df.loc[y.index,"touch_screen1"]=y["aspect_ratio"]
temp_df.loc[z.index,"touch_screen1"]=z["touchscreen"]

In [220]:
# here we drop extra columns
temp_df.drop(["antiglare","aspect_ratio","touchscreen"],axis=1,inplace=True)

## Cores

In [256]:
#in this we delete "cores" column because same information is in "threads" column.
df.drop("cores",axis=1,inplace=True)

## Threads

In [283]:
#in this we split this column into two columns "cores","threads"
df["cores"]=df["threads"].str.split(",").str[0]
df["cores"].fillna("False",inplace=True)
df["cores1"]=df[df["cores"].str.contains("Core")]["cores"]
    


In [298]:
temp_df=df.copy()

In [299]:
temp_df["threads"]=temp_df["threads"].str.split(",")

In [317]:
temp_df.fillna("False",inplace=True)

In [None]:
list=[]
for i in temp_df["threads"]:
    if len(i)==2:
        list.append(i[1])
    elif i[0].endswith("Threads"):
        list.append(i[0])
    else:
        list.append(np.nan)

temp_df["threads"]=pd.Series(list)

In [323]:
df["threads"]=temp_df["threads"] 

## Cache

In [325]:
df.drop("cache",axis=1,inplace=True)

## Battery

In [329]:
df["battery1"].fillna("False",inplace=True)
new_list=[]
for i in df["battery1"]:
    my_list = ast.literal_eval(i)
    new_list.append(my_list)
df["battery"]=pd.Series(new_list)

In [338]:
temp_df=df.copy()

In [406]:
capacity=[]
cell=[]
for i in temp_df["battery"]:
    if i is not False:
        for j in i:
            if len(j.split(","))==2:
                for k in j.split(","):
                    if "Wh" in k:
                        capacity.append(k)
                    elif "Cell" in k:
                        cell.append(k)
            elif len(j.split(","))==1:
                for k in j.split(","):
                    if "Wh" in k:
                        capacity.append(k)
                        cell.append(np.nan)
                    elif "Cell" in k:
                        cell.append(k)
                        capacity.append(np.nan)
                    else:
                        capacity.append(np.nan)
                        cell.append(np.nan)
                
            
    else:
        capacity.append(np.nan)
        cell.append(np.nan)
temp_df[["battery_capacity","battery_cell"]]=pd.Series({"battery_capacity":capacity,"battery_cell":cell})

        

In [413]:
df=temp_df

## HDMI

In [424]:
df["hdmi"].str.split(",").str.len().value_counts()

2.0    437
1.0    303
3.0    222
4.0     35
5.0      4
Name: hdmi, dtype: int64

In [425]:
temp_df=df.copy()

In [431]:
temp_df["hdmi"]=temp_df["hdmi"].str.split(",")

In [450]:
temp_df["hdmi"].fillna("False",inplace=True)

In [455]:
col1=[]
col2=[]
col3=[]
col4=[]
col5=[]
for i in temp_df["hdmi"]:
    if i is not False:
        if len(i)==5:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(i[2])
            col4.append(i[3])
            col5.append(i[4])
        elif len(i)==4:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(i[2])
            col4.append(i[3])
            col5.append(np.nan)
        elif len(i)==3:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(i[2])
            col4.append(np.nan)
            col5.append(np.nan)
        elif len(i)==2:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(np.nan)
            col4.append(np.nan)
            col5.append(np.nan)
        elif len(i)==1:
            col1.append(i[0])
            col2.append(np.nan)
            col3.append(np.nan)
            col4.append(np.nan)
            col5.append(np.nan)
    else:
        col1.append(np.nan)
        col2.append(np.nan)
        col3.append(np.nan)
        col4.append(np.nan)
        col5.append(np.nan)

temp_df["col1"]=col1
temp_df["col2"]=col2
temp_df["col3"]=col3
temp_df["col4"]=col4
temp_df["col5"]=col5
       

In [464]:
temp_df["col1"].fillna("False",inplace=True)
temp_df["col2"].fillna("False",inplace=True)
temp_df["col3"].fillna("False",inplace=True)
temp_df["col4"].fillna("False",inplace=True)
temp_df["col5"].fillna("False",inplace=True)

In [467]:
a=temp_df[temp_df["col1"].str.contains("HDMI")]
b=temp_df[temp_df["col2"].str.contains("HDMI")]
c=temp_df[temp_df["col3"].str.contains("HDMI")]
d=temp_df[temp_df["col4"].str.contains("HDMI")]
e=temp_df[temp_df["col5"].str.contains("HDMI")]
temp_df["hdmi1"]=a["col1"]
temp_df.loc[b.index,"hdmi1"]=b["col2"]
temp_df.loc[c.index,"hdmi1"]=c["col3"]
temp_df.loc[d.index,"hdmi1"]=d["col4"]
temp_df.loc[e.index,"hdmi1"]=e["col5"]

In [471]:
a=temp_df[temp_df["col1"].str.contains("Ethernet")]
b=temp_df[temp_df["col2"].str.contains("Ethernet")]
c=temp_df[temp_df["col3"].str.contains("Ethernet")]
d=temp_df[temp_df["col4"].str.contains("Ethernet")]
e=temp_df[temp_df["col5"].str.contains("Ethernet")]
temp_df["ethernet"]=a["col1"]
temp_df.loc[b.index,"ethernet"]=b["col2"]
temp_df.loc[c.index,"ethernet"]=c["col3"]
temp_df.loc[d.index,"ethernet"]=d["col4"]
temp_df.loc[e.index,"ethernet"]=e["col5"]

In [472]:
a=temp_df[temp_df["col1"].str.contains("Multi")]
b=temp_df[temp_df["col2"].str.contains("Multi")]
c=temp_df[temp_df["col3"].str.contains("Multi")]
d=temp_df[temp_df["col4"].str.contains("Multi")]
e=temp_df[temp_df["col5"].str.contains("Multi")]
temp_df["multi_card_reader"]=a["col1"]
temp_df.loc[b.index,"multi_card_reader"]=b["col2"]
temp_df.loc[c.index,"multi_card_reader"]=c["col3"]
temp_df.loc[d.index,"multi_card_reader"]=d["col4"]
temp_df.loc[e.index,"multi_card_reader"]=e["col5"]

In [475]:
a=temp_df[temp_df["col1"].str.contains("VGA")]
b=temp_df[temp_df["col2"].str.contains("VGA")]
c=temp_df[temp_df["col3"].str.contains("VGA")]
d=temp_df[temp_df["col4"].str.contains("VGA")]
e=temp_df[temp_df["col5"].str.contains("VGA")]
temp_df["vga"]=a["col1"]
temp_df.loc[b.index,"vga"]=b["col2"]
temp_df.loc[c.index,"vga"]=c["col3"]
temp_df.loc[d.index,"vga"]=d["col4"]
temp_df.loc[e.index,"vga"]=e["col5"]

In [473]:
a=temp_df[temp_df["col1"].str.contains("Thunderbolt")]
b=temp_df[temp_df["col2"].str.contains("Thunderbolt")]
c=temp_df[temp_df["col3"].str.contains("Thunderbolt")]
d=temp_df[temp_df["col4"].str.contains("Thunderbolt")]
e=temp_df[temp_df["col5"].str.contains("Thunderbolt")]
temp_df["thunderbolt"]=a["col1"]
temp_df.loc[b.index,"thunderbolt"]=b["col2"]
temp_df.loc[c.index,"thunderbolt"]=c["col3"]
temp_df.loc[d.index,"thunderbolt"]=d["col4"]
temp_df.loc[e.index,"thunderbolt"]=e["col5"]

In [474]:
a=temp_df[temp_df["col1"].str.contains("Display")]
b=temp_df[temp_df["col2"].str.contains("Display")]
c=temp_df[temp_df["col3"].str.contains("Display")]
d=temp_df[temp_df["col4"].str.contains("Display")]
e=temp_df[temp_df["col5"].str.contains("Display")]
temp_df["display_port"]=a["col1"]
temp_df.loc[b.index,"display_port"]=b["col2"]
temp_df.loc[c.index,"display_port"]=c["col3"]
temp_df.loc[d.index,"display_port"]=d["col4"]
temp_df.loc[e.index,"display_port"]=e["col5"]

In [484]:
df=temp_df

## Wifi

In [487]:
temp_df=df.copy()

In [492]:
temp_df["wifi"].value_counts()
temp_df["wifi"].isnull().sum()
# only three laptops which dont have wifi so it is a useless column
df.drop("wifi",axis=1,inplace=True)

## USB

In [618]:
temp_df=df.copy()

In [619]:
temp_df["usb"]=temp_df["usb"].str.split(",")


In [620]:
col1=[]
col2=[]
col3=[]
temp_df["usb"].fillna("False",inplace=True)
for i in temp_df["usb"]:
    if i is not False:
        if len(i)==3:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(i[2])
        elif len(i)==2:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(np.nan)
        elif len(i)==1:
            col1.append(i[0])
            col2.append(np.nan)
            col3.append(np.nan)
        else:
            col1.append(np.nan)
            col2.append(np.nan)
            col3.append(np.nan)
    else:
        col1.append(np.nan)
        col2.append(np.nan)
        col3.append(np.nan)
temp_df["usb_2"]=col1
temp_df["usb_3"]=col2
temp_df["type_c"]=col3
    

In [621]:
temp_df["usb_2"].fillna("False",inplace=True)
temp_df["usb_3"].fillna("False",inplace=True)
temp_df["type_c"].fillna("False",inplace=True)

In [622]:
a=temp_df[temp_df["usb_2"].str.contains("USB 2.0")]
b=temp_df[temp_df["usb_3"].str.contains("USB 2.0")]
c=temp_df[temp_df["type_c"].str.contains("USB 2.0")]
temp_df["usb2"]=a["usb_2"]
temp_df.loc[b.index,"usb2"]=b["usb_3"]
temp_df.loc[c.index,"usb2"]=c["type_c"]


In [623]:
a=temp_df[temp_df["usb_2"].str.contains("USB 3.0")]
b=temp_df[temp_df["usb_3"].str.contains("USB 3.0")]
c=temp_df[temp_df["type_c"].str.contains("USB 3.0")]
temp_df["usb3"]=a["usb_2"]
temp_df.loc[b.index,"usb3"]=b["usb_3"]
temp_df.loc[c.index,"usb3"]=c["type_c"]

In [624]:
a=temp_df[temp_df["usb_2"].str.contains("Type-C")]
b=temp_df[temp_df["usb_3"].str.contains("Type-C")]
c=temp_df[temp_df["type_c"].str.contains("Type-C")]
temp_df["typec"]=a["usb_2"]
temp_df.loc[b.index,"typec"]=b["usb_3"]
temp_df.loc[c.index,"typec"]=c["type_c"]

In [626]:
df[["usb2","usb3","typec"]]=temp_df[["usb2","usb3","typec"]]

## Camera

In [539]:
df["camera"].isnull().sum()
# there are almost 983 null values
df.drop("camera",axis=1,inplace=True)

KeyError: 'camera'

## others

In [548]:
temp_df=df.copy()

In [549]:
temp_df["others"]=temp_df["others"].str.split(",")

In [551]:
temp_df["others"].fillna("False",inplace=True)

In [560]:
col1=[]
col2=[]
col3=[]
for i in temp_df["others"]:
    if i is not False:
        if len(i)==3:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(i[2])
        elif len(i)==2:
            col1.append(i[0])
            col2.append(i[1])
            col3.append(np.nan)
        elif len(i)==1:
            col1.append(i[0])
            col2.append(np.nan)
            col3.append(np.nan)
        else:
            col1.append(np.nan)
            col2.append(np.nan)
            col3.append(np.nan)
    else:
        col1.append(np.nan)
        col2.append(np.nan)
        col3.append(np.nan)

temp_df["col11"]=col1
temp_df["col21"]=col2
temp_df["col31"]=col3

            

In [566]:
temp_df["col11"].fillna("False",inplace=True)
temp_df["col21"].fillna("False",inplace=True)
temp_df["col31"].fillna("False",inplace=True)

In [570]:
a=temp_df[temp_df["col11"].str.contains("Backlit")]
b=temp_df[temp_df["col21"].str.contains("Backlit")]
c=temp_df[temp_df["col31"].str.contains("Backlit")]
temp_df["backlit"]=a["col11"]
temp_df.loc[b.index,"backlit"]=b["col21"]
temp_df.loc[c.index,"backlit"]=c["col31"]

In [571]:
a=temp_df[temp_df["col11"].str.contains("Sensor")]
b=temp_df[temp_df["col21"].str.contains("Sensor")]
c=temp_df[temp_df["col31"].str.contains("Sensor")]
temp_df["fingerprint_sensor"]=a["col11"]
temp_df.loc[b.index,"fingerprint_sensor"]=b["col21"]
temp_df.loc[c.index,"fingerprint_sensor"]=c["col31"]

In [572]:
a=temp_df[temp_df["col11"].str.contains("Inbuilt")]
b=temp_df[temp_df["col21"].str.contains("Inbuilt")]
c=temp_df[temp_df["col31"].str.contains("Inbuilt")]
temp_df["inbuilt_microphone"]=a["col11"]
temp_df.loc[b.index,"inbuilt_microphone"]=b["col21"]
temp_df.loc[c.index,"inbuilt_microphone"]=c["col31"]

In [576]:
df=temp_df

In [607]:
df.columns

Index(['price', 'Brand', 'num_votes', 'Ratings', 'utility', 'thickness',
       'weight', 'warranty', 'screen_size', 'resolution', 'ppi', 'processor',
       'threads', 'graphic_card', 'ram', 'hard_disk', 'battery1', 'hdmi',
       'usb', 'others', 'antiglare1', 'aspect_ratio1', 'touch_screen1',
       'battery', 'cores', 'cores1', 'threads1', 'battery_capacity',
       'battery_cell', 'hdmi1', 'ethernet', 'multi_card_reader', 'thunderbolt',
       'display_port', 'vga', 'usb2', 'typec', 'backlit', 'fingerprint_sensor',
       'inbuilt_microphone', 'usb3'],
      dtype='object')

In [590]:
df.drop(["col1","col2","col3","col4","col5","usb_2","usb_3","type_c","col11","col21","col31"],axis=1,inplace=True)

In [608]:
df.drop(["battery1","battery","hdmi","others","usb","usb2","usb3","typec"],axis=1,inplace=True)

In [670]:
df.drop(["usb","cores","threads1"],axis=1,inplace=True)

In [668]:
# one value is different in aspect_ratio1 so we replace it with np.nan.
df["aspect_ratio1"] = np.where(df["aspect_ratio1"] == "250 nits, 141 ppi, Color Gamut: 45%NTSC Aspect Ratio", np.nan, df["aspect_ratio1"])    

In [779]:
df.columns

Index(['price', 'Brand', 'num_votes', 'Ratings', 'utility', 'thickness',
       'weight', 'warranty', 'screen_size', 'resolution', 'ppi', 'threads',
       'graphic_card', 'ram', 'hard_disk', 'antiglare1', 'aspect_ratio1',
       'touch_screen1', 'cores1', 'battery_capacity', 'battery_cell', 'hdmi1',
       'ethernet', 'multi_card_reader', 'thunderbolt', 'display_port', 'vga',
       'backlit', 'fingerprint_sensor', 'inbuilt_microphone', 'usb2', 'usb3',
       'typec', 'processor_gen', 'processor_brand', 'processor_model',
       'graphics_brand', 'graphics_capacity', 'graphics_model'],
      dtype='object')

## Processor

In [681]:
temp_df=df.copy()

In [726]:
# There is one column in which wrong brand is given
temp_df["processor"].replace("AMD Core i3 N305","Intel Core i3 N305",inplace=True)

In [728]:
# Here i make a processor brand column and store all the processor_brands in it.
list1=[]
for i in temp_df["processor"]:
    if "Intel" in i or "intel" in i:
        list1.append("intel")
    elif "AMD" in i or "Amd" in i:
        list1.append("amd")
    elif "Apple" in i:
        list1.append("apple")
    elif "MediaTek" in i:
        list1.append("mediatek")
    else:
        list1.append(np.nan)
temp_df["processor_brand"]=list1

In [739]:
# Here i make a processor_gen column and store all the generation in it.
list1=[]
for i in temp_df["processor"]:
    if "13th" in i:
        list1.append("13")
    elif "12th" in i:
        list1.append("12")
    elif "11th" in i:
        list1.append("11")
    elif "10th" in i:
        list1.append("10")
    elif "9th" in i:
        list1.append("9")
    elif "8th" in i:
        list1.append("8")
    elif "7th" in i:
        list1.append("7")
    elif "6th" in i:
        list1.append("6")
    elif "5th" in i:
        list1.append("5")
    elif "4th" in i:
        list1.append("4")
    elif "3rd" in i:
        list1.append("3")
    else:
        list1.append(np.nan)
temp_df["processor_gen"]=list1

In [740]:
# Here i make a processor_model column and store all the processor model in it.
list1=[]
for i in temp_df["processor"]:
    if "i5" in i:
        list1.append("i5")
    elif "i3" in i:
        list1.append("i3")
    elif "i7" in i:
        list1.append("i7")
    elif "i9" in i:
        list1.append("i9")
    elif "M1" in i:
        list1.append("M1")
    elif "M2" in i:
        list1.append("M2")
    elif "3" in i:
        list1.append("3")
    elif "5" in i:
        list1.append("5")
    elif "7" in i or "Ryzen 7040":
        list1.append("7")
    elif "9" in i:
        list1.append("9")
    elif "Athlon" in i:
        list1.append("athlon")
    elif "Celeron" in i:
        list1.append("celeron")
    elif "Pentium" in i:
        list1.append("pentium")
    else:
        list1.append(np.nan)
temp_df["processor_model"]=list1

In [741]:
df=temp_df

In [743]:
df.drop("processor",axis=1,inplace=True)

## Graphic Card

In [752]:
df["graphic_card"].isnull().sum()
temp_df=df.copy()

In [757]:
list1=[]
for i in temp_df["graphic_card"]:
    if "Intel" in i or "Iris" in i or "UHD" in i:
        list1.append("intel")
    elif "NVIDIA" in i or "Nvidia" in i or "Geforce" in i:
        list1.append("nvidia")
    elif "AMD" in i or "Radeon" in i :
        list1.append("amd")
    elif "ARM" in i:
        list1.append("arm")
    elif "Apple" in i or "8 Core" in i or "10 Core" in i or "16 Core" in i or "38 Core" in i or "8-Core" in i or "10-Core" in i or "16-core" in i or "38-core" in i:
        list1.append("apple")
    else:
        list1.append(np.nan)
temp_df["graphics_brand"]=list1
        

In [1058]:
temp_df["graphics_capacity"]=temp_df["graphic_card"].str.split().str.get(0).str.strip()

In [1069]:
temp_df["graphics_capacity"]=temp_df["graphics_capacity"].apply(lambda x: x if x in ['2','4','6','8','10','12','16'] else np.nan)

In [1070]:
df["graphics_capacity"]=temp_df["graphics_capacity"]

In [766]:
list1=[]
for i in temp_df["graphic_card"]:
    if "RTX 2050" in i or "RTX2050" in i:
        list1.append("rtx2050")
    elif "RTX 2060" in i or "RTX2060" in i:
        list1.append("rtx2060")
    elif "RTX 3050" in i or "RTX3050" in i:
        list1.append("rtx3050")
    elif "RTX 3060" in i or "RTX3060" in i:
        list1.append("rtx3060")
    elif "RTX 4050" in i or "RTX4050" in i:
        list1.append("rtx4050")
    elif "RTX 4060" in i or "RTX4060" in i:
        list1.append("rtx4060")
    elif "RTX 4070" in i or "RTX4070" in i:
        list1.append("rtx4070")
    elif "RTX 4080" in i or "RTX4080" in i:
        list1.append("rtx4080")
    elif "RTX 4090" in i or "RTX4090" in i:
        list1.append("rtx4090")
    elif "RTX 3070 Ti" in i or "RTX3070 Ti" in i:
        list1.append("rtx3070ti")
    elif "RTX 3060 Ti" in i or "RTX3060 Ti" in i:
        list1.append("rtx3060ti")
    elif "RTX 3080 Ti" in i or "RTX3080 Ti" in i:
        list1.append("rtx3080ti")
    elif "GTX 1650" in i or "GTX1650" in i:
        list1.append("gtx1650")
    elif "GTX 2050" in i or "GTX 2050" in i:
        list1.append("gtx2050")
    elif "T500" in i or "T 500" in i:
        list1.append("t500")
    elif "T 550" in i or "T550" in i:
        list1.append("t550")
    elif "T 600" in i or "T600" in i:
        list1.append("t600")
    elif "MX130" in i or "MX 130" in i:
        list1.append("mx130")
    elif "MX450" in i or "MX 450" in i:
        list1.append("mx450")
    elif "MX 550" in i or "MX550" in i:
        list1.append("mx550")
    elif "MX 570" in i or "MX570" in i:
        list1.append("mx570")
    elif "RX 5500M" in i or "RX5500M" in i:
        list1.append("rx5500m")
    elif "RX 5600M" in i or "RX5600M" in i:
        list1.append("rx5600m")
    elif "RX 6500M" in i or "RX6500M" in i:
        list1.append("rx6500m")
    elif "RX 7600S" in i or "RX7600S" in i:
        list1.append("rx7600s")
    else:
        list1.append("Integrated")
    
temp_df["graphics_model"]=list1
        

In [768]:
df=temp_df

**Now we eliminate any other issues related to columns**

**Price**

In [777]:
print(df["price"].isnull().sum())
print(df["price"].info())
print(df["price"].value_counts())
print(df["price"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: price
Non-Null Count  Dtype
--------------  -----
1020 non-null   int64
dtypes: int64(1)
memory usage: 8.1 KB
None
59990     20
69990     18
94990     17
44990     17
36990     15
54990     15
49990     15
62990     15
63990     14
109990    14
52990     14
58990     14
79990     14
34990     13
67990     12
57990     12
64990     11
89990     10
23990     10
47990     10
35990     10
48990     10
74990      9
60990      9
37990      9
76990      8
42990      8
51990      8
72990      8
32990      8
56990      8
99990      8
114990     7
73990      7
38990      7
53990      7
29990      7
45990      7
84990      7
61990      6
30990      6
40990      6
39990      6
82990      6
87990      6
199990     6
27990      6
129990     6
86990      5
16990      5
88990      5
65990      5
100990     5
85990      5
41990      5
43990      5
66990      5
138990     4
87490      4
41490      4
134990     4
75990

**Brand**

In [778]:
print(df["Brand"].isnull().sum())
print(df["Brand"].info())
print(df["Brand"].value_counts())
print(df["Brand"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: Brand
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
Asus         201
HP           193
Lenovo       187
MSI          121
Dell         103
Acer          94
Infinix       22
LG            17
Samsung       12
Gigabyte      12
Apple         10
Fujitsu        9
Honor          6
Wings          6
Xiaomi         5
Ultimus        3
AXL            3
Avita          2
Razer          2
Huawei         2
Chuwi          2
Realme         2
Primebook      2
Vaio           1
Ninkear        1
ASUS           1
Walker         1
Name: Brand, dtype: int64
count     1020
unique      27
top       Asus
freq       201
Name: Brand, dtype: object


**Number of Votes**

In [780]:
print(df["num_votes"].isnull().sum())
print(df["num_votes"].info())
print(df["num_votes"].value_counts())
print(df["num_votes"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: num_votes
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
91       25
106      22
83       21
110      21
69       20
94       19
58       19
85       18
89       18
74       18
111      17
101      17
99       17
56       16
81       16
103      15
70       15
107      15
73       14
90       14
109      14
77       13
72       13
55       13
67       13
76       13
80       12
66       12
59       12
68       12
60       12
65       12
105      11
51       11
86       11
71       11
100      11
87       11
53       11
96       10
75       10
54       10
97       10
104      10
95        9
82        9
52        9
63        9
57        9
84        8
78        7
64        7
79        7
93        7
61        6
92        6
112       6
116       6
108       6
88        6
98        5
62        5
115       5
124       4
140       4
123      

**Rating**

In [781]:
print(df["Ratings"].isnull().sum())
print(df["Ratings"].info())
print(df["Ratings"].value_counts())
print(df["Ratings"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: Ratings
Non-Null Count  Dtype  
--------------  -----  
1020 non-null   float64
dtypes: float64(1)
memory usage: 8.1 KB
None
4.30    93
4.10    85
4.20    85
4.15    74
4.40    72
4.25    65
4.60    60
4.50    56
4.35    56
4.75    56
4.00    55
4.55    51
4.70    49
4.05    49
4.45    45
4.65    44
3.95    12
3.85     3
3.90     3
3.70     2
3.65     2
3.55     1
3.75     1
3.80     1
Name: Ratings, dtype: int64
count    1020.000000
mean        4.338480
std         0.232508
min         3.550000
25%         4.150000
50%         4.300000
75%         4.550000
max         4.750000
Name: Ratings, dtype: float64


**Utility**

In [783]:
print(df["utility"].isnull().sum())
print(df["utility"].info())
print(df["utility"].value_counts())
print(df["utility"].describe())
#here 8 null values
#there are four category in this column "Performance","Gaming","Everyday Use","Bussiness" and remove "utility:"

8
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: utility
Non-Null Count  Dtype 
--------------  ----- 
1012 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
Utility: Performance                                    429
Utility: Gaming                                         175
Utility: Everyday Use                                   143
Utility: Business, Performance                           68
Utility: Everyday Use, Business, Performance             47
Utility: Gaming, Performance                             40
Utility: Everyday Use, Performance                       32
Utility: Performance, Business                           17
Utility: Performance, Everyday Use                       11
Utility: Everyday Use, Gaming, Performance               10
Utility: Business                                         9
Utility: Everyday Use, Gaming                             8
Utility: Everyday Use, Gaming, Business, Performance      8
Utility: Ev

In [784]:
temp_df=df.copy()

In [790]:
temp_df["utility"]=temp_df["utility"].str.replace("Utility:","").str.strip().str.split(",")

In [798]:
temp_df["utility"].fillna("False",inplace=True)

In [809]:
temp_df['everyday_use'] = temp_df['utility'].apply(lambda x: 1 if 'Everyday Use' in x else 0)
temp_df['business'] = temp_df['utility'].apply(lambda x: 1 if 'Business' in x else 0)
temp_df['performance'] = temp_df['utility'].apply(lambda x: 1 if 'Performance' in x else 0)
temp_df['gaming'] = temp_df['utility'].apply(lambda x: 1 if 'Gaming' in x else 0)
        
    

In [812]:
temp_df.drop("Everyday Use",axis=1,inplace=True)

In [813]:
df=temp_df

In [815]:
df.drop("utility",axis=1,inplace=True)

**Thickness**

In [820]:
print(df["thickness"].isnull().sum())
print(df["thickness"].info())
print(df["thickness"].value_counts())
print(df["thickness"].describe())
#233 are null values
#remove "Thickness:" and keep only integer value because all values are in mm

233
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: thickness
Non-Null Count  Dtype 
--------------  ----- 
787 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
Thickness: 19.9 mmAverage       124
Thickness: 17.9 mmSlim           89
Thickness: 18.9 mmAverage        29
Thickness: 24.9 mmThick          22
Thickness: 23.5 mmThick          17
Thickness: 17 mmSlim             17
Thickness: 21.7 mmThick          16
Thickness: 26.9 mmThick          15
Thickness: 19 mmAverage          14
Thickness: 18.9 mmSlim           13
Thickness: 24.2 mmThick          12
Thickness: 16.9 mmSlim           12
Thickness: 18.4 mmAverage        12
Thickness: 25.2 mmThick          11
Thickness: 22.5 mmThick          10
Thickness: 17.9 mmAverage         9
Thickness: 15.9 mmSlim            9
Thickness: 19.35 mmAverage        9
Thickness: 24 mmThick             9
Thickness: 19 mmSlim              8
Thickness: 18 mmAverage           8
Thickness: 23.9 mmThick        

In [821]:
temp_df=df.copy()

In [824]:
temp_df["thickness"].fillna("False",inplace=True)

In [834]:
temp_df["thickness"]=temp_df["thickness"].str.replace("Thickness:","").str.strip().str.split().str.get(0).str.replace("False","0")

In [836]:
df=temp_df

**Weight**

In [844]:
print(df["weight"].isnull().sum())
print(df["weight"].info())
print(df["weight"].value_counts())
print(df["weight"].describe())
#here 85 null values
#remove everything except float value, convert type as float, some values are in gram

85
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: weight
Non-Null Count  Dtype 
--------------  ----- 
935 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
1.7 kgAverage      70
1.8 kgAverage      54
2.25 kgHeavy       45
1.4 kgLight        36
1.69 kgAverage     28
1.5 kgLight        24
1.86 kgHeavy       22
1.8 kgHeavy        22
1.65 kgAverage     20
2.6 kgHeavy        19
2.37 kgHeavy       17
1.6 kgLight        17
1.59 kgLight       16
2.3 kgHeavy        15
1.41 kgLight       15
2.5 kgHeavy        14
1.74 kgAverage     14
1.46 kgLight       12
1.3 kgLight        12
1.78 kgHeavy       12
1.38 kgLight       11
2.1 kgHeavy        11
1.63 kgAverage     11
1.75 kgHeavy       11
1.62 kgLight       11
1.24 kgLight       11
1.49 kgLight       11
1.47 kgLight       11
1.86 kgAverage     11
1.6 kgAverage      10
2.7 kgHeavy        10
2.2 kgHeavy        10
2.14 kgHeavy        9
1.73 kgLight        8
1.37 kgLight        8
1.5 kgAverage       8

In [840]:
temp_df=df.copy()

In [871]:
temp_df["weight"]=temp_df["weight"].str.split().str.get(0).str.strip().astype("float")

In [878]:
temp_df["weight"]=temp_df["weight"].apply(lambda x: x if x<10 else x/1000)

In [879]:
df=temp_df

**Warranty**

In [885]:
print(df["warranty"].isnull().sum())
print(df["warranty"].info())
print(df["warranty"].value_counts())
print(df["warranty"].describe())
#here 5 null values
#remove everything except int value and convert it to integer column

5
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: warranty
Non-Null Count  Dtype 
--------------  ----- 
1015 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
1 Year Warranty    890
2 Year Warranty    113
3 Year Warranty     12
Name: warranty, dtype: int64
count                1015
unique                  3
top       1 Year Warranty
freq                  890
Name: warranty, dtype: object


In [889]:
df["warranty"]=df["warranty"].str.split().str.get(0)

In [901]:
df["warranty"]=df["warranty"].astype("float")

**screen size**

In [892]:
print(df["screen_size"].isnull().sum())
print(df["screen_size"].info())
print(df["screen_size"].value_counts())
print(df["screen_size"].describe())
#keep only integer value and convert this column to integer

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: screen_size
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
15.6 inchesAverage     308
15.6 inchesLarge       175
14 inchesSmall         149
14 inchesAverage       106
16 inchesLarge          79
16 inchesAverage        45
15.6 inchesLargest      31
13.3 inchesSmall        26
16.1 inchesLarge        22
17.3 inchesLarge        10
17.3 inchesLargest      10
16 inchesLargest         7
17 inchesLarge           5
15.6 inchesSmall         5
17 inchesAverage         5
13.4 inchesSmall         4
11.6 inchesLarge         3
14.1 inchesAverage       3
18 inchesLargest         3
16 inchesSmallest        3
11.6 inchesSmallest      3
14.1 inchesLargest       3
14.5 inchesAverage       2
11.6 inchesSmall         2
13.5 inchesSmall         2
14.2 inchesSmall         2
13 inchesSmallest        1
14.1 inchesLarge         1
13.6 inchesSmall         1
16.2

In [902]:
df["screen_size"]=df["screen_size"].str.split().str.get(0).astype("float")

**Resolution**

In [1075]:
print(df["resolution"].isnull().sum())
print(df["resolution"].info())
print(df["resolution"].value_counts())
print(df["resolution"].describe())
#2 null values
#make two columns "screen_width" and "screen_height" and store respective values

2
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: resolution
Non-Null Count  Dtype 
--------------  ----- 
1018 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
1920 x 1080 pixelsAverage    625
1920 x 1200 pixelsGood        97
2560 x 1600 pixelsAverage     38
1920 x 1200 pixelsAverage     30
2880 x 1800 pixelsGood        27
1366 x 768 pixelsAverage      24
1366 x 768 pixelsBad          20
2560 x 1600 pixelsGood        19
3840 x 2400 pixelsBest        17
1920 x 1080 pixelsBad         14
1920 x 1080 pixelsBest         9
1600 x 2560 pixelsAverage      8
2560 x 1440 pixelsGood         8
1080 x 1920 pixelsAverage      7
3200 x 1800 pixelsAverage      7
2880 x 1620 pixelsGood         7
1920 x 1200 pixelsPoor         6
3200 x 2000 pixelsGood         6
2560 x 1440 pixelsAverage      5
1366 x 768 pixelsPoor          4
1600 x 2560 pixelsGood         3
1366 x 768 pixelsGood          3
1200 x 1920 pixelsAverage      2
3456 x 2160 pixelsAverage    

In [1081]:
df["resolution_width"]=df["resolution"].str.split().str.get(0).str.strip().astype("float")
df["resolution_height"]=df["resolution"].str.split().str.get(2).str.strip().astype("float")

In [1082]:
df.drop("resolution",axis=1,inplace=True)

**PPI**

In [910]:
print(df["ppi"].isnull().sum())
print(df["ppi"].info())
print(df["ppi"].value_counts())
print(df["ppi"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: ppi
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
~ 141 PPIAverage    448
~ 157 PPIHigh       103
~ 142 PPIAverage     43
~ 162 PPIHigh        35
~ 157 PPIAverage     29
~ 243 PPIHigh        22
~ 137 PPILow         21
~ 283 PPIHigh        19
~ 112 PPILow         18
~ 141 PPILow         15
~ 189 PPIAverage     14
~ 170 PPIHigh        12
~ 216 PPIHigh        12
~ 189 PPIHigh        11
~ 227 PPIHigh        11
~ 162 PPIAverage     10
~ 127 PPILow         10
~ 283 PPIHighest     10
~ 112 PPIAverage      9
~ 100 PPILowest       9
~ 182 PPIAverage      8
~ 147 PPIHigh         8
~ 255 PPIHigh         7
~ 184 PPIHigh         7
~ 212 PPIHigh         7
~ 135 PPIAverage      7
~ 100 PPILow          6
~ 140 PPILow          5
~ 138 PPILow          5
~ 229 PPIAverage      4
~ 290 PPIHigh         4
~ 236 PPIHigh         4
~ 166 PPIHigh         4
~ 1

In [914]:
df["ppi"]=df["ppi"].str.split().str.get(1)

In [918]:
df["ppi"]=df["ppi"].astype("int")

**Threads**

In [932]:
print(df["threads"].isnull().sum())
print(df["threads"].info())
print(df["threads"].value_counts())
print(df["threads"].describe())

22
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: threads
Non-Null Count  Dtype  
--------------  -----  
998 non-null    float64
dtypes: float64(1)
memory usage: 8.1 KB
None
12.0    352
16.0    255
8.0     190
20.0     72
4.0      60
2.0      36
32.0     16
24.0     14
6.0       3
Name: threads, dtype: int64
count    998.000000
mean      12.466934
std        5.249484
min        2.000000
25%        8.000000
50%       12.000000
75%       16.000000
max       32.000000
Name: threads, dtype: float64


In [921]:
df["threads"]=df["threads"].str.split().str.get(0).str.strip()

In [926]:
df["threads"]=df["threads"].astype("float")

In [928]:
df.drop("graphic_card",axis=1,inplace=True)

**RAM**

In [933]:
print(df["ram"].isnull().sum())
print(df["ram"].info())
print(df["ram"].value_counts())
print(df["ram"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: ram
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
8 GB DDR4 RAMAverage               226
16 GB DDR4 RAMAverage              163
16 GB DDR5 RAMAverage              110
16 GB DDR4 RAMLargest               84
16 GB LPDDR5 RAMAverage             63
8 GB LPDDR5 RAMAverage              36
32 GB DDR5 RAMLargest               33
16 GB DDR5 RAMLargest               22
16 GB LPDDR5 RAMLargest             20
8 GB DDR4 RAMLargest                20
16 GB DDR4 RAMLarge                 17
8 GB DDR5 RAMAverage                16
8 GB DDR5 RAMSmallest               15
16 GB LPDDR4X RAMLargest            13
16 GB LPDDR5  RAMAverage            13
8 GB DDR4 RAMSmallest                9
32 GB DDR5 RAMAverage                8
8 GB DDR4  RAMAverage                8
16 GB  LPDDR5 RAMAverage             8
8 GB RAMAverage                      7
8 GB LPDDR4X RA

In [938]:
df["ram"]=df["ram"].str.split().str.get(0).astype("int")

**Hard-Disk**

In [1167]:
print(df["hard_disk"].isnull().sum())
print(df["hard_disk"].info())
print(df["hard_disk"].value_counts())
print(df["hard_disk"].describe())
#make two columns hard_disk,ssd fill the respective values in it

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: hard_disk
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
512 GB SSD                           727
1 TB SSD                             200
256 GB SSD                            47
2 TB SSD                              13
1 TB HARD Disk, 256 GB SSDAverage     11
64 GB SSD                              6
128 GB SSD                             5
64 GB HARD DiskLarge                   3
1 TB HARD Disk, 512 GB SSDSmall        3
1 TB HARD DiskLargest                  1
4 TB SSD                               1
128 GB HARD DiskLargest                1
32 GB HARD DiskSmallest                1
1 TB HARD DiskAverage                  1
Name: hard_disk, dtype: int64
count           1020
unique            14
top       512 GB SSD
freq             727
Name: hard_disk, dtype: object


In [1168]:
df["hard_disk"].fillna("False",inplace=True)

In [1177]:
df["hdd"]=df["hard_disk"].str.split(",").str.get(0).str.strip()

In [1178]:
df["ssd"]=df["hard_disk"].str.split(",").str.get(1).str.strip()

In [1180]:
temp_df["ssd"]=df["hdd"].apply(lambda x: x if "SSD" in x else np.nan )


In [1182]:
df.loc[temp_df["ssd"].index,"ssd"]=temp_df["ssd"]

In [1186]:
df["ssd"]=df["ssd"].str.split().str.get(0).astype("float")

In [1188]:
df["ssd"].fillna(0,inplace=True)

In [1190]:
df["ssd"]=df["ssd"].apply(lambda x: x*1024 if x<10 else x)

In [1193]:
df["hdd"]=df["hdd"].apply(lambda x: x if "HARD" in x else 0)

In [1195]:
df["hdd"]=df["hdd"].str.split().str.get(0).str.strip().astype("float")

In [1196]:
df["hdd"]=df["hdd"].apply(lambda x: x*1024 if x<10 else x)

**Antiglare**

In [941]:
print(df["antiglare1"].isnull().sum())
print(df["antiglare1"].info())
print(df["antiglare1"].value_counts())
print(df["antiglare1"].describe())

167
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: antiglare1
Non-Null Count  Dtype 
--------------  ----- 
853 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
Anti Glare    853
Name: antiglare1, dtype: int64
count            853
unique             1
top       Anti Glare
freq             853
Name: antiglare1, dtype: object


In [944]:
df["antiglare1"].fillna("False",inplace=True)

In [947]:
df["antiglare1"]=df["antiglare1"].apply(lambda x: 1 if "Anti Glare" in x else 0)

**Touchscreen**

In [950]:
print(df["touch_screen1"].isnull().sum())
print(df["touch_screen1"].info())
print(df["touch_screen1"].value_counts())
print(df["touch_screen1"].describe())

928
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: touch_screen1
Non-Null Count  Dtype 
--------------  ----- 
92 non-null     object
dtypes: object(1)
memory usage: 8.1+ KB
None
Touch Screen    92
Name: touch_screen1, dtype: int64
count               92
unique               1
top       Touch Screen
freq                92
Name: touch_screen1, dtype: object


In [952]:
df["touch_screen1"].fillna("False",inplace=True)

In [953]:
df["touch_screen1"]=df["touch_screen1"].apply(lambda x: 1 if "Touch Screen" in x else 0).astype("int")

**Cores**

In [954]:
print(df["cores1"].isnull().sum())
print(df["cores1"].info())
print(df["cores1"].value_counts())
print(df["cores1"].describe())

5
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: cores1
Non-Null Count  Dtype 
--------------  ----- 
1015 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
Hexa Core              145
10 Cores (2P + 8E)     135
Quad Core              128
Octa Core              120
12 Cores (4P + 8E)     108
Dual Core               91
14 Cores (6P + 8E)      76
Octa Core (4P + 4E)     73
Hexa Core (2P + 4E)     55
10 Cores (6P + 4E)      44
24 Cores (8P + 16E)     17
16 Cores (8P + 8E)      13
5 Cores (1P + 4E)        3
16 Cores                 2
10 Cores                 2
14 Cores                 1
12 Cores (8P + 4E)       1
12 Cores                 1
Name: cores1, dtype: int64
count          1015
unique           18
top       Hexa Core
freq            145
Name: cores1, dtype: object


In [955]:
df["cores1"].fillna("False",inplace=True)

In [958]:
df["cores1"]=df["cores1"].str.split().str.get(0)

In [963]:
def core(x):
    if x=="Hexa":
        return 6
    elif x=="Octa":
        return 8
    elif x=="Quad":
        return 4
    elif x=="Dual":
        return 2
    elif x=="False":
        return np.nan
    else:
        return x
df["cores1"]=df["cores1"].apply(core).astype("float")

**Battery Capacity**

In [965]:
print(df["battery_capacity"].isnull().sum())
print(df["battery_capacity"].info())
print(df["battery_capacity"].value_counts())
print(df["battery_capacity"].describe())
#254 null values

254
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: battery_capacity
Non-Null Count  Dtype 
--------------  ----- 
766 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
41 Wh                   90
50 Wh                   58
90 Wh                   44
45 Wh                   42
53.5 Wh                 40
70 Wh                   32
52.5 Wh                 25
54 Wh                   22
51 Wh                   20
42 Wh                   20
86 Wh                   18
50 Wh BatteryGood       17
55 Wh                   14
70 Wh BatteryGood       13
40 Wh                   12
63 Wh                   12
52.5 Wh BatteryGood     11
97 Wh                   11
83 Wh                   11
45 Wh BatteryGood       11
37 Wh                   10
52.4 Wh                  9
48 Wh                    9
60 Wh                    9
56.6 Wh                  9
56 Wh                    9
99 Wh BatteryGood        8
47 Wh                    8
43 Wh                   

In [969]:
df["battery_capacity"]=df["battery_capacity"].str.split().str.get(0)

In [970]:
df["battery_capacity"]=df["battery_capacity"].astype("float")

**Battery Cell**

In [971]:
print(df["battery_cell"].isnull().sum())
print(df["battery_cell"].info())
print(df["battery_cell"].value_counts())
print(df["battery_cell"].describe())

202
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: battery_cell
Non-Null Count  Dtype 
--------------  ----- 
818 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
 3 Cell BatteryGood    389
 4 Cell BatteryGood    155
3 Cell Battery         133
 6 Cell BatteryGood     52
4 Cell Battery          44
 2 Cell BatteryGood     29
2 Cell Battery           9
6 Cell Battery           7
Name: battery_cell, dtype: int64
count                     818
unique                      8
top        3 Cell BatteryGood
freq                      389
Name: battery_cell, dtype: object


In [974]:
df["battery_cell"]=df["battery_cell"].str.split().str.get(0).astype("float")

**HDMI**

In [975]:
print(df["hdmi1"].isnull().sum())
print(df["hdmi1"].info())
print(df["hdmi1"].value_counts())
print(df["hdmi1"].describe())

59
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: hdmi1
Non-Null Count  Dtype 
--------------  ----- 
961 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
HDMI     526
 HDMI    435
Name: hdmi1, dtype: int64
count      961
unique       2
top       HDMI
freq       526
Name: hdmi1, dtype: object


In [976]:
df["hdmi1"].fillna("False",inplace=True)

In [979]:
df["hdmi1"]=df["hdmi1"].str.strip().apply(lambda x: 1 if "HDMI" in x else 0).astype("int")

**Ethernet**

In [980]:
print(df["ethernet"].isnull().sum())
print(df["ethernet"].info())
print(df["ethernet"].value_counts())
print(df["ethernet"].describe())

584
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: ethernet
Non-Null Count  Dtype 
--------------  ----- 
436 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
Ethernet (LAN)    436
Name: ethernet, dtype: int64
count                436
unique                 1
top       Ethernet (LAN)
freq                 436
Name: ethernet, dtype: object


In [982]:
df["ethernet"].fillna("False",inplace=True)

In [986]:
df["ethernet"]=df["ethernet"].str.split().str.get(0).apply(lambda x: 1 if "Ethernet" in x else 0).astype("int")

**Multi Card Reader**

In [987]:
print(df["multi_card_reader"].isnull().sum())
print(df["multi_card_reader"].info())
print(df["multi_card_reader"].value_counts())
print(df["multi_card_reader"].describe())

715
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: multi_card_reader
Non-Null Count  Dtype 
--------------  ----- 
305 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
 Multi Card Reader    302
Multi Card Reader       3
Name: multi_card_reader, dtype: int64
count                    305
unique                     2
top        Multi Card Reader
freq                     302
Name: multi_card_reader, dtype: object


In [988]:
df["multi_card_reader"].fillna("False",inplace=True)

In [989]:
df["multi_card_reader"]=df["multi_card_reader"].str.strip().apply(lambda x: 1 if "Multi Card Reader" in x else 0).astype("int")

**Thunderbolt**

In [991]:
print(df["thunderbolt"].isnull().sum())
print(df["thunderbolt"].info())
print(df["thunderbolt"].value_counts())
print(df["thunderbolt"].describe())

743
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: thunderbolt
Non-Null Count  Dtype 
--------------  ----- 
277 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
 Thunderbolt    243
Thunderbolt      34
Name: thunderbolt, dtype: int64
count              277
unique               2
top        Thunderbolt
freq               243
Name: thunderbolt, dtype: object


In [992]:
df["thunderbolt"].fillna("False",inplace=True)

In [995]:
df["thunderbolt"]=df["thunderbolt"].str.strip().apply(lambda x: 1 if "Thunderbolt" in x else 0).astype("int")

**Display Port**

In [996]:
print(df["display_port"].isnull().sum())
print(df["display_port"].info())
print(df["display_port"].value_counts())
print(df["display_port"].describe())

1000
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: display_port
Non-Null Count  Dtype 
--------------  ----- 
20 non-null     object
dtypes: object(1)
memory usage: 8.1+ KB
None
 Display Port    18
Display Port      2
Name: display_port, dtype: int64
count                20
unique                2
top        Display Port
freq                 18
Name: display_port, dtype: object


In [998]:
df["display_port"].fillna("False",inplace=True)

In [999]:
df["display_port"]=df["display_port"].str.strip().apply(lambda x: 1 if "Display Port" in x else 0).astype("int")

**VGA**

In [1001]:
print(df["vga"].isnull().sum())
print(df["vga"].info())
print(df["vga"].value_counts())
print(df["vga"].describe())

1016
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: vga
Non-Null Count  Dtype 
--------------  ----- 
4 non-null      object
dtypes: object(1)
memory usage: 8.1+ KB
None
 VGA    4
Name: vga, dtype: int64
count        4
unique       1
top        VGA
freq         4
Name: vga, dtype: object


In [1002]:
df["vga"].fillna("False",inplace=True)

In [1003]:
df["vga"]=df["vga"].str.strip().apply(lambda x: 1 if "VGA" in x else 0).astype("int")

**Back Light**

In [1007]:
print(df["backlit"].isnull().sum())
print(df["backlit"].info())
print(df["backlit"].value_counts())
print(df["backlit"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: backlit
Non-Null Count  Dtype
--------------  -----
1020 non-null   int32
dtypes: int32(1)
memory usage: 4.1 KB
None
1    816
0    204
Name: backlit, dtype: int64
count    1020.000000
mean        0.800000
std         0.400196
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: backlit, dtype: float64


In [1005]:
df["backlit"].fillna("False",inplace=True)

In [1006]:
df["backlit"]=df["backlit"].str.strip().apply(lambda x: 1 if "Backlit Keyboard" in x else 0).astype("int")

**Fingerprint Sensor**

In [1008]:
print(df["fingerprint_sensor"].isnull().sum())
print(df["fingerprint_sensor"].info())
print(df["fingerprint_sensor"].value_counts())
print(df["fingerprint_sensor"].describe())

693
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: fingerprint_sensor
Non-Null Count  Dtype 
--------------  ----- 
327 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
Fingerprint Sensor    327
Name: fingerprint_sensor, dtype: int64
count                    327
unique                     1
top       Fingerprint Sensor
freq                     327
Name: fingerprint_sensor, dtype: object


In [1010]:
df["fingerprint_sensor"].fillna("False",inplace=True)

In [1011]:
df["fingerprint_sensor"]=df["fingerprint_sensor"].str.strip().apply(lambda x: 1 if "Fingerprint Sensor" in x else 0).astype("int")

**Inbuilt Microphone**

In [1012]:
print(df["inbuilt_microphone"].isnull().sum())
print(df["inbuilt_microphone"].info())
print(df["inbuilt_microphone"].value_counts())
print(df["inbuilt_microphone"].describe())

13
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: inbuilt_microphone
Non-Null Count  Dtype 
--------------  ----- 
1007 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
 Inbuilt Microphone    817
Inbuilt Microphone     190
Name: inbuilt_microphone, dtype: int64
count                    1007
unique                      2
top        Inbuilt Microphone
freq                      817
Name: inbuilt_microphone, dtype: object


In [1013]:
df["inbuilt_microphone"].fillna("False",inplace=True)

In [1015]:
df["inbuilt_microphone"]=df["inbuilt_microphone"].str.strip().apply(lambda x: 1 if "Inbuilt Microphone" in x else 0).astype("int")

**USB 2.0**

In [1020]:
print(df["usb2"].isnull().sum())
print(df["usb2"].info())
print(df["usb2"].value_counts())
print(df["usb2"].describe())

723
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: usb2
Non-Null Count  Dtype  
--------------  -----  
297 non-null    float64
dtypes: float64(1)
memory usage: 8.1 KB
None
1.0    235
2.0     62
Name: usb2, dtype: int64
count    297.000000
mean       1.208754
std        0.407104
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: usb2, dtype: float64


In [1019]:
df["usb2"]=df["usb2"].str.split("x").str.get(0).astype("float")

**USB 3.0**

In [1022]:
print(df["usb3"].isnull().sum())
print(df["usb3"].info())
print(df["usb3"].value_counts())
print(df["usb3"].describe())

63
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: usb3
Non-Null Count  Dtype 
--------------  ----- 
957 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
2 x USB 3.0     454
 1 x USB 3.0    157
3 x USB 3.0     134
 2 x USB 3.0    132
1 x USB 3.0      74
 3 x USB 3.0      3
4 x USB 3.0       3
Name: usb3, dtype: int64
count             957
unique              7
top       2 x USB 3.0
freq              454
Name: usb3, dtype: object


In [1025]:
df["usb3"]=df["usb3"].str.split("x").str.get(0).str.strip().astype("float")

**Type-C**

In [1026]:
print(df["typec"].isnull().sum())
print(df["typec"].info())
print(df["typec"].value_counts())
print(df["typec"].describe())

60
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: typec
Non-Null Count  Dtype 
--------------  ----- 
960 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
 1 x USB Type-C    615
 2 x USB Type-C    283
1 x USB Type-C      20
2 x USB Type-C      14
 3 x USB Type-C     13
3 x USB Type-C      11
4 x USB Type-C       4
Name: typec, dtype: int64
count                 960
unique                  7
top        1 x USB Type-C
freq                  615
Name: typec, dtype: object


In [1027]:
df["typec"]=df["typec"].str.split("x").str.get(0).str.strip().astype("float")

**Processor Gen**

In [1028]:
print(df["processor_gen"].isnull().sum())
print(df["processor_gen"].info())
print(df["processor_gen"].value_counts())
print(df["processor_gen"].describe())

55
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: processor_gen
Non-Null Count  Dtype 
--------------  ----- 
965 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
12    276
13    257
7     143
11    137
5      96
6      17
3      16
10     13
4       6
8       3
9       1
Name: processor_gen, dtype: int64
count     965
unique     11
top        12
freq      276
Name: processor_gen, dtype: object


In [1029]:
df["processor_gen"]=df["processor_gen"].astype("float")

**Processor Brand**

In [1030]:
print(df["processor_brand"].isnull().sum())
print(df["processor_brand"].info())
print(df["processor_brand"].value_counts())
print(df["processor_brand"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: processor_brand
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
intel       732
amd         278
apple         8
mediatek      2
Name: processor_brand, dtype: int64
count      1020
unique        4
top       intel
freq        732
Name: processor_brand, dtype: object


**Processor Model**

In [1031]:
print(df["processor_model"].isnull().sum())
print(df["processor_model"].info())
print(df["processor_model"].value_counts())
print(df["processor_model"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: processor_model
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
i5    338
i7    184
5     151
i3    125
3     108
7      57
i9     49
M2      6
M1      2
Name: processor_model, dtype: int64
count     1020
unique       9
top         i5
freq       338
Name: processor_model, dtype: object


**Graphic Brand**

In [1035]:
print(df["graphics_brand"].isnull().sum())
print(df["graphics_brand"].info())
print(df["graphics_brand"].value_counts())
print(df["graphics_brand"].describe())

1
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: graphics_brand
Non-Null Count  Dtype 
--------------  ----- 
1019 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
intel     434
nvidia    401
amd       174
apple       8
arm         2
Name: graphics_brand, dtype: int64
count      1019
unique        5
top       intel
freq        434
Name: graphics_brand, dtype: object


**Graphic Capacity**

In [1071]:
print(df["graphics_capacity"].isnull().sum())
print(df["graphics_capacity"].info())
print(df["graphics_capacity"].value_counts())
print(df["graphics_capacity"].describe())

607
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: graphics_capacity
Non-Null Count  Dtype 
--------------  ----- 
413 non-null    object
dtypes: object(1)
memory usage: 8.1+ KB
None
4     190
8      96
6      90
16     13
2      11
12     11
10      2
Name: graphics_capacity, dtype: int64
count     413
unique      7
top         4
freq      190
Name: graphics_capacity, dtype: object


In [1072]:
df["graphics_capacity"]=df["graphics_capacity"].astype("float")

**Graphic Model**

In [1073]:
print(df["graphics_model"].isnull().sum())
print(df["graphics_model"].info())
print(df["graphics_model"].value_counts())
print(df["graphics_model"].describe())

0
<class 'pandas.core.series.Series'>
RangeIndex: 1020 entries, 0 to 1019
Series name: graphics_model
Non-Null Count  Dtype 
--------------  ----- 
1020 non-null   object
dtypes: object(1)
memory usage: 8.1+ KB
None
Integrated    622
rtx3050       104
rtx4060        64
rtx4050        61
rtx2050        45
gtx1650        34
rtx4070        24
rtx4090         9
rtx4080         8
rtx3060         7
rx6500m         7
mx450           6
t600            6
rtx3070ti       5
rtx3080ti       4
t550            3
mx550           2
gtx2050         2
mx570           2
rx5600m         1
rx7600s         1
mx130           1
rtx2060         1
t500            1
Name: graphics_model, dtype: int64
count           1020
unique            24
top       Integrated
freq             622
Name: graphics_model, dtype: object


In [None]:
#i think we should add one more column of "laptop_model"

In [1201]:
df.drop(["sdd","hard_disk"],axis=1,inplace=True)

In [1205]:
df["thickness"]=df["thickness"].astype("float")

In [1207]:
df["num_votes"]=df["num_votes"].astype("float")

In [1210]:
#i think we should add one more column of "laptop_model"
temp_df=pd.read_csv("smartprix_laptop.csv",index_col=0)

In [1213]:
df.loc[temp_df["name"].index,"laptop_model"]=temp_df["name"]

In [1217]:
#Now export the file
df.to_csv("laptop_cleaned_dataset.csv",index=False)