In [2]:
import pandas as pd

In [4]:
df = pd.read_csv(r"I:\CampusX_DS\campusx_dsmp2\9. MLOps revisited\laptop_price_predictor_mlops\laptop_data.csv")

In [7]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
df.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [11]:
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif text.startswith('Intel'):
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'


def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'

In [13]:
def preprocess(df):
    df['Ram'] = df['Ram'].str.replace('GB', '', regex=False).astype('int32')
    df['Weight'] = df['Weight'].str.replace('kg', '', regex=False).astype('float32')

    df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
    df['Ips'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

    new_res = df['ScreenResolution'].str.split('x', n=1, expand=True)
    df['X_res'] = new_res[0].str.replace(',', '').str.extract(r'(\d+)').astype(int)
    df['Y_res'] = new_res[1].astype(int)

    df['ppi'] = (((df['X_res'] ** 2 + df['Y_res'] ** 2) ** 0.5) / df['Inches']).astype(float)

    df.drop(columns=['ScreenResolution', 'Inches', 'X_res', 'Y_res'], inplace=True)

    df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
    df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
    df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

    df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
    df['Memory'] = df['Memory'].str.replace('GB', '')
    df['Memory'] = df['Memory'].str.replace('TB', '000')

    mem_split = df['Memory'].str.split('+', n=1, expand=True)
    df['first'] = mem_split[0].str.strip()
    df['second'] = mem_split[1].fillna('0')

    for col in ['first', 'second']:
        df[f"{col}_HDD"] = df[col].apply(lambda x: 1 if 'HDD' in x else 0)
        df[f"{col}_SSD"] = df[col].apply(lambda x: 1 if 'SSD' in x else 0)
        df[col] = df[col].str.replace(r'\D', '', regex=True).astype(int)

    df['HDD'] = df['first'] * df['first_HDD'] + df['second'] * df['second_HDD']
    df['SSD'] = df['first'] * df['first_SSD'] + df['second'] * df['second_SSD']

    df.drop(columns=['Memory', 'first', 'second','first_HDD', 'first_SSD','second_HDD', 'second_SSD'], errors='ignore', inplace=True)

    df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
    df = df[df['Gpu brand'] != 'ARM']
    df.drop(columns=['Gpu'], inplace=True)

    df['os'] = df['OpSys'].apply(cat_os)
    df.drop(columns=['OpSys'], inplace=True)

    df.reset_index(drop=True, inplace=True)
    
    return df

In [14]:
df = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Gpu'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['os'] = df['OpSys'].apply(cat_os)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['OpSys'], inplace=True)


In [21]:
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,Touchscreen,Ips,ppi,Cpu brand,HDD,SSD,Gpu brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,30636.0,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No OS/Linux
3,Apple,Ultrabook,16,1.83,135195.336,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095.808,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [18]:
from sklearn.model_selection import train_test_split

X = df.drop('Price', axis=1)
y = df['Price']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)