* Import libraries

In [None]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
# Silence every warning for the rest of the session (no category filter is given),
# so deprecation and pandas warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

* Import data

In [None]:
# Read the raw laptop listings; the file is decoded as latin1.
df = pd.read_csv(filepath_or_buffer='laptop_price.csv', encoding='latin1')

* Remove outliers identified during data visualization

In [None]:
# Flag every row that matches one of the outlier criteria, then remove them in one pass.
is_outlier = (
    df['Cpu'].str.contains('Samsung')
    | (df['Ram'] == '64GB')
    | df['Memory'].str.contains('240GB')
    | df['Memory'].str.contains('508GB')
    | (df['Price_euros'] > 4000)
)
df = df.loc[~is_outlier].reset_index(drop=True)

# Product feature is similar to company feature
df = df.drop(columns=['laptop_ID', 'Product', 'Gpu'])

* Screen resolution

In [None]:
class ScreenExtractor(BaseEstimator, TransformerMixin):
    """Split the free-text ``ScreenResolution`` column into separate features.

    ``transform`` appends three columns and removes ``ScreenResolution``:

    * ``Screen``       - the last whitespace-separated token of the text.
    * ``Screen_touch`` - 1 when the text contains ``'Touchscreen'``, else 0.
    * ``Screen_type``  - the first label from ``_SCREEN_TYPES`` found in the
      text, or the literal string ``'NaN'`` (a string, not a missing value)
      when none is present.
    """

    # Labels are tested in this order and the first hit wins, so a description
    # that contains several labels is classified by the earliest one listed.
    _SCREEN_TYPES = ('IPS Panel', 'Retina Display', 'Full HD', 'Quad HD+', '4K Ultra HD')

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    @classmethod
    def _screen_type(cls, description):
        """Return the first known screen-type label contained in ``description``."""
        for label in cls._SCREEN_TYPES:
            if label in description:
                return label
        return 'NaN'

    def transform(self, X):
        """Return a new frame with the screen features added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a string column ``ScreenResolution``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` without ``ScreenResolution`` and with ``Screen``,
            ``Screen_touch`` and ``Screen_type`` appended. ``X`` itself is
            left untouched.
        """
        resolution = X['ScreenResolution']
        # drop() and assign() both return new frames, so the caller's
        # DataFrame is never modified in place.
        return X.drop('ScreenResolution', axis=1).assign(
            Screen=resolution.str.split().str[-1],
            Screen_touch=resolution.apply(lambda text: 1 if 'Touchscreen' in text else 0),
            Screen_type=resolution.apply(self._screen_type),
        )

* Cpu

In [None]:
class CpuExtractor(BaseEstimator, TransformerMixin):
    """Derive brand, type and clock speed from the free-text ``Cpu`` column.

    ``transform`` appends three columns and removes ``Cpu``:

    * ``Cpu_brand`` - first space-separated token.
    * ``Cpu_type``  - third token when the brand is ``'Intel'``, otherwise
      the second token.
    * ``Cpu_speed`` - the number directly preceding ``GHz``, as float.
    """

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return a new frame with the CPU features added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a string column ``Cpu``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` without ``Cpu`` and with ``Cpu_brand``,
            ``Cpu_type`` and ``Cpu_speed`` appended. ``X`` itself is left
            untouched.
        """
        cpu = X['Cpu']
        tokens = cpu.str.split(' ')  # split once and reuse for brand and type
        brand = tokens.str[0]
        # Vectorised selection; a description with too few tokens yields NaN
        # instead of raising IndexError.
        cpu_type = tokens.str[2].where(brand == 'Intel', tokens.str[1])
        # Raw string so that \d reaches the regex engine instead of being
        # treated as an (invalid) Python string escape.
        speed = cpu.str.extract(r"(\d\.?\d*)GHz", expand=False).astype(float)
        # drop() and assign() both return new frames, so the caller's
        # DataFrame is never modified in place.
        return X.drop(['Cpu'], axis=1).assign(
            Cpu_brand=brand,
            Cpu_type=cpu_type,
            Cpu_speed=speed,
        )

* Weight

In [None]:
class WeightExtractor(BaseEstimator, TransformerMixin):
    """Convert the textual ``Weight`` column to a float column ``Weight(kg)``."""

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return a new frame with ``Weight`` parsed and renamed.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a string column ``Weight`` whose values become
            valid floats once the substring ``'kg'`` is removed.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` in which ``Weight`` is replaced, in the same
            position, by the float column ``Weight(kg)``. ``X`` itself is
            left untouched.
        """
        # regex=False: 'kg' is a literal, and the default of `regex` differs
        # between pandas versions.
        weight = X['Weight'].str.replace('kg', '', regex=False).astype(float)
        # assign() and rename() both return new frames, so the caller's
        # DataFrame is never modified in place.
        return X.assign(Weight=weight).rename(columns={'Weight': 'Weight(kg)'})

* Memory

In [None]:
class MemoryTransformer(BaseEstimator, TransformerMixin):
    """Derive capacity and storage type from the free-text ``Memory`` column.

    ``transform`` appends two columns and removes ``Memory``:

    * ``Memory_capacity`` - number parsed from the first space-separated
      token; values ending in ``TB`` are multiplied by 1000.
    * ``Memory_type``     - first word following the first ``GB``/``TB`` unit.
    """

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    @staticmethod
    def _parse_capacity(token):
        """Parse a size token such as ``'256GB'`` or ``'1TB'`` to a float.

        The trailing two-character unit is stripped; ``TB`` values are scaled
        by 1000, any other unit is returned unscaled.
        """
        value = float(token[:-2])
        return value * 1000 if token.endswith('TB') else value

    def transform(self, X):
        """Return a new frame with the memory features added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a string column ``Memory``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X`` without ``Memory`` and with ``Memory_capacity``
            and ``Memory_type`` appended. ``X`` itself is left untouched.
        """
        memory = X['Memory']
        capacity = memory.str.split(' ').str[0].apply(self._parse_capacity)

        # (?:GB|TB) is a real alternation. The former "[GB|TB]" was a character
        # class matching any single one of G, B, | or T.
        # Raw strings keep \s and \w from being read as Python string escapes.
        after_unit = memory.str.extract(r"(?:GB|TB)\s(.*)", expand=False)
        # expand=False keeps the result a Series rather than a one-column frame.
        memory_type = after_unit.str.extract(r'(\w+)', expand=False)

        # drop() and assign() both return new frames, so the caller's
        # DataFrame is never modified in place.
        return X.drop(['Memory'], axis=1).assign(
            Memory_capacity=capacity,
            Memory_type=memory_type,
        )

* Pipeline

In [None]:
# Chain the column extractors; each step receives the frame returned by the previous one.
split_pipeline = Pipeline(
    steps=[
        ('Weight', WeightExtractor()),
        ('Cpu', CpuExtractor()),
        ('Memory', MemoryTransformer()),
        ('screen', ScreenExtractor()),
    ]
)

# Run all steps once and keep the transformed frame under the same name.
df = split_pipeline.fit_transform(df)

In [None]:
# Display the frame returned by the pipeline as the cell's output.
df

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight(kg),Price_euros,Cpu_brand,Cpu_type,Cpu_speed,Memory_capacity,Memory_type,Screen,Screen_touch,Screen_type
0,Apple,Ultrabook,13.3,8GB,macOS,1.37,1339.69,Intel,i5,2.3,128.0,SSD,2560x1600,0,IPS Panel
1,Apple,Ultrabook,13.3,8GB,macOS,1.34,898.94,Intel,i5,1.8,128.0,Flash,1440x900,0,
2,HP,Notebook,15.6,8GB,No OS,1.86,575.00,Intel,i5,2.5,256.0,SSD,1920x1080,0,Full HD
3,Apple,Ultrabook,15.4,16GB,macOS,1.83,2537.45,Intel,i7,2.7,512.0,SSD,2880x1800,0,IPS Panel
4,Apple,Ultrabook,13.3,8GB,macOS,1.37,1803.60,Intel,i5,3.1,256.0,SSD,2560x1600,0,IPS Panel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290,Lenovo,2 in 1 Convertible,14.0,4GB,Windows 10,1.80,638.00,Intel,i7,2.5,128.0,SSD,1920x1080,1,IPS Panel
1291,Lenovo,2 in 1 Convertible,13.3,16GB,Windows 10,1.30,1499.00,Intel,i7,2.5,512.0,SSD,3200x1800,1,IPS Panel
1292,Lenovo,Notebook,14.0,2GB,Windows 10,1.50,229.00,Intel,Dual,1.6,64.0,Flash,1366x768,0,
1293,HP,Notebook,15.6,6GB,Windows 10,2.19,764.00,Intel,i7,2.5,1000.0,HDD,1366x768,0,


In [None]:
#df.to_csv('laptop_price_cleaned.csv', index=False)