In [None]:
import pandas as pd 
import numpy as np

# read in all our data
laptop_train = pd.read_csv("datasets/laptops_train.csv")
laptop_test = pd.read_csv("datasets/laptops_test.csv")

# set seed for reproducibility
np.random.seed(0)

#Viewing Data
print(laptop_train.head())
print(laptop_test.head())

In [None]:
#Combining Train and Test Data for Data Processing
frames = [laptop_train,laptop_test]
df = pd.concat(frames)
#Must do to keep from having duplicate indexes
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()
missing_values_count

In [None]:
#Data Exploration before Dropping Operating System Version column
df['Operating System Version'].value_counts()

df[df['Operating System Version'].isna()].groupby('Operating System').groups

grouped = df.groupby('Operating System')

for name,group in grouped:
    print(name)
    print(group['Operating System Version'].unique())


In [None]:
# Drop the Operating System Version because the laptop can just be updated by the user
df = df.drop('Operating System Version', axis=1)

In [None]:
#Confirm Column drop
df.columns

Data Preprocessing
Course of Action:
Manufacturer         Target Encoding
Model Name           Split on the () into two columns- Target Encoding
Category             One Hot Encoding
Screen Size          Convert to float and scale (units is inches)
Screen               Split into new features - screen type, screen quality, HD(Binary), Touchscreen(Binary)
CPU                  Split into new features - Brand, Model Number, Speed
RAM                  Remove "GB" and scale
Storage              Convert to Total Storage column and scale
GPU                  Split into new features - Brand, Model Number, Speed
Operating System     One Hot Encoding (macOS = MacOS)
Weight               Convert to lbs and scale (4s = 4.04 typo)
Price                Convert to USD

In [None]:
""" Course of Action:
Manufacturer         Target Encoding
Model Name           Split on the () into two columns- Target Encoding
Category             One Hot Encoding
Screen Size          Convert to float and scale (units is inches)
Screen               Split into new features - screen type, screen quality, HD(Binary), Touchscreen(Binary)
CPU                  Split into new features - Brand, Model Number, Speed
RAM                  Remove "GB" and scale
Storage              Convert to Total Storage column and scale
GPU                  Split into new features - Brand, Model Number, Speed
Operating System     One Hot Encoding (macOS = MacOS)
Weight               Convert to lbs and scale (4s = 4.04 typo)
Price                Convert to USD """

In [None]:
df.dtypes

In [None]:
df[['Screen Size','Screen']]

In [None]:
#Treat Manufacturer as an nominal variable
df['Manufacturer'].unique()

In [None]:
# gives a tuple of column name and series
# for each column in the dataframe
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

In [None]:
model_name = df['Model Name'].unique()
model_name.sort()
model_name

In [None]:
#Looking at the data redundancy in Model Name,CPU,RAM,Storage, and GPU for specific rows
df[['Model Name','CPU']]

In [None]:
df[['Model Name','CPU','RAM','Storage', 'GPU']][df['Model Name'].str.contains('/')]

In [None]:
category = df['Category'].unique()
category.sort()
category

In [None]:
# gives a tuple of column name and series
# for each column in the dataframe
#Deciding which columns need to be Target Encoded or One Hot Encoded (cut off being > 15)
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.nunique())

Data Exploration of each column

In [None]:
screen_size = df['Screen Size'].unique()
screen_size.sort()
screen_size

In [None]:
screen = df['Screen'].unique()
screen.sort()
print(df['Screen'].value_counts())
screen

In [None]:
CPU = df['CPU'].unique()
CPU.sort()
print(df['CPU'].value_counts())
CPU

In [None]:
RAM = df['RAM'].unique()
RAM.sort()
print(df['RAM'].value_counts())
RAM

In [122]:
storage = df['Storage'].unique()
storage.sort()
print(df['Storage'].value_counts())
storage

256GB SSD                        412
1TB HDD                          224
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1TB Hybrid                         9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
1GB SSD                            5
128GB Flash Storage                4
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
512GB SSD +  2TB HDD               2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD             1
5

array(['128GB Flash Storage', '128GB HDD', '128GB SSD',
       '128GB SSD +  1TB HDD', '128GB SSD +  2TB HDD',
       '16GB Flash Storage', '16GB SSD', '1GB SSD', '1TB HDD',
       '1TB HDD +  1TB HDD', '1TB Hybrid', '1TB SSD',
       '1TB SSD +  1TB HDD', '240GB SSD', '256GB Flash Storage',
       '256GB SSD', '256GB SSD +  1TB HDD', '256GB SSD +  1TB Hybrid',
       '256GB SSD +  256GB SSD', '256GB SSD +  2TB HDD',
       '256GB SSD +  500GB HDD', '2TB HDD', '32GB Flash Storage',
       '32GB HDD', '32GB SSD', '500GB HDD', '508GB Hybrid',
       '512GB Flash Storage', '512GB SSD', '512GB SSD +  1TB HDD',
       '512GB SSD +  1TB Hybrid', '512GB SSD +  256GB SSD',
       '512GB SSD +  2TB HDD', '512GB SSD +  512GB SSD',
       '64GB Flash Storage', '64GB Flash Storage +  1TB HDD', '64GB SSD',
       '8GB SSD'], dtype=object)

In [None]:
GPU = df['GPU'].unique()
GPU.sort()
print(df['GPU'].value_counts())
GPU

In [None]:
OS = df['Operating System'].unique()
OS.sort()
print(df['Operating System'].value_counts())
OS

In [None]:
weight = df['Weight'].unique()
weight.sort()
print(df['Weight'].value_counts())
weight

In [None]:
price = df['Price'].unique()
price.sort()
print(df['Price'].value_counts())
price

In [None]:
df.columns

In [None]:
#1 INR = 0.012203 USD Conversion Rate as of May 10,2023
#Unit Conversion for Price Column
df['Price_USD'] = df.Price/81.9433
df[['Price','Price_USD']]

In [None]:
#4s is a typo. Google search the weight of the laptop
df.Weight = df.Weight.replace('4s','4.04')
df.Weight = df.Weight.str.replace('kg','')
df.Weight = df.Weight.astype(float)
#Unit Conversion for Weight Column
df['Weight_LBS'] = df.Weight*2.204623
df[['Weight','Weight_LBS']]

In [None]:
#Operating System Typo
df['Operating System'] = df['Operating System'].replace('macOS','Mac OS')

In [None]:
df.RAM = df.RAM.str.replace('GB','')
df.RAM = df.RAM.astype(int)
df['RAM']

In [None]:
#Remove Leading Space in Column Name
df.rename(columns = {' Storage':'Storage'}, inplace = True)

In [None]:
#Converting from string to float
df['Screen Size'] = df['Screen Size'].str.replace('"','')
df['Screen Size'] = df['Screen Size'].astype(float)
df['Screen Size']

In [143]:
df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1302 entries, 0 to 1301
Series name: Storage
Non-Null Count  Dtype 
--------------  ----- 
1302 non-null   object
dtypes: object(1)
memory usage: 10.3+ KB


In [148]:
#,'Flash':'','HDD':'','Hybrid':'','Storage':'','+':''
#df['Storage'] = df.Storage.str.replace('SSD','')
df['Storage'] = df.Storage.str.replace('Flash','')
df['Storage'] = df.Storage.str.replace('HDD','')
df['Storage'] = df.Storage.str.replace('Hybrid','')
df['Storage'] = df.Storage.str.replace('Storage','')
df['Storage'] = df.Storage.str.replace('+','')
df.Storage

  df['Storage'] = df.Storage.str.replace('+','')


0        128GB 
1       128GB  
2        256GB 
3        512GB 
4        256GB 
         ...   
1297     128GB 
1298     512GB 
1299     64GB  
1300       1TB 
1301     500GB 
Name: Storage, Length: 1302, dtype: object

In [149]:
storage_df = df.Storage.str.split(" ", expand=True)

In [160]:
for (columnName, columnData) in storage_df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

Column Name :  0
Column Contents :  ['128GB' '256GB' '512GB' '500GB' '1TB' '32GB' '64GB' '2TB' '16GB' '1GB'
 '240GB' '8GB' '508GB']
Column Name :  4
Column Contents :  [None '1TB' '256GB' '2TB' '500GB' '512GB' '']
Column Name :  5
Column Contents :  [None '' '1TB']


In [159]:
storage_df.head(10)

Unnamed: 0,0,4,5
0,128GB,,
1,128GB,,
2,256GB,,
3,512GB,,
4,256GB,,
5,500GB,,
6,256GB,,
7,256GB,,
8,512GB,,
9,256GB,,


In [158]:
storage_df.drop([1,2,3,6],axis=1,inplace=True)

In [162]:
storage_df2 = storage_df.replace('', 0, regex=True)

In [163]:
storage_df2 = storage_df2.fillna(0)

In [166]:
for (columnName, columnData) in storage_df2.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

Column Name :  0
Column Contents :  ['128GB' '256GB' '512GB' '500GB' '1TB' '32GB' '64GB' '2TB' '16GB' '1GB'
 '240GB' '8GB' '508GB']
Column Name :  4
Column Contents :  [0 '1TB' '256GB' '2TB' '500GB' '512GB']
Column Name :  5
Column Contents :  [0 '1TB']


In [169]:
storage_df3 = storage_df2

In [173]:
import re
for x in storage_df2[0]:
    if 'GB' in x:
        num = re.findall(r'\d+', x)
        print(num[0]*(1024 * 1024 * 1024))
    elif 'TB' in x:
        num = re.findall(r'\d+', x)
        print(num[0]*(1024 * 1024 * 1024 * 1024))
    else:
        pass

MemoryError: 

In [155]:
#Drop all the empty columns
#Fill the None values with 0
#Convert all other items to bytes, add them together, then convert to TB
import re

num = re.findall(r'\d+', '2GB')

print(num[0])


2


In [146]:
df = pd.read_csv('datasets/laptops_preprocessed.csv', index_col=0)

In [2]:
import pandas as pd 
import numpy as np

df = pd.read_csv('datasets/laptops_preprocessed.csv',index_col=0)


In [5]:
CPU_df = df[['CPU']].copy()

In [7]:
# Python code
# To reverse words in a given string
CPU_reversed = []
# input string
for x in CPU_df.CPU:
    # reversing words in a given string
    s = x.split()[::-1]
    l = []
    for i in s:
        # appending reversed words to l
        l.append(i)
    # printing reverse words
    CPU_reversed.append(" ".join(l))

In [8]:
CPU_df['CPU_reversed'] = CPU_reversed

In [11]:
new = CPU_df['CPU_reversed'].str.split(' ', expand=True, n=2)

In [13]:
df['CPU_Speed'] = new[0]

In [15]:
new['CPU_Flipped'] = new[1] + " " +new[2]

In [16]:
# Python code
# To reverse words in a given string
CPU_flipped = []
# input string
for x in new.CPU_Flipped:
    # reversing words in a given string
    s = x.split()[::-1]
    l = []
    for i in s:
        # appending reversed words to l
        l.append(i)
    # printing reverse words
    CPU_flipped.append(" ".join(l))

In [18]:
new['CPU_flipped'] = CPU_flipped

In [19]:
new2 = new['CPU_flipped'].str.split(' ', expand=True, n=1)

In [20]:
df['CPU Brand'] = new2[0]
df['CPU Model'] = new2[1]

In [25]:
CPU_speed = df['CPU_Speed'].unique()
CPU_speed.sort()
print(df['CPU_Speed'].value_counts())
CPU_speed

2.50    293
2.80    165
2.70    165
1.60    134
2.30     86
2.00     86
1.80     78
2.60     76
1.10     53
2.40     52
2.90     21
3.00     19
1.20     15
1.44     12
2.20     11
1.50     10
1.30      6
3.60      5
0.90      4
3.10      3
2.10      3
1.90      2
3.20      1
1.00      1
1.92      1
Name: CPU_Speed, dtype: int64


array([0.9 , 1.  , 1.1 , 1.2 , 1.3 , 1.44, 1.5 , 1.6 , 1.8 , 1.9 , 1.92,
       2.  , 2.1 , 2.2 , 2.3 , 2.4 , 2.5 , 2.6 , 2.7 , 2.8 , 2.9 , 3.  ,
       3.1 , 3.2 , 3.6 ])

In [22]:
df['CPU_Speed'] = df['CPU_Speed'].str.replace('GHz','')

In [23]:
df['CPU_Speed'] = df['CPU_Speed'].astype(float)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1302 entries, 0 to 1301
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Manufacturer      1302 non-null   object 
 1   Model Name        1302 non-null   object 
 2   Category          1302 non-null   object 
 3   Screen Size       1302 non-null   float64
 4   Screen            1302 non-null   object 
 5   CPU               1302 non-null   object 
 6   RAM               1302 non-null   int64  
 7   Storage           1302 non-null   object 
 8   GPU               1302 non-null   object 
 9   Operating System  1302 non-null   object 
 10  Weight            1302 non-null   float64
 11  Price             1302 non-null   float64
 12  Price_USD         1302 non-null   float64
 13  Weight_LBS        1302 non-null   float64
 14  CPU_Speed         1302 non-null   float64
 15  CPU Brand         1302 non-null   object 
 16  CPU Model         1302 non-null   object 


In [26]:
CPU_brand = df['CPU Brand'].unique()
CPU_brand.sort()
print(df['CPU Brand'].value_counts())
CPU_brand

Intel      1239
AMD          62
Samsung       1
Name: CPU Brand, dtype: int64


array(['AMD', 'Intel', 'Samsung'], dtype=object)

In [27]:
CPU_model = df['CPU Model'].unique()
CPU_model.sort()
print(df['CPU Model'].value_counts())
CPU_model

Core i5 7200U       193
Core i7 7700HQ      147
Core i7 7500U       134
Core i3 6006U        81
Core i7 8550U        73
                   ... 
FX 9830P              1
Core i5 6440HQ        1
E-Series E2-6110      1
Ryzen 1600            1
Core i7 6920HQ        1
Name: CPU Model, Length: 93, dtype: int64


array(['A10-Series 9600P', 'A10-Series 9620P', 'A10-Series A10-9620P',
       'A12-Series 9700P', 'A12-Series 9720P', 'A4-Series 7210',
       'A6-Series 7310', 'A6-Series 9220', 'A6-Series A6-9220',
       'A8-Series 7410', 'A9-Series 9410', 'A9-Series 9420',
       'A9-Series A9-9420', 'Atom X5-Z8350', 'Atom Z8350',
       'Atom x5-Z8300', 'Atom x5-Z8350', 'Atom x5-Z8550',
       'Celeron Dual Core 3205U', 'Celeron Dual Core 3855U',
       'Celeron Dual Core N3050', 'Celeron Dual Core N3060',
       'Celeron Dual Core N3350', 'Celeron Quad Core N3160',
       'Celeron Quad Core N3450', 'Celeron Quad Core N3710', 'Core M',
       'Core M 6Y30', 'Core M 6Y54', 'Core M 6Y75', 'Core M 7Y30',
       'Core M M3-6Y30', 'Core M M7-6Y75', 'Core M m3', 'Core M m3-7Y30',
       'Core M m7-6Y75', 'Core i3 6006U', 'Core i3 6100U',
       'Core i3 7100U', 'Core i3 7130U', 'Core i5', 'Core i5 6200U',
       'Core i5 6260U', 'Core i5 6300HQ', 'Core i5 6300U',
       'Core i5 6440HQ', 'Core i5 7200U'

In [28]:
new3 = df['GPU'].str.split(' ', expand=True, n=1)

In [29]:
df['GPU Brand'] = new3[0]
df['GPU Model'] = new3[1]

In [30]:
GPU_model = df['GPU Model'].unique()
GPU_model.sort()
print(df['GPU Model'].value_counts())
GPU_model

HD Graphics 620     281
HD Graphics 520     185
UHD Graphics 620     68
GeForce GTX 1050     66
GeForce GTX 1060     48
                   ... 
Radeon R5 520         1
Radeon R7             1
HD Graphics 540       1
Radeon 540            1
Mali T860 MP4         1
Name: GPU Model, Length: 110, dtype: int64


array(['FirePro W4190M', 'FirePro W4190M ', 'FirePro W5130M',
       'FirePro W6150M', 'GTX 980 SLI', 'GeForce 150MX', 'GeForce 920',
       'GeForce 920M', 'GeForce 920MX', 'GeForce 920MX ', 'GeForce 930M',
       'GeForce 930MX', 'GeForce 930MX ', 'GeForce 940M', 'GeForce 940MX',
       'GeForce 960M', 'GeForce GT 940MX', 'GeForce GTX 1050',
       'GeForce GTX 1050 Ti', 'GeForce GTX 1050M', 'GeForce GTX 1050Ti',
       'GeForce GTX 1060', 'GeForce GTX 1070', 'GeForce GTX 1070M',
       'GeForce GTX 1080', 'GeForce GTX 930MX', 'GeForce GTX 940M',
       'GeForce GTX 940MX', 'GeForce GTX 950M', 'GeForce GTX 960',
       'GeForce GTX 960<U+039C>', 'GeForce GTX 960M', 'GeForce GTX 965M',
       'GeForce GTX 970M', 'GeForce GTX 980 ', 'GeForce GTX 980M',
       'GeForce GTX1050 Ti', 'GeForce GTX1060', 'GeForce GTX1080',
       'GeForce MX130', 'GeForce MX150', 'Graphics 620', 'HD Graphics',
       'HD Graphics 400', 'HD Graphics 405', 'HD Graphics 500',
       'HD Graphics 505', 'HD Grap

In [31]:
GPU_brand = df['GPU Brand'].unique()
GPU_brand.sort()
print(df['GPU Brand'].value_counts())
GPU_brand

Intel     722
Nvidia    399
AMD       180
ARM         1
Name: GPU Brand, dtype: int64


array(['AMD', 'ARM', 'Intel', 'Nvidia'], dtype=object)

In [32]:
new4 = df['Model Name'].str.split('(', expand=True, n=1)

In [33]:
df['Model Name Cleaned'] = new4[0]

In [34]:
model_name_c = df['Model Name Cleaned'].unique()
model_name_c.sort()
print(df['Model Name Cleaned'].value_counts())
model_name_c

XPS 13                30
Inspiron 3567         29
250 G6                21
Vostro 3568           19
Legion Y520-15IKBN    19
                      ..
FX550IK-DM018T         1
V320-17ISK             1
Notebook Odyssey       1
Inspiron 3179          1
Zbook 17               1
Name: Model Name Cleaned, Length: 592, dtype: int64


array(['110-15ACL ', '14-am079na ', '15-AC110nv ', '15-AY023na ',
       '15-BA015wm ', '15-BS026nv ', '15-BS028nv ', '15-BS078nr ',
       '15-BS101nv ', '15-BS103nv ', '15-BW004nv ', '15-BW037na ',
       '15-BW091ND ', '15-BW094nd ', '15-ay047nv ', '15-ba043na ',
       '15-bs002nv ', '15-bs005nv ', '15-bs011nv ', '15-bs012nv ',
       '15-bs015dx ', '15-bs017nv ', '15-bs018nq ', '15-bs023nv ',
       '15-bs024nv ', '15-bs025nv ', '15-bs053od ', '15-bs078cl ',
       '15-bs190od ', '15-bw000nv ', '15-bw002nv ', '15-bw003nv ',
       '15-bw007nv ', '15-bw009nv ', '15-bw011nv ', '15-cb003na ',
       '15-cd005nv ', '15-ra044nv ', '15-rb013nv ', '17-AK091ND ',
       '17-BS037cl ', '17-BS092ND ', '17-X047na ', '17-Y002nv ',
       '17-ak001nv ', '17-ak002nv ', '17-bs000nv I3', '17-bs001nv ',
       '250 G4', '250 G5', '250 G6', '255 G6', '320-15ISK ',
       'A541NA-GO342 ', 'A715-71G-59DH ', 'Alienware 15', 'Alienware 17',
       'Aspire 1', 'Aspire 3', 'Aspire 5', 'Aspire 7', 'Aspire

In [36]:
screen_touch = []

for x in df.Screen:
    if 'Touchscreen' in x:
        screen_touch.append(1)
    else:
        screen_touch.append(0)

In [37]:
screen_hd = []

for x in df.Screen:
    if 'HD' in x:
        screen_hd.append(1)
    else:
        screen_hd.append(0)

In [38]:
df['Touchscreen'] = screen_touch
df['Screen_HD'] = screen_hd

In [56]:
new5 = df['Screen'].str.split(' ', expand=True)

In [71]:
screen_quality = []
test = []
for index,row in new5.iterrows():
    screen_quality.append(row[row.str.contains('x')].values[0])
 #       screen_quality.append(row[index])
  #  else:
   #     screen_quality.append(0)

In [57]:
new5 = new5.fillna('0')

In [72]:
df['Screen Quality'] = screen_quality

In [73]:
screen_quality_cleaned = df['Screen Quality'].unique()
screen_quality_cleaned.sort()
print(df['Screen Quality'].value_counts())
screen_quality_cleaned

1920x1080    841
1366x768     308
3840x2160     42
3200x1800     27
2560x1440     23
1600x900      23
2560x1600      6
2304x1440      6
2256x1504      6
1920x1200      5
1440x900       4
2880x1800      4
2400x1600      4
2160x1440      2
2736x1824      1
Name: Screen Quality, dtype: int64


array(['1366x768', '1440x900', '1600x900', '1920x1080', '1920x1200',
       '2160x1440', '2256x1504', '2304x1440', '2400x1600', '2560x1440',
       '2560x1600', '2736x1824', '2880x1800', '3200x1800', '3840x2160'],
      dtype=object)

In [74]:
df.Screen

0               IPS Panel Retina Display 2560x1600
1                                         1440x900
2                                Full HD 1920x1080
3               IPS Panel Retina Display 2880x1800
4               IPS Panel Retina Display 2560x1600
                           ...                    
1297     IPS Panel Full HD / Touchscreen 1920x1080
1298    IPS Panel Quad HD+ / Touchscreen 3200x1800
1299                                      1366x768
1300                                      1366x768
1301                                      1366x768
Name: Screen, Length: 1302, dtype: object

In [75]:
new6 = df['Storage'].str.split(' ', expand=True)

In [83]:
storage = []
for index,row in new6.iterrows():
    storage.append(row[row.str.contains('B')].values)

In [79]:
new6 = new6.fillna('0')

In [89]:
import re
for i in storage:
    for j in i:
        if 'GB' in j:
            re.findall(r'\b\d+\b',j)*(1024*1024*1024)
        elif 'TB' in j:
            re.findall(r'\b\d+\b',j)*(1024*1024*1024*1024)

In [114]:
storage_df = pd.DataFrame(storage)

In [143]:
#*(1024*1024*1024)
#*(1024*1024*1024*1024)
col_0 = []
for val in storage_df[0]:
    if 'GB' in val:
        col_0.append(int(re.findall(r'\d+',val)[0])*(1024*1024*1024))
    elif 'TB' in val:
        col_0.append(int(re.findall(r'\d+',val)[0])*(1024*1024*1024*1024))
    else:
        print('Nothing')

In [141]:
for val in storage_df[0]:
    print(val)
    print(re.findall(r'\b\d',val))

128GB
['1']
128GB
['1']
256GB
['2']
512GB
['5']
256GB
['2']
500GB
['5']
256GB
['2']
256GB
['2']
512GB
['5']
256GB
['2']
500GB
['5']
500GB
['5']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
512GB
['5']
1TB
['1']
128GB
['1']
32GB
['3']
128GB
['1']
500GB
['5']
256GB
['2']
256GB
['2']
1TB
['1']
128GB
['1']
256GB
['2']
256GB
['2']
1TB
['1']
64GB
['6']
32GB
['3']
500GB
['5']
512GB
['5']
256GB
['2']
64GB
['6']
1TB
['1']
128GB
['1']
1TB
['1']
256GB
['2']
1TB
['1']
256GB
['2']
1TB
['1']
256GB
['2']
1TB
['1']
256GB
['2']
128GB
['1']
256GB
['2']
256GB
['2']
1TB
['1']
64GB
['6']
256GB
['2']
256GB
['2']
512GB
['5']
1TB
['1']
256GB
['2']
128GB
['1']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
256GB
['2']
512GB
['5']
500GB
['5']
32GB
['3']
1TB
['1']
1TB
['1']
128GB
['1']
256GB
['2']
256GB
['2']
128GB
['1']
1TB
['1']
1TB
['1']
2TB
['2']
128GB
['1']
1TB
['1']
128GB
['1']
256GB
['2']
512GB
['5']
256GB
['2']
128GB
['1']
1TB
['1']
128GB
['1']
256GB
['

In [116]:
storage_df = storage_df.fillna('0GB')

In [144]:
col_1 = []
for val in storage_df[1]:
    if 'GB' in val:
        col_1.append(int(re.findall(r'\d+',val)[0])*(1024*1024*1024))
    elif 'TB' in val:
        col_1.append(int(re.findall(r'\d+',val)[0])*(1024*1024*1024*1024))
    else:
        print('Nothing')

In [127]:
total_storage_list = [ x+y for x in col_0 for y in col_1 ]

In [145]:
res_list = [col_0[i] + col_1[i] for i in range(len(col_0))]

In [146]:
total_storage_list = [x/(1024*1024*1024*1024) for x in res_list]

In [147]:
df['Total Storage in TB'] = total_storage_list

In [148]:
df[['Storage','Total Storage in TB']]

Unnamed: 0,Storage,Total Storage in TB
0,128GB SSD,0.125000
1,128GB Flash Storage,0.125000
2,256GB SSD,0.250000
3,512GB SSD,0.500000
4,256GB SSD,0.250000
...,...,...
1297,128GB SSD,0.125000
1298,512GB SSD,0.500000
1299,64GB Flash Storage,0.062500
1300,1TB HDD,1.000000


In [149]:
df.to_csv('datasets/laptops_preprocessed.csv',index=True)

In [None]:
df.to_csv('datasets/laptops_preprocessed.csv',index=False)