In [None]:
import pandas as pd 
import numpy as np

# read in all our data
laptop_train = pd.read_csv("datasets/laptops_train.csv")
laptop_test = pd.read_csv("datasets/laptops_test.csv")

# set seed for reproducibility
np.random.seed(0)

#Viewing Data
print(laptop_train.head())
print(laptop_test.head())

In [None]:
#Combining Train and Test Data for Data Processing
frames = [laptop_train,laptop_test]
df = pd.concat(frames)
#Must do to keep from having duplicate indexes
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# get the number of missing data points per column
missing_values_count = df.isnull().sum()
missing_values_count

In [None]:
#Data Exploration before Dropping Operating System Version column
df['Operating System Version'].value_counts()

df[df['Operating System Version'].isna()].groupby('Operating System').groups

grouped = df.groupby('Operating System')

for name,group in grouped:
    print(name)
    print(group['Operating System Version'].unique())


In [None]:
# Drop the Operating System Version because the laptop can just be updated by the user
df = df.drop('Operating System Version', axis=1)

In [None]:
#Confirm Column drop
df.columns

Data Preprocessing
Course of Action:
Manufacturer         Target Encoding
Model Name           Split on the () into two columns- Target Encoding
Category             One Hot Encoding
Screen Size          Convert to float and scale (units is inches)
Screen               Split into new features - screen type, screen quality, HD(Binary), Touchscreen(Binary)
CPU                  Split into new features - Brand, Model Number, Speed
RAM                  Remove "GB" and scale
Storage              Convert to Total Storage column and scale
GPU                  Split into new features - Brand, Model Number, Speed
Operating System     One Hot Encoding (macOS = MacOS)
Weight               Convert to lbs and scale (4s = 4.04 typo)
Price                Convert to USD

In [None]:
""" Course of Action:
Manufacturer         Target Encoding
Model Name           Split on the () into two columns- Target Encoding
Category             One Hot Encoding
Screen Size          Convert to float and scale (units is inches)
Screen               Split into new features - screen type, screen quality, HD(Binary), Touchscreen(Binary)
CPU                  Split into new features - Brand, Model Number, Speed
RAM                  Remove "GB" and scale
Storage              Convert to Total Storage column and scale
GPU                  Split into new features - Brand, Model Number, Speed
Operating System     One Hot Encoding (macOS = MacOS)
Weight               Convert to lbs and scale (4s = 4.04 typo)
Price                Convert to USD """

In [None]:
df.dtypes

In [None]:
df[['Screen Size','Screen']]

In [None]:
#Treat Manufacturer as an nominal variable
df['Manufacturer'].unique()

In [None]:
# gives a tuple of column name and series
# for each column in the dataframe
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

In [None]:
model_name = df['Model Name'].unique()
model_name.sort()
model_name

In [None]:
#Looking at the data redundancy in Model Name,CPU,RAM,Storage, and GPU for specific rows
df[['Model Name','CPU']]

In [None]:
df[['Model Name','CPU','RAM','Storage', 'GPU']][df['Model Name'].str.contains('/')]

In [None]:
category = df['Category'].unique()
category.sort()
category

In [None]:
# gives a tuple of column name and series
# for each column in the dataframe
#Deciding which columns need to be Target Encoded or One Hot Encoded (cut off being > 15)
for (columnName, columnData) in df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.nunique())

Data Exploration of each column

In [None]:
screen_size = df['Screen Size'].unique()
screen_size.sort()
screen_size

In [None]:
screen = df['Screen'].unique()
screen.sort()
print(df['Screen'].value_counts())
screen

In [None]:
CPU = df['CPU'].unique()
CPU.sort()
print(df['CPU'].value_counts())
CPU

In [None]:
RAM = df['RAM'].unique()
RAM.sort()
print(df['RAM'].value_counts())
RAM

In [122]:
storage = df['Storage'].unique()
storage.sort()
print(df['Storage'].value_counts())
storage

256GB SSD                        412
1TB HDD                          224
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1TB Hybrid                         9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
1GB SSD                            5
128GB Flash Storage                4
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
512GB SSD +  2TB HDD               2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD             1
5

array(['128GB Flash Storage', '128GB HDD', '128GB SSD',
       '128GB SSD +  1TB HDD', '128GB SSD +  2TB HDD',
       '16GB Flash Storage', '16GB SSD', '1GB SSD', '1TB HDD',
       '1TB HDD +  1TB HDD', '1TB Hybrid', '1TB SSD',
       '1TB SSD +  1TB HDD', '240GB SSD', '256GB Flash Storage',
       '256GB SSD', '256GB SSD +  1TB HDD', '256GB SSD +  1TB Hybrid',
       '256GB SSD +  256GB SSD', '256GB SSD +  2TB HDD',
       '256GB SSD +  500GB HDD', '2TB HDD', '32GB Flash Storage',
       '32GB HDD', '32GB SSD', '500GB HDD', '508GB Hybrid',
       '512GB Flash Storage', '512GB SSD', '512GB SSD +  1TB HDD',
       '512GB SSD +  1TB Hybrid', '512GB SSD +  256GB SSD',
       '512GB SSD +  2TB HDD', '512GB SSD +  512GB SSD',
       '64GB Flash Storage', '64GB Flash Storage +  1TB HDD', '64GB SSD',
       '8GB SSD'], dtype=object)

In [None]:
GPU = df['GPU'].unique()
GPU.sort()
print(df['GPU'].value_counts())
GPU

In [None]:
OS = df['Operating System'].unique()
OS.sort()
print(df['Operating System'].value_counts())
OS

In [None]:
weight = df['Weight'].unique()
weight.sort()
print(df['Weight'].value_counts())
weight

In [None]:
price = df['Price'].unique()
price.sort()
print(df['Price'].value_counts())
price

In [None]:
df.columns

In [None]:
#1 INR = 0.012203 USD Conversion Rate as of May 10,2023
#Unit Conversion for Price Column
df['Price_USD'] = df.Price/81.9433
df[['Price','Price_USD']]

In [None]:
#4s is a typo. Google search the weight of the laptop
df.Weight = df.Weight.replace('4s','4.04')
df.Weight = df.Weight.str.replace('kg','')
df.Weight = df.Weight.astype(float)
#Unit Conversion for Weight Column
df['Weight_LBS'] = df.Weight*2.204623
df[['Weight','Weight_LBS']]

In [None]:
#Operating System Typo
df['Operating System'] = df['Operating System'].replace('macOS','Mac OS')

In [None]:
df.RAM = df.RAM.str.replace('GB','')
df.RAM = df.RAM.astype(int)
df['RAM']

In [None]:
#Remove Leading Space in Column Name
df.rename(columns = {' Storage':'Storage'}, inplace = True)

In [None]:
#Converting from string to float
df['Screen Size'] = df['Screen Size'].str.replace('"','')
df['Screen Size'] = df['Screen Size'].astype(float)
df['Screen Size']

In [143]:
df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1302 entries, 0 to 1301
Series name: Storage
Non-Null Count  Dtype 
--------------  ----- 
1302 non-null   object
dtypes: object(1)
memory usage: 10.3+ KB


In [148]:
#,'Flash':'','HDD':'','Hybrid':'','Storage':'','+':''
#df['Storage'] = df.Storage.str.replace('SSD','')
df['Storage'] = df.Storage.str.replace('Flash','')
df['Storage'] = df.Storage.str.replace('HDD','')
df['Storage'] = df.Storage.str.replace('Hybrid','')
df['Storage'] = df.Storage.str.replace('Storage','')
df['Storage'] = df.Storage.str.replace('+','')
df.Storage

  df['Storage'] = df.Storage.str.replace('+','')


0        128GB 
1       128GB  
2        256GB 
3        512GB 
4        256GB 
         ...   
1297     128GB 
1298     512GB 
1299     64GB  
1300       1TB 
1301     500GB 
Name: Storage, Length: 1302, dtype: object

In [149]:
storage_df = df.Storage.str.split(" ", expand=True)

In [160]:
for (columnName, columnData) in storage_df.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

Column Name :  0
Column Contents :  ['128GB' '256GB' '512GB' '500GB' '1TB' '32GB' '64GB' '2TB' '16GB' '1GB'
 '240GB' '8GB' '508GB']
Column Name :  4
Column Contents :  [None '1TB' '256GB' '2TB' '500GB' '512GB' '']
Column Name :  5
Column Contents :  [None '' '1TB']


In [159]:
storage_df.head(10)

Unnamed: 0,0,4,5
0,128GB,,
1,128GB,,
2,256GB,,
3,512GB,,
4,256GB,,
5,500GB,,
6,256GB,,
7,256GB,,
8,512GB,,
9,256GB,,


In [158]:
storage_df.drop([1,2,3,6],axis=1,inplace=True)

In [162]:
storage_df2 = storage_df.replace('', 0, regex=True)

In [163]:
storage_df2 = storage_df2.fillna(0)

In [166]:
for (columnName, columnData) in storage_df2.items():
    print('Column Name : ', columnName)
    print('Column Contents : ', columnData.unique())

Column Name :  0
Column Contents :  ['128GB' '256GB' '512GB' '500GB' '1TB' '32GB' '64GB' '2TB' '16GB' '1GB'
 '240GB' '8GB' '508GB']
Column Name :  4
Column Contents :  [0 '1TB' '256GB' '2TB' '500GB' '512GB']
Column Name :  5
Column Contents :  [0 '1TB']


In [169]:
storage_df3 = storage_df2

In [173]:
import re
for x in storage_df2[0]:
    if 'GB' in x:
        num = re.findall(r'\d+', x)
        print(num[0]*(1024 * 1024 * 1024))
    elif 'TB' in x:
        num = re.findall(r'\d+', x)
        print(num[0]*(1024 * 1024 * 1024 * 1024))
    else:
        pass

MemoryError: 

In [155]:
#Drop all the empty columns
#Fill the None values with 0
#Convert all other items to bytes, add them together, then convert to TB
import re

num = re.findall(r'\d+', '2GB')

print(num[0])


2


In [146]:
df = pd.read_csv('datasets/laptops_preprocessed.csv', index_col=0)

In [None]:
df.to_csv('datasets/laptops_preprocessed.csv',index=False)