In [55]:
import pandas as pd

# Load the raw laptops dataset. The file is decoded as Latin-1
# (presumably because it is not valid UTF-8 -- TODO confirm).
laptops = pd.read_csv('datasets/laptops.csv', encoding='Latin-1')
# Print the column names, non-null counts and dtypes of the loaded frame.
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [56]:
# Clean column names

def clean_column(name):
    """Return a normalised version of a column label.

    Surrounding whitespace is stripped, the phrase "Operating System" is
    shortened to "os", spaces become underscores, parentheses are removed,
    and the result is lower-cased.

    Parameters
    ----------
    name : str
        Raw column label.

    Returns
    -------
    str
        The cleaned label.
    """
    # Shorten the long phrase before spaces are turned into underscores,
    # otherwise the literal "Operating System" would no longer match.
    cleaned = name.strip().replace("Operating System", "os")
    for old, new in ((" ", "_"), ("(", ""), (")", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned.lower()
    
# Run every existing header through clean_column and install the cleaned
# labels on the frame in one assignment.
new_columns = [clean_column(label) for label in laptops.columns]
laptops.columns = new_columns

# Show the resulting column index.
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [57]:
# Clean up 'ram' column
# Rename first and without inplace=True: renaming a label that is no longer
# present is a no-op, so the cell can be re-run without a KeyError on 'ram'.
laptops = laptops.rename(columns={'ram': 'ram_gb'})

# Strip the literal 'GB' unit and convert to integers. astype(str) lets the
# same expression work when the column has already been converted on an
# earlier run; regex=False makes the literal match explicit.
laptops['ram_gb'] = (laptops['ram_gb']
                        .astype(str)
                        .str.replace('GB', '', regex=False)
                        .astype(int)
                    )

# Summary statistics of the cleaned column (displayed as the cell output).
ram_gb_desc = laptops['ram_gb'].describe()
ram_gb_desc

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

In [58]:
# Extract CPU manufacturer: the first whitespace-separated token of 'cpu'.
laptops['cpu_manufacturer'] = laptops['cpu'].str.split().str.get(0)

# Number of laptops per CPU manufacturer (displayed as the cell output).
cpu_manufacturer_count = laptops['cpu_manufacturer'].value_counts()

cpu_manufacturer_count

Intel      1240
AMD          62
Samsung       1
Name: cpu_manufacturer, dtype: int64

In [59]:
# Clean up 'os' column

# Canonical spelling for each operating-system label; the only real change
# is that 'Mac OS' is folded into 'macOS'.
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}

# Series.map returns NaN for any label that is not a key of the dict, which
# would silently erase unexpected values. Fall back to the original label so
# only the labels listed above are rewritten.
laptops['os'] = laptops['os'].map(mapping_dict).fillna(laptops['os'])

In [60]:
# If we want to remove null rows and columns, use:
laptops_no_null_rows = laptops.dropna()
laptops_no_null_cols = laptops.dropna(axis=1)

# To fill in values instead, only touch rows whose 'os_version' is actually
# missing, so a version that is already recorded is never overwritten.
missing_version = laptops['os_version'].isnull()

# Per-OS count of rows with a missing version, before filling.
value_counts_before = laptops.loc[missing_version, 'os'].value_counts()

# macOS rows without a version get 'X'; rows sold without an OS get an
# explicit placeholder.
laptops.loc[missing_version & (laptops['os'] == 'macOS'), 'os_version'] = 'X'
laptops.loc[missing_version & (laptops['os'] == 'No OS'), 'os_version'] = 'Version Unknown'

# Per-OS count of rows that still have a missing version, after filling.
value_counts_after = laptops.loc[laptops['os_version'].isnull(), 'os'].value_counts()

In [61]:
# Clean up 'weight' column
# Rename first and without inplace=True: renaming a label that is no longer
# present is a no-op, so the cell can be re-run without a KeyError on 'weight'.
laptops = laptops.rename(columns={'weight': 'weight_kg'})

# Remove only a trailing 'kg' / 'kgs' unit (with optional surrounding
# whitespace) instead of deleting every 's' in the value, then convert to
# float. astype(str) lets the expression work when the column has already
# been converted on an earlier run.
laptops['weight_kg'] = (laptops['weight_kg']
                        .astype(str)
                        .str.replace(r'\s*kgs?\s*$', '', regex=True)
                        .astype(float)
                    )

# Summary statistics of the cleaned column (displayed as the cell output).
laptops['weight_kg'].describe()

count    1303.000000
mean        2.038734
std         0.665475
min         0.690000
25%         1.500000
50%         2.040000
75%         2.300000
max         4.700000
Name: weight_kg, dtype: float64

In [62]:
# Write the cleaned frame to disk; index=False leaves the row index out of
# the CSV so it is not written as an extra column.
laptops.to_csv('datasets/laptops_cleaned.csv', index=False)

Here are some questions you might like to answer in your own time by analyzing the cleaned data:

Are laptops made by Apple more expensive than those made by other manufacturers?
What is the best-value laptop with a screen size of 15" or more?
Which laptop has the most storage space?