In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv("Data/laptopData.csv")
df

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1298.0,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400
1299,1299.0,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200
1300,1300.0,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200
1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200


### -->  To get a quick summary of a DataFrame? we need to use:

In [103]:
df.info

<bound method DataFrame.info of       Unnamed: 0 Company            TypeName Inches  \
0            0.0   Apple           Ultrabook   13.3   
1            1.0   Apple           Ultrabook   13.3   
2            2.0      HP            Notebook   15.6   
3            3.0   Apple           Ultrabook   15.4   
4            4.0   Apple           Ultrabook   13.3   
...          ...     ...                 ...    ...   
1298      1298.0  Lenovo  2 in 1 Convertible     14   
1299      1299.0  Lenovo  2 in 1 Convertible   13.3   
1300      1300.0  Lenovo            Notebook     14   
1301      1301.0      HP            Notebook   15.6   
1302      1302.0    Asus            Notebook   15.6   

                                ScreenResolution  \
0             IPS Panel Retina Display 2560x1600   
1                                       1440x900   
2                              Full HD 1920x1080   
3             IPS Panel Retina Display 2880x1800   
4             IPS Panel Retina Display 2560x160

#### It’s especially useful during data exploration or data cleaning stages, helps in data type optimization and provides a quick overview without printing the whole dataset

# -> Just to check the data type to see which columns are strings, integers, floats, or booleans.

In [104]:
df.dtypes

Unnamed: 0          float64
Company              object
TypeName             object
Inches               object
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price               float64
dtype: object

In [105]:
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

# -> remove any space in my columns using:

In [106]:
df.columns = df.columns.str.strip()

# -> We need to check the missing values in my dataframe by using:

In [107]:
df.isnull()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1298,False,False,False,False,False,False,False,False,False,False,False,False
1299,False,False,False,False,False,False,False,False,False,False,False,False
1300,False,False,False,False,False,False,False,False,False,False,False,False
1301,False,False,False,False,False,False,False,False,False,False,False,False


### -> To make it simple we will count the missing value in my dataframe  in each column

In [108]:
df.isnull().sum()

Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64

# -> I want to know the total numbers of missing values in the data that i have it:

In [109]:
df.isnull().sum().sum()

np.int64(360)

# -> To remove all rows with any missing values (NaN) from the DataFrame:


In [110]:
df.dropna(inplace=True)


In [111]:
df.isnull().sum()

Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

# -> Count how many duplicate rows exist in a DataFrame.

In [112]:
df.duplicated().sum()


np.int64(0)

# --> Returns a new DataFrame without duplicates


In [113]:
duplicates = df.drop_duplicates(inplace=True)
duplicates

In [114]:
df

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1298.0,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400
1299,1299.0,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200
1300,1300.0,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200
1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200


# --> Just to check if their is any 0 in the praice to fixe it 

In [115]:
df[df['Price'] < 0]

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price


In [116]:
df.duplicated().sum()

np.int64(0)

 # --> To Return all unique values from the columns in a DataFrame:

In [117]:
df['Company'].unique()

array(['Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI',
       'Microsoft', 'Toshiba', 'Huawei', 'Xiaomi', 'Vero', 'Razer',
       'Mediacom', 'Samsung', 'Google', 'Fujitsu', 'LG'], dtype=object)

In [118]:
df['Gpu'].unique()

array(['Intel Iris Plus Graphics 640', 'Intel HD Graphics 6000',
       'Intel HD Graphics 620', 'AMD Radeon Pro 455',
       'Intel Iris Plus Graphics 650', 'AMD Radeon R5',
       'Intel Iris Pro Graphics', 'Nvidia GeForce MX150',
       'Intel UHD Graphics 620', 'Intel HD Graphics 520',
       'AMD Radeon Pro 555', 'AMD Radeon R5 M430',
       'Intel HD Graphics 615', 'AMD Radeon Pro 560',
       'Nvidia GeForce 940MX', 'Nvidia GeForce GTX 1050', 'AMD Radeon R2',
       'AMD Radeon 530', 'Nvidia GeForce 930MX', 'Intel HD Graphics',
       'Intel HD Graphics 500', 'Nvidia GeForce 930MX ',
       'Nvidia GeForce GTX 1060', 'Nvidia GeForce 150MX',
       'Intel Iris Graphics 540', 'AMD Radeon RX 580',
       'Nvidia GeForce 920MX', 'AMD Radeon R4 Graphics', 'AMD Radeon 520',
       'Nvidia GeForce GTX 1070', 'Nvidia GeForce GTX 1050 Ti',
       'Intel HD Graphics 400', 'Nvidia GeForce MX130', 'AMD R4 Graphics',
       'Nvidia GeForce GTX 940MX', 'AMD Radeon RX 560',
       'Nvidia GeFo

In [119]:
df['OpSys'].unique()

array(['macOS', 'No OS', 'Windows 10', 'Mac OS X', 'Linux',
       'Windows 10 S', 'Chrome OS', 'Windows 7', 'Android'], dtype=object)

In [120]:
df['Memory'].unique()

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '128GB SSD +  1TB HDD', '256GB SSD +  256GB SSD',
       '64GB Flash Storage', '32GB Flash Storage', '256GB SSD +  1TB HDD',
       '256GB SSD +  2TB HDD', '32GB SSD', '2TB HDD', '64GB SSD',
       '1.0TB Hybrid', '512GB SSD +  1TB HDD', '1TB SSD',
       '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '180GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '?', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid', '1.0TB HDD',
       '512GB SSD +  1.0TB Hybrid', '256GB SSD +  1.0TB Hybrid'],
      dtype=object)

# --> To replace the unique value by correcte one, Clean the data: Replacing  values like '?' with meaningful defaults.

In [121]:
df['Memory'] = df['Memory'].replace('?', '126GB SSD')
df['Memory'] 

0                 128GB SSD
1       128GB Flash Storage
2                 256GB SSD
3                 512GB SSD
4                 256GB SSD
               ...         
1298              128GB SSD
1299              512GB SSD
1300     64GB Flash Storage
1301                1TB HDD
1302              500GB HDD
Name: Memory, Length: 1273, dtype: object

In [122]:
df['Memory'].unique()

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '128GB SSD +  1TB HDD', '256GB SSD +  256GB SSD',
       '64GB Flash Storage', '32GB Flash Storage', '256GB SSD +  1TB HDD',
       '256GB SSD +  2TB HDD', '32GB SSD', '2TB HDD', '64GB SSD',
       '1.0TB Hybrid', '512GB SSD +  1TB HDD', '1TB SSD',
       '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '180GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '126GB SSD',
       '512GB Flash Storage', '128GB HDD', '240GB SSD', '8GB SSD',
       '508GB Hybrid', '1.0TB HDD', '512GB SSD +  1.0TB Hybrid',
       '256GB SSD +  1.0TB Hybrid'], dtype=object)

In [123]:
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

In [124]:
df['Weight'].unique()

array(['1.37kg', '1.34kg', '1.86kg', '1.83kg', '2.1kg', '2.04kg', '1.3kg',
       '1.6kg', '2.2kg', '0.92kg', '1.22kg', '2.5kg', '1.62kg', '1.91kg',
       '2.3kg', '1.35kg', '1.88kg', '1.89kg', '1.65kg', '2.71kg', '1.2kg',
       '1.44kg', '2.8kg', '2kg', '2.65kg', '2.77kg', '3.2kg', '1.49kg',
       '2.4kg', '2.13kg', '2.43kg', '1.7kg', '1.4kg', '1.8kg', '1.9kg',
       '3kg', '1.252kg', '2.7kg', '2.02kg', '1.63kg', '1.96kg', '1.21kg',
       '2.45kg', '1.25kg', '1.5kg', '2.62kg', '1.38kg', '1.58kg',
       '1.85kg', '1.23kg', '2.16kg', '2.36kg', '7.2kg', '2.05kg',
       '1.32kg', '1.75kg', '0.97kg', '2.56kg', '1.48kg', '1.74kg',
       '1.1kg', '1.56kg', '2.03kg', '1.05kg', '5.4kg', '4.4kg', '1.90kg',
       '1.29kg', '2.0kg', '1.95kg', '2.06kg', '1.12kg', '3.49kg',
       '3.35kg', '2.23kg', '?', '2.9kg', '4.42kg', '2.69kg', '2.37kg',
       '4.7kg', '3.6kg', '2.08kg', '4.3kg', '1.68kg', '1.41kg', '4.14kg',
       '2.18kg', '2.24kg', '2.67kg', '4.1kg', '2.14kg', '1.36kg',
       '

In [125]:
df['Weight'] = df['Weight'].replace('?', '1.10kg')
df['Weight'] 

0       1.37kg
1       1.34kg
2       1.86kg
3       1.83kg
4       1.37kg
         ...  
1298     1.8kg
1299     1.3kg
1300     1.5kg
1301    2.19kg
1302     2.2kg
Name: Weight, Length: 1273, dtype: object

# --> I want's to check every cells and count how many cells in each column contain the value '?'

In [126]:
(df == '?').sum()

Unnamed: 0          0
Company             0
TypeName            0
Inches              1
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

In [127]:
df['Inches'].unique()

array(['13.3', '15.6', '15.4', '14', '12', '17.3', '13.5', '12.5', '13',
       '18.4', '13.9', '11.6', '25.6', '35.6', '12.3', '27.3', '24',
       '33.5', '?', '31.6', '17', '15', '14.1', '11.3', '10.1'],
      dtype=object)

In [128]:
df['Inches'] = df['Inches'].replace('?', '11.6')
df['Inches'] 

0       13.3
1       13.3
2       15.6
3       15.4
4       13.3
        ... 
1298      14
1299    13.3
1300      14
1301    15.6
1302    15.6
Name: Inches, Length: 1273, dtype: object

In [129]:
df['Weight'] = df['Weight'].replace('?', '11.6')
df['Weight'] 

0       1.37kg
1       1.34kg
2       1.86kg
3       1.83kg
4       1.37kg
         ...  
1298     1.8kg
1299     1.3kg
1300     1.5kg
1301    2.19kg
1302     2.2kg
Name: Weight, Length: 1273, dtype: object

# --> After i cleaned my database i need to reset the row index of a DataFrame to the default integer index (0, 1, 2, …), and optionally drop the old index. 

In [130]:
df.reset_index(drop=True, inplace=True)


In [131]:
df.index


RangeIndex(start=0, stop=1273, step=1)

# 👌The last step save our work:


In [132]:
df.to_csv('Cleaned_Laptop_Data.csv', index=False)


#### Now we need to make my database more detailed in terms of adding columns to separate this information overlap. This method will work to:
##### 1- Facilitate understanding of the data.
##### 2- Prevent overlap.
##### 3- Make it easier to access the specifications of the laptop.

### 1- Starting with column ScreenResolution:
#### Because I have more ditailed on it so, i will separate it into 4 columns like [Touchscreen, ScreenTechnology,  ResolutionWidth and ResolutionHeight]

In [133]:
df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)
df['Ram']

0        8
1        8
2        8
3       16
4        8
        ..
1268     4
1269    16
1270     2
1271     6
1272     4
Name: Ram, Length: 1273, dtype: int64

In [134]:
df[['Ram']]

Unnamed: 0,Ram
0,8
1,8
2,8
3,16
4,8
...,...
1268,4
1269,16
1270,2
1271,6


In [135]:
df[['Weight']]

Unnamed: 0,Weight
0,1.37kg
1,1.34kg
2,1.86kg
3,1.83kg
4,1.37kg
...,...
1268,1.8kg
1269,1.3kg
1270,1.5kg
1271,2.19kg


In [136]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)
df['Weight']

0       1.37
1       1.34
2       1.86
3       1.83
4       1.37
        ... 
1268    1.80
1269    1.30
1270    1.50
1271    2.19
1272    2.20
Name: Weight, Length: 1273, dtype: float64

In [137]:
df['Touchscreen'] = df['ScreenResolution'].str.contains('Touchscreen', case=False, na=False)
df['Touchscreen']

0       False
1       False
2       False
3       False
4       False
        ...  
1268     True
1269     True
1270    False
1271    False
1272    False
Name: Touchscreen, Length: 1273, dtype: bool

In [138]:
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price',
       'Touchscreen'],
      dtype='object')

In [139]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Touchscreen
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,False


### #Now we add column for Touchscreen but i need it to be after the column ScreenResolution that we will use:
#### 1- Get all column names as a list.
#### 2- Find the index of 'ScreenResolution.
### --> because i add the column Touchscreen before but it's far from the column ScreenResolution so, i will remove it then add it after column ScreenResolution :
#### 3- Remove 'Touchscreen' from the list (if it's already there).
#### 4- Insert column Touchscreen right after column ScreenResolution.
#### 5- Reorder the DataFrame columns.





In [140]:
# 1
cols = list(df.columns)

# 2
idx = cols.index('ScreenResolution')

# 3
if 'Touchscreen' in cols:
    cols.remove('Touchscreen')

# 4
cols.insert(idx + 1, 'Touchscreen')

# 5
df = df[cols] 


In [141]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,False,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,False,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,False,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,False,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808
5,5.0,Acer,Notebook,15.6,1366x768,False,AMD A9-Series 9420 3GHz,4,500GB HDD,AMD Radeon R5,Windows 10,2.1,21312.0
6,6.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,False,Intel Core i7 2.2GHz,16,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04,114017.6016
7,7.0,Apple,Ultrabook,13.3,1440x900,False,Intel Core i5 1.8GHz,8,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,61735.536
8,8.0,Asus,Ultrabook,14.0,Full HD 1920x1080,False,Intel Core i7 8550U 1.8GHz,16,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3,79653.6
9,9.0,Acer,Ultrabook,14.0,IPS Panel Full HD 1920x1080,False,Intel Core i5 8250U 1.6GHz,8,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6,41025.6


In [142]:

df['ResolutionWidth'] = df['ScreenResolution'].str.extract(r'(\d+)x\d+').astype(float)
df['ResolutionHeight'] = df['ScreenResolution'].str.extract(r'\d+x(\d+)').astype(float)

# 1
cols = list(df.columns)

# 2
idx = cols.index('ScreenResolution')

for col in ['ResolutionWidth', 'ResolutionHeight']:
    if col in cols:
        cols.remove(col)


# 3
cols.insert(idx + 1, 'ResolutionWidth')
cols.insert(idx + 2, 'ResolutionHeight')

# 4
df = df[cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ResolutionWidth'] = df['ScreenResolution'].str.extract(r'(\d+)x\d+').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ResolutionHeight'] = df['ScreenResolution'].str.extract(r'\d+x(\d+)').astype(float)


In [143]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ResolutionWidth,ResolutionHeight,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,2560.0,1600.0,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832


# ⚠️❗By mistake, I clicked on RUN multiple times without noticing, which led to duplication columns ResolutionHeight and ResolutionWidth

#### To handle and fix this i need to use:

In [144]:
df.drop_duplicates(inplace=True)



In [145]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ResolutionWidth,ResolutionHeight,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,2560.0,1600.0,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832


In [146]:
# 1. Create the new column separately
df['ScreenTechnology'] = df['ScreenResolution'].str.extract(r'^(.*?)\s?\d+x\d+')

# 2. Get all column names
cols = list(df.columns)

# 3. Find index of 'ScreenResolution'
idx = cols.index('ScreenResolution')

# 4. Remove the column from list if it exists to avoid duplication
if 'ScreenTechnology' in cols:
    cols.remove('ScreenTechnology')

# 5. Insert the column right after 'ScreenResolution'
cols.insert(idx + 1, 'ScreenTechnology')

# 6. Reorder the DataFrame
df = df[cols]



In [147]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ScreenTechnology,ResolutionWidth,ResolutionHeight,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832


## Now after i separate it into 4 columns so, no need for ScreenResolution column

In [148]:
#df.drop(columns=['ScreenResolution'], inplace=True)


In [149]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [150]:
df.head(100)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ScreenTechnology,ResolutionWidth,ResolutionHeight,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,Apple,Ultrabook,13.3,1440x900,,1440.0,900.0,False,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0000
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,IPS Panel Retina Display,2880.0,1800.0,False,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.3360
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Asus,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i3 7100U 2.4GHz,6,256GB SSD,Nvidia GeForce 920M,Windows 10,2.00,30742.5600
96,HP,Gaming,15.6,IPS Panel Full HD 1920x1080,IPS Panel Full HD,1920.0,1080.0,False,Intel Core i7 7700HQ 2.8GHz,12,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows 10,2.62,66546.7200
97,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i7 7500U 2.7GHz,8,256GB SSD,AMD Radeon 530,Windows 10,1.91,38308.3200
98,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,AMD E-Series E2-9000e 1.5GHz,4,500GB HDD,AMD Radeon R2,Windows 10,2.10,18594.7200


## 2- Moving to Cpu:


In [151]:

df['CpuBrand'] = df['Cpu'].str.extract(r'^(Intel|AMD|Apple)')


df['CpuSeries'] = df['Cpu'].str.extract(r'(Core i\d|Celeron|Pentium|Ryzen \d|M1)')


df['CpuSpeedGHz'] = df['Cpu'].str.extract(r'(\d+\.\d+)GHz').astype(float)


In [152]:
df.head(100)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ScreenTechnology,ResolutionWidth,ResolutionHeight,Touchscreen,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,CpuBrand,CpuSeries,CpuSpeedGHz
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,Intel,Core i5,2.3
1,Apple,Ultrabook,13.3,1440x900,,1440.0,900.0,False,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Intel,Core i5,1.8
2,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0000,Intel,Core i5,2.5
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,IPS Panel Retina Display,2880.0,1800.0,False,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.3360,Intel,Core i7,2.7
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.8080,Intel,Core i5,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Asus,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i3 7100U 2.4GHz,6,256GB SSD,Nvidia GeForce 920M,Windows 10,2.00,30742.5600,Intel,Core i3,2.4
96,HP,Gaming,15.6,IPS Panel Full HD 1920x1080,IPS Panel Full HD,1920.0,1080.0,False,Intel Core i7 7700HQ 2.8GHz,12,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows 10,2.62,66546.7200,Intel,Core i7,2.8
97,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,Intel Core i7 7500U 2.7GHz,8,256GB SSD,AMD Radeon 530,Windows 10,1.91,38308.3200,Intel,Core i7,2.7
98,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,AMD E-Series E2-9000e 1.5GHz,4,500GB HDD,AMD Radeon R2,Windows 10,2.10,18594.7200,AMD,,1.5


In [153]:
df.drop(columns=['Cpu'], inplace=True)

In [154]:
df.head(100)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ScreenTechnology,ResolutionWidth,ResolutionHeight,Touchscreen,Ram,Memory,Gpu,OpSys,Weight,Price,CpuBrand,CpuSeries,CpuSpeedGHz
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,Intel,Core i5,2.3
1,Apple,Ultrabook,13.3,1440x900,,1440.0,900.0,False,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Intel,Core i5,1.8
2,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0000,Intel,Core i5,2.5
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,IPS Panel Retina Display,2880.0,1800.0,False,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.3360,Intel,Core i7,2.7
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,IPS Panel Retina Display,2560.0,1600.0,False,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.8080,Intel,Core i5,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Asus,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,6,256GB SSD,Nvidia GeForce 920M,Windows 10,2.00,30742.5600,Intel,Core i3,2.4
96,HP,Gaming,15.6,IPS Panel Full HD 1920x1080,IPS Panel Full HD,1920.0,1080.0,False,12,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows 10,2.62,66546.7200,Intel,Core i7,2.8
97,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,8,256GB SSD,AMD Radeon 530,Windows 10,1.91,38308.3200,Intel,Core i7,2.7
98,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,4,500GB HDD,AMD Radeon R2,Windows 10,2.10,18594.7200,AMD,,1.5


In [157]:
df.isnull().sum()

Company              0
TypeName             0
Inches               0
ScreenResolution     0
ScreenTechnology     0
ResolutionWidth      0
ResolutionHeight     0
Touchscreen          0
Ram                  0
Memory               0
Gpu                  0
OpSys                0
Weight               0
Price                0
CpuBrand             1
CpuSeries           93
CpuSpeedGHz         84
dtype: int64

In [158]:
df[df[['CpuBrand', 'CpuSeries', 'CpuSpeedGHz']].isnull().any(axis=1)]


Unnamed: 0,Company,TypeName,Inches,ScreenResolution,ScreenTechnology,ResolutionWidth,ResolutionHeight,Touchscreen,Ram,Memory,Gpu,OpSys,Weight,Price,CpuBrand,CpuSeries,CpuSpeedGHz
5,Acer,Notebook,15.6,1366x768,,1366.0,768.0,False,4,500GB HDD,AMD Radeon R5,Windows 10,2.10,21312.0000,AMD,,
11,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,4,500GB HDD,Intel HD Graphics 520,No OS,1.86,18381.0672,Intel,Core i3,
13,Dell,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,4,256GB SSD,AMD Radeon R5 M430,Windows 10,2.20,26581.3920,Intel,Core i3,
14,Apple,Ultrabook,12,IPS Panel Retina Display 2304x1440,IPS Panel Retina Display,2304.0,1440.0,False,8,256GB SSD,Intel HD Graphics 615,macOS,0.92,67260.6720,Intel,,1.2
21,HP,Notebook,15.6,1366x768,,1366.0,768.0,False,4,500GB HDD,AMD Radeon R2,No OS,1.86,13746.2400,AMD,,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,6,1.0TB Hybrid,AMD Radeon R7 M440,Windows 10,2.04,29303.4672,AMD,,2.9
1245,Asus,Ultrabook,13.3,IPS Panel Full HD 1920x1080,IPS Panel Full HD,1920.0,1080.0,False,8,512GB SSD,Intel HD Graphics 515,Windows 10,1.20,38841.1200,Intel,,0.9
1250,HP,Notebook,15.6,Full HD 1920x1080,Full HD,1920.0,1080.0,False,6,1.0TB Hybrid,AMD Radeon R7 M440,Windows 10,2.04,29303.4672,AMD,,2.9
1259,Asus,Ultrabook,13.3,IPS Panel Full HD 1920x1080,IPS Panel Full HD,1920.0,1080.0,False,8,512GB SSD,Intel HD Graphics 515,Windows 10,1.20,38841.1200,Intel,,0.9


In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define features (X) and target (y) from the DataFrame
X = df.drop('Price', axis=1)
y = df['Price']

# Check for complex values in the data
if np.iscomplexobj(X) or np.iscomplexobj(y):
    raise ValueError("Data contains complex numbers. Please convert them using np.real or np.abs before proceeding.")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model (metrics explained later)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

ValueError: could not convert string to float: 'HP'

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# تأكد أن df موجود ومحتوي على بيانات
print("Data preview:")
print(df.head())
print(df.info())

# Define features (X) and target (y) from the DataFrame
X = df.drop('Price', axis=1)
y = df['Price']

# تنظيف القيم المركبة بتحويلها إلى الجزء الحقيقي فقط
X = X.apply(np.real)
y = np.real(y)

# تأكد من خلو البيانات من القيم المركبة
assert not np.iscomplexobj(X), "X still contains complex numbers!"
assert not np.iscomplexobj(y), "y still contains complex numbers!"

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


Data preview:
   Unnamed: 0 Company   TypeName Inches                    ScreenResolution  \
0         0.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   
1         1.0   Apple  Ultrabook   13.3                            1440x900   
2         2.0      HP   Notebook   15.6                   Full HD 1920x1080   
3         3.0   Apple  Ultrabook   15.4  IPS Panel Retina Display 2880x1800   
4         4.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   

                          Cpu   Ram               Memory  \
0        Intel Core i5 2.3GHz   8GB            128GB SSD   
1        Intel Core i5 1.8GHz   8GB  128GB Flash Storage   
2  Intel Core i5 7200U 2.5GHz   8GB            256GB SSD   
3        Intel Core i7 2.7GHz  16GB            512GB SSD   
4        Intel Core i5 3.1GHz   8GB            256GB SSD   

                            Gpu  OpSys  Weight        Price  
0  Intel Iris Plus Graphics 640  macOS  1.37kg   71378.6832  
1        Intel HD Graphics

ValueError: could not convert string to float: 'HP'

In [8]:
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2:.2f}")


NameError: name 'y_pred' is not defined

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# تأكد أن df موجود
print(df.head())
print(df.info())

# تحويل النصوص إلى أعمدة رقمية (one-hot encoding)
df_encoded = pd.get_dummies(df)

# فصل المتغيرات المستقلة والتابعة
X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

# إزالة القيم المركبة إن وجدت
X = X.apply(np.real)
y = np.real(y)

# تقسيم البيانات
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# تدريب النموذج
model = LinearRegression()
model.fit(X_train, y_train)

# التنبؤ
y_pred = model.predict(X_test)

# التقييم
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


   Unnamed: 0 Company   TypeName Inches                    ScreenResolution  \
0         0.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   
1         1.0   Apple  Ultrabook   13.3                            1440x900   
2         2.0      HP   Notebook   15.6                   Full HD 1920x1080   
3         3.0   Apple  Ultrabook   15.4  IPS Panel Retina Display 2880x1800   
4         4.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   

                          Cpu   Ram               Memory  \
0        Intel Core i5 2.3GHz   8GB            128GB SSD   
1        Intel Core i5 1.8GHz   8GB  128GB Flash Storage   
2  Intel Core i5 7200U 2.5GHz   8GB            256GB SSD   
3        Intel Core i7 2.7GHz  16GB            512GB SSD   
4        Intel Core i5 3.1GHz   8GB            256GB SSD   

                            Gpu  OpSys  Weight        Price  
0  Intel Iris Plus Graphics 640  macOS  1.37kg   71378.6832  
1        Intel HD Graphics 6000  macOS  

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Preview the data
print(df.head())
print(df.info())

# معالجة القيم الفارغة (اختر واحدة من الطريقتين)
df = df.fillna(df.mean(numeric_only=True))  # الأفضل
# df = df.dropna()  # البديل

# تحويل القيم النصية إلى أرقام
df_encoded = pd.get_dummies(df)

# فصل المتغيرات المستقلة والتابعة
X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

# تنظيف أي قيم مركبة
X = X.apply(np.real)
y = np.real(y)

# تقسيم البيانات
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# تدريب النموذج
model = LinearRegression()
model.fit(X_train, y_train)

# التنبؤ
y_pred = model.predict(X_test)

# التقييم
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


   Unnamed: 0 Company   TypeName Inches                    ScreenResolution  \
0         0.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   
1         1.0   Apple  Ultrabook   13.3                            1440x900   
2         2.0      HP   Notebook   15.6                   Full HD 1920x1080   
3         3.0   Apple  Ultrabook   15.4  IPS Panel Retina Display 2880x1800   
4         4.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   

                          Cpu   Ram               Memory  \
0        Intel Core i5 2.3GHz   8GB            128GB SSD   
1        Intel Core i5 1.8GHz   8GB  128GB Flash Storage   
2  Intel Core i5 7200U 2.5GHz   8GB            256GB SSD   
3        Intel Core i7 2.7GHz  16GB            512GB SSD   
4        Intel Core i5 3.1GHz   8GB            256GB SSD   

                            Gpu  OpSys  Weight        Price  
0  Intel Iris Plus Graphics 640  macOS  1.37kg   71378.6832  
1        Intel HD Graphics 6000  macOS  