# Loading the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the laptop CSV and discard the 'Unnamed: 0' column
# (presumably a saved row index — confirm against the file).
path = "data/laptopData.csv"
df = pd.read_csv(path).drop(columns=['Unnamed: 0'])


In [None]:
# Display the loaded DataFrame.
df

In [None]:
# List the column labels.
df.columns

# Data Exploration

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Remove rows in which every column is missing. dropna returns a new frame,
# so the result must be bound back to `df` or the rows are never actually removed.
df = df.dropna(how='all')
df

In [None]:
# Re-count missing values per column.
df.isnull().sum()

In [None]:
# Flag, per column, whether at least one value is missing.
df.isnull().any()

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
# Number of rows.
len(df)

In [None]:
# Number of columns.
len(df.columns)

In [None]:
# Boolean Series marking rows that repeat an earlier row.
df.duplicated()

In [None]:
# Total number of rows flagged as duplicates.
df.duplicated().sum()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Column dtypes before any conversion.
df.dtypes

## Convert the "Inches" column from object to float

In [None]:
# Parse 'Inches' as numbers; entries that cannot be parsed become NaN (errors='coerce').
df['Inches'] = pd.to_numeric(df['Inches'], errors='coerce')

In [None]:
# Check the dtypes after converting 'Inches'.
df.dtypes

## Cpu columns

In [None]:
# Cast the Cpu column to Python strings, then display it.
# NOTE(review): astype(str) turns missing values into the literal text 'nan',
# so they no longer count as null afterwards — confirm this is intended.
df['Cpu'] = df['Cpu'].astype(str)
df['Cpu']

In [None]:
# The first whitespace-separated token of the Cpu string is stored as the CPU company.
df['Cpu_Company'] = df['Cpu'].str.split().str[0]
df['Cpu_Company']

In [None]:
# The clock speed is taken from the last whitespace-separated token of the Cpu
# string with the 'GHz' suffix removed (assumes tokens look like '2.3GHz').
# to_numeric(errors='coerce') turns a malformed token into NaN instead of aborting
# the whole cell; the final astype keeps the column float.
cpu_speed_text = df['Cpu'].str.split().str[-1].str.replace('GHz', '', regex=False)
df['Cpu_Speed_GHz'] = pd.to_numeric(cpu_speed_text, errors='coerce').astype(float)
df['Cpu_Speed_GHz']

In [None]:
# Tokenise the Cpu string once; the second and third tokens, joined by a space,
# become the brand label.
cpu_tokens = df['Cpu'].str.split()
df['Cpu_Brand'] = cpu_tokens.str[1] + ' ' + cpu_tokens.str[2]
df['Cpu_Brand']

In [None]:
# Check the dtypes after adding the Cpu-derived columns.
df.dtypes

## Cleaning Ram

In [None]:
# Strip the 'GB' suffix and parse Ram as a number; unparseable entries become NaN.
ram_numeric = pd.to_numeric(
    df['Ram'].astype(str).str.replace('GB', ''),
    errors='coerce',
)
# Fill the gaps with the column mean, then store whole numbers.
mean_ram = ram_numeric.mean()
df['Ram'] = ram_numeric.fillna(mean_ram).astype(int)

In [None]:
# Check the dtypes after cleaning Ram.
df.dtypes

## Cleaning Memory

In [None]:
def convert_size(x):
    """Convert a storage description such as '256GB SSD' or '1TB HDD' to gigabytes.

    Parameters
    ----------
    x : str or missing value
        One storage segment. Case and surrounding whitespace are ignored, and
        extra words (e.g. 'Flash Storage', 'Hybrid') do not affect parsing.

    Returns
    -------
    float or int
        The size in GB, with 1 TB counted as 1024 GB. Returns 0 when ``x`` is
        not a string or holds no ``<number>GB`` / ``<number>TB`` quantity.
    """
    import re  # local import keeps this cell self-contained

    if not isinstance(x, str):
        return 0
    # Read only the number directly in front of the unit, so any trailing
    # description cannot break the float conversion.
    match = re.search(r'(\d*\.?\d+)\s*(TB|GB)', x.upper())
    if match is None:
        return 0
    size = float(match.group(1))
    return size * 1024 if match.group(2) == 'TB' else size

def _size_if_kind(part, kind):
    """Return the GB size of one memory segment if it mentions `kind`, else 0."""
    if isinstance(part, str) and kind in part.upper():
        return convert_size(part)
    return 0


# A Memory entry may list several drives joined by '+'; give each drive its own column.
mem_split = df['Memory'].str.split('+', expand=True)

# Per-segment sizes, keeping only the segments of the requested drive type.
ssd_vals = mem_split.apply(lambda col: col.map(lambda part: _size_if_kind(part, 'SSD')))
hdd_vals = mem_split.apply(lambda col: col.map(lambda part: _size_if_kind(part, 'HDD')))

# Total capacity per drive type for each row, stored as whole GB.
df['Memory_SSD'] = ssd_vals.sum(axis=1).astype(int)
df['Memory_HDD'] = hdd_vals.sum(axis=1).astype(int)


In [None]:
# Check the dtypes after adding Memory_SSD and Memory_HDD.
df.dtypes

## Cleaning Gpu

In [None]:
# The first whitespace-separated token of the Gpu string is stored as the GPU brand.
df['Gpu_Brand'] = df['Gpu'].str.split().str[0]
df['Gpu_Brand']

In [None]:
# Frequency of each GPU brand.
df['Gpu_Brand'].value_counts()

In [None]:
# Check the dtypes after adding Gpu_Brand.
df.dtypes

## Cleaning Weight

In [None]:
# Remove the 'kg' suffix and surrounding whitespace, then parse as a number;
# anything that still cannot be parsed becomes NaN.
weight_text = df['Weight'].str.replace('kg', '').str.strip()
df['Weight'] = pd.to_numeric(weight_text, errors='coerce')

In [None]:
# Display the cleaned Weight column.
df['Weight']

In [None]:
# Check the dtypes after cleaning Weight.
df.dtypes

## Cleaning OpSys

In [None]:
# Normalise OpSys labels: lower-case them and trim surrounding whitespace.
df['OpSys'] = df['OpSys'].str.lower().str.strip()

In [None]:
# Frequency of each operating-system label.
df['OpSys'].value_counts()

In [None]:
# Count missing values per column after the cleaning steps above.
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value and rebind `df` to the result.
df = df.dropna()

In [None]:
# Re-count missing values per column.
df.isnull().sum()

In [None]:
# Column dtypes at this stage.
df.dtypes

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

## Missing Values

In [None]:
# Text placeholders that should be treated as missing values.
missing_like = ['?', 'NA', 'N/A', '', 'None', '-']

# Rebind the result instead of using inplace=True, to avoid the pandas warning.
df = df.replace(to_replace=missing_like, value=np.nan)

# Per-column count of missing values after the replacement.
missing_counts = df.isnull().sum()
missing_counts

In [None]:
# Distinct values present in the Memory column.
df['Memory'].unique()

In [None]:
# Fill missing Memory entries with the most frequent value.
mode_mem = df['Memory'].mode()[0]
# fillna returns a new Series, so it must be assigned back to take effect.
df['Memory'] = df['Memory'].fillna(mode_mem)
df['Memory']

In [None]:
# Distinct values present in the Memory column at this point.
df['Memory'].unique()

In [None]:
# List the column labels, including the derived feature columns.
df.columns

## Cleaning the Outliers for Inches

In [None]:
# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Inches'].quantile(0.25), df['Inches'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Inches', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Inches (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Inches'] = df['Inches'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Inches', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Inches (After)')

plt.show()


## Cleaning the Outliers for Price

In [None]:
# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Price'].quantile(0.25), df['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Price', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Price (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Price'] = df['Price'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Price', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Price (After)')

plt.show()

## Cleaning the Outliers for Ram

In [None]:
# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Ram'].quantile(0.25), df['Ram'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Ram', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Ram (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Ram'] = df['Ram'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Ram', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Ram (After)')

plt.show()

## Cleaning the Outliers for Weight

In [None]:
# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Weight'].quantile(0.25), df['Weight'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Weight', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Weight (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Weight'] = df['Weight'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Weight', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Weight (After)')

plt.show()

## Cleaning the Outliers for Cpu_Speed_GHz

In [None]:
# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Cpu_Speed_GHz'].quantile(0.25), df['Cpu_Speed_GHz'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Cpu_Speed_GHz', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Cpu_Speed_GHz (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Cpu_Speed_GHz'] = df['Cpu_Speed_GHz'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Cpu_Speed_GHz', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Cpu_Speed_GHz (After)')

plt.show()

## Cleaning the Outliers for Memory_SSD

In [None]:
import matplotlib.pyplot as plt

# IQR fences: values further than 1.5 * IQR outside the quartiles count as outliers.
Q1, Q3 = df['Memory_SSD'].quantile(0.25), df['Memory_SSD'].quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print('Lower Limit:', lower_limit, 'Upper Limit:', upper_limit)

# Two panels side by side: the column before clipping (left) and after (right).
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='Memory_SSD', grid=False, ax=ax_before)
ax_before.set_title('Boxplot of Memory_SSD (Before)')

# Cap values at the fences instead of dropping rows, so the row count is unchanged.
df['Memory_SSD'] = df['Memory_SSD'].clip(lower=lower_limit, upper=upper_limit)

df.boxplot(column='Memory_SSD', grid=False, ax=ax_after)
ax_after.set_title('Boxplot of Memory_SSD (After)')

plt.show()

## Check the Outliers for Memory_HDD

In [None]:
# Visual outlier check only: draw one boxplot of Memory_HDD; the column is not modified.
df.boxplot(column='Memory_HDD', grid=False, figsize=(6,4))
plt.title('Boxplot of Memory_HDD (Before)')
plt.show()
# Author's reading of the plot: no outliers found, so no clipping is applied here.

# Accuracy Level

In [None]:
#!pip install scikit-learn

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One-hot encode the categorical columns; drop_first removes one level per feature.
df_encoded = pd.get_dummies(df, drop_first=True)

# Price is the regression target; every other encoded column is a predictor.
y = df_encoded['Price']
X = df_encoded.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
