## Prediction with lineer regression of the data

In [455]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [456]:
df = pd.read_csv('data_cleaned.csv')

In [457]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand       174 non-null    object
 1   processor   174 non-null    object
 2   ram         174 non-null    int64 
 3   video-card  174 non-null    object
 4   memory      174 non-null    int64 
 5   price       174 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 8.3+ KB
None


In [458]:
df["brand"] = df["brand"].astype("category")
df["processor"] = df["processor"].astype("category")
df["video-card"] = df["video-card"].astype("category")
df["ram"] = df["ram"].astype("int")
df["memory"] = df["memory"].astype("int")
df["price"] = df["price"].astype("int")

In [459]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   brand       174 non-null    category
 1   processor   174 non-null    category
 2   ram         174 non-null    int64   
 3   video-card  174 non-null    category
 4   memory      174 non-null    int64   
 5   price       174 non-null    int64   
dtypes: category(3), int64(3)
memory usage: 5.9 KB
None


In [460]:
categorical_features = ['brand', 'processor', 'video-card']
numerical_features = ['ram', 'memory']

In [461]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [462]:
X = df.drop('price', axis=1)
y = df['price']

In [463]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [464]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [465]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preparation', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [466]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [467]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 138293980.0934827
RMSE: 11759.846091402842
R^2: -0.10739409367216668


In [468]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

29
[-1.20401782e+03  9.20475106e+01 -1.15771459e+04  5.34526141e+03
 -7.46061687e+03  2.09583573e+03  2.72036916e+03  8.87629651e+03
  1.64109271e+04  1.39489321e+02  5.07056034e+04 -1.25713262e+04
 -1.00909557e+04 -2.09430983e+04 -1.45702271e+04 -9.08041267e+03
 -1.81271826e+04 -2.03939089e+04 -2.83150837e+04  2.35046507e+04
 -1.42630002e+04 -1.24861319e+04 -1.34015670e+03  6.84884368e+00
 -2.75290005e+03  1.16979782e+04  6.48032326e+03  2.05748972e+04
  3.54136660e+04]


In [469]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
ram -1204.0178235627834
memory 92.0475105543246


In [470]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
Asus -11577.145937197518
Dell 5345.261414272989
GIGABYTE -7460.616874466623
Hp 2095.835730209441
Lenovo 2720.3691574743775
MSI 8876.296509707496
AMD Ryzen 5 -11577.145937197518
AMD Ryzen 7 5345.261414272989
AMD Ryzen 9 -7460.616874466623
Intel Core Ultra 5 2095.835730209441
Intel Core Ultra 7 2720.3691574743775
Intel Core i3 8876.296509707496
Intel Core i5 16410.927103865622
Intel Core i7 139.4893214962611
AMD Radeon 660M Graphics -11577.145937197518
AMD Radeon 740M 5345.261414272989
AMD Radeon Graphics -7460.616874466623
Intel Arc Graphics 2095.835730209441
Intel HD Graphics 2720.3691574743775
Intel Iris Xe Graphics 8876.296509707496
Intel UHD Graphics 16410.927103865622
Nvidia GeForce MX570 139.4893214962611
Nvidia GeForce RTX™ 3050 50705.60344972721
Nvidia GeForce RTX™ 4050 -12571.326170340699
Nvidia GeForce RTX™ 4060 -10090.955709510748
Nvidia GeForce RTX™ 4070 -20943.098267907906
Nvidia GeForce RTX™ 5050 -14570.227058450886


In [471]:
new_data = pd.DataFrame({
    'brand': ['Lenovo'],
    'processor': ['Intel Core i5'],
    'video-card': ['Nvidia GeForce RTX™ 4060'],
    'ram': [16],
    'memory': [512],
})

print(model.predict(new_data))

[43586.82596398]


In [472]:
print(df[(df['brand'] == 'Lenovo') & (df['processor'] == 'Intel Core i5') & (df['video-card'] == 'Nvidia GeForce RTX™ 4060')])

      brand      processor  ram                video-card  memory  price
4    Lenovo  Intel Core i5   32  Nvidia GeForce RTX™ 4060     512  37449
5    Lenovo  Intel Core i5   32  Nvidia GeForce RTX™ 4060    1024  38699
6    Lenovo  Intel Core i5   32  Nvidia GeForce RTX™ 4060    1024  38649
107  Lenovo  Intel Core i5   16  Nvidia GeForce RTX™ 4060    1024  37199
108  Lenovo  Intel Core i5   16  Nvidia GeForce RTX™ 4060     512  35899
118  Lenovo  Intel Core i5   16  Nvidia GeForce RTX™ 4060    1024  36899
137  Lenovo  Intel Core i5   12  Nvidia GeForce RTX™ 4060    1024  36899


In [473]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [474]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.70))

-0.10739409367216668
0.0116566093815762
0.9395931370134096


In [475]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = tolerance_percentage_r2(y_test, y_pred, 0.70)

In [476]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 138293980.0934827
RMSE: 11759.846091402842
R^2: 0.9395931370134096


In [477]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

29
[-1.20401782e+03  9.20475106e+01 -1.15771459e+04  5.34526141e+03
 -7.46061687e+03  2.09583573e+03  2.72036916e+03  8.87629651e+03
  1.64109271e+04  1.39489321e+02  5.07056034e+04 -1.25713262e+04
 -1.00909557e+04 -2.09430983e+04 -1.45702271e+04 -9.08041267e+03
 -1.81271826e+04 -2.03939089e+04 -2.83150837e+04  2.35046507e+04
 -1.42630002e+04 -1.24861319e+04 -1.34015670e+03  6.84884368e+00
 -2.75290005e+03  1.16979782e+04  6.48032326e+03  2.05748972e+04
  3.54136660e+04]


In [478]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
ram -1204.0178235627834
memory 92.0475105543246


In [479]:
new_data = pd.DataFrame({
    'brand': ['Lenovo'],
    'processor': ['Intel Core i5'],
    'video-card': ['Nvidia GeForce RTX™ 4060'],
    'ram': [16],
    'memory': [512],
})

print(model.predict(new_data))

[43586.82596398]


In [485]:
new_data = pd.DataFrame({
    'brand': ['Acer'],
    'processor': ['Intel Core i7'],
    'video-card': ['Nvidia GeForce RTX™ 3060'],
    'ram': [16],
    'memory': [512],
})

print(model.predict(new_data))

[39875.94793714]


In [498]:
print(df[(df['brand'] == 'Lenovo') & (df['processor'] == 'Intel Core i7') & (df['video-card'] == 'Nvidia GeForce RTX™ 4060')])

     brand      processor  ram                video-card  memory  price
86  Lenovo  Intel Core i7   32  Nvidia GeForce RTX™ 4060    1024  62399


In [501]:
new_data = pd.DataFrame({
    'brand': ['Lenovo'],
    'processor': ['Intel Core i7'],
    'video-card': ['Nvidia GeForce RTX™ 4060'],
    'ram': [16],
    'memory': [512],
})

print(model.predict(new_data))

[49076.64035355]
