In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error

In [3]:
# Load the laptop dataset; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = "laptop_data_cleaned.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# DataFrame.info() writes its report to stdout on its own and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1273 entries, 0 to 1272
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1273 non-null   object 
 1   TypeName     1273 non-null   object 
 2   Ram          1273 non-null   int64  
 3   Weight       1273 non-null   float64
 4   Price        1273 non-null   float64
 5   TouchScreen  1273 non-null   int64  
 6   Ips          1273 non-null   int64  
 7   Ppi          1273 non-null   float64
 8   Cpu_brand    1273 non-null   object 
 9   HDD          1273 non-null   int64  
 10  SSD          1273 non-null   int64  
 11  Gpu_brand    1273 non-null   object 
 12  Os           1273 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 129.4+ KB
None


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

               Ram       Weight        Price  TouchScreen          Ips  \
count  1273.000000  1273.000000  1273.000000  1273.000000  1273.000000   
mean      8.447761     2.041100    10.828218     0.146897     0.279654   
std       5.098771     0.669241     0.619565     0.354142     0.449006   
min       2.000000     0.690000     9.134616     0.000000     0.000000   
25%       4.000000     1.500000    10.387379     0.000000     0.000000   
50%       8.000000     2.040000    10.872255     0.000000     0.000000   
75%       8.000000     2.310000    11.287447     0.000000     1.000000   
max      64.000000     4.700000    12.691441     1.000000     1.000000   

               Ppi          HDD          SSD  
count  1273.000000  1273.000000  1273.000000  
mean    146.950812   413.715632   186.252946  
std      42.926775   518.054486   186.531571  
min      90.583402     0.000000     0.000000  
25%     127.335675     0.000000     0.000000  
50%     141.211998     0.000000   256.000000  
75% 

In [6]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Company        0
TypeName       0
Ram            0
Weight         0
Price          0
TouchScreen    0
Ips            0
Ppi            0
Cpu_brand      0
HDD            0
SSD            0
Gpu_brand      0
Os             0
dtype: int64


In [7]:
# Drop every row that contains a missing value. Rebinding the result instead
# of using inplace=True avoids mutating the frame that earlier cells displayed
# and keeps the step chainable; the resulting data is the same.
df = df.dropna()

In [8]:
# Replace each categorical column with integer codes.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`
# so every mapping can later be inverted (inverse_transform) or re-applied to
# new rows. Refitting a single shared encoder would keep only the mapping of
# the last column processed.
#
# TouchScreen and Ips are already 0/1 integer flags; encoding them leaves the
# values unchanged and they stay in the list for backward compatibility.
cat_cols = ['Company', 'TypeName', 'TouchScreen', 'Ips', 'Cpu_brand', 'Gpu_brand', 'Os']
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('Os'),
# exactly as it was when one shared encoder was reused.


In [9]:
# Both targets are derived from the Price column; everything else is a feature.
price = df['Price']
X = df.drop(columns=['Price'])

# Regression target: the Price values as stored in the file.
y_regression = price

# Binary classification target: split Price into two equal-frequency bins,
# labelled 0 for the lower half and 1 for the upper half.
y_classification = pd.qcut(price, q=2, labels=[0, 1])

In [10]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): this fit sees every row of X, including rows that end up in
# the test split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [11]:
# Split the *unscaled* features once, passing both targets through the same
# call so the classification and regression sets contain identical rows
# (the same rows two separate calls with random_state=42 would select).
(
    X_train_raw, X_test_raw,
    y_train_c, y_test_c,
    y_train_r, y_test_r,
) = train_test_split(
    X, y_classification, y_regression, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only. X_scaled from the earlier cell was
# standardised with statistics computed over all rows, which lets test-set
# means/variances leak into the training features, so it is not used here.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Both tasks share the same feature matrices; keep the task-specific names
# that the modelling cells expect.
X_train_c = X_train_r = X_train_scaled
X_test_c = X_test_r = X_test_scaled


In [12]:
# Baseline classifier: a seeded 100-tree random forest on the binary price target.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_c, y_train_c)

# Score it on the held-out rows.
y_pred_rf_clf = rf_clf.predict(X_test_c)
rf_clf_accuracy = accuracy_score(y_test_c, y_pred_rf_clf)
print("RandomForestClassifier Accuracy:", rf_clf_accuracy)

RandomForestClassifier Accuracy: 0.8823529411764706


In [13]:
# Soft-voting ensemble: average the class probabilities of logistic
# regression, an SVM and the random forest.
log_clf = LogisticRegression()

# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation; seeding it keeps the probabilities
# (and therefore the soft vote) reproducible from run to run.
svc_clf = SVC(probability=True, random_state=42)

# VotingClassifier fits clones of the estimators it is given, so passing the
# already-fitted rf_clf does not reuse or alter that fitted model.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svc_clf), ('rf', rf_clf)],
    voting='soft',
)
voting_clf.fit(X_train_c, y_train_c)

y_pred_vote = voting_clf.predict(X_test_c)
print("VotingClassifier Accuracy:", accuracy_score(y_test_c, y_pred_vote))

VotingClassifier Accuracy: 0.8666666666666667


In [14]:
# Stacked ensemble: logistic regression and the SVM are the base learners, and
# a random forest is trained on their cross-validated predictions.
# The final estimator is seeded so the meta-learner is not a source of
# run-to-run variation in the reported accuracy.
stacking_clf = StackingClassifier(
    estimators=[('lr', log_clf), ('svc', svc_clf)],
    final_estimator=RandomForestClassifier(random_state=42),
)
stacking_clf.fit(X_train_c, y_train_c)

y_pred_stack = stacking_clf.predict(X_test_c)
print("StackingClassifier Accuracy:", accuracy_score(y_test_c, y_pred_stack))

StackingClassifier Accuracy: 0.8509803921568627


In [15]:
# Regression counterpart: a seeded 100-tree random forest predicting Price.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_r, y_train_r)

# Root-mean-squared error on the held-out rows, in the units of the Price column.
y_pred_rf_reg = rf_reg.predict(X_test_r)
rf_reg_mse = mean_squared_error(y_test_r, y_pred_rf_reg)
rf_reg_rmse = np.sqrt(rf_reg_mse)
print("RandomForestRegressor RMSE:", rf_reg_rmse)

RandomForestRegressor RMSE: 0.21887999083591494


In [16]:
# Persist the fitted preprocessing together with the models: every model was
# trained on standardised features, so it cannot be applied to new raw rows
# without the same fitted scaler.
joblib.dump(scaler, 'scaler.pkl')

# Persist the trained models. The stacking dump stays last so the cell's
# displayed value is unchanged.
joblib.dump(rf_clf, 'random_forest_classifier.pkl')
joblib.dump(rf_reg, 'random_forest_regressor.pkl')
joblib.dump(voting_clf, 'voting_classifier.pkl')
joblib.dump(stacking_clf, 'stacking_classifier.pkl')

['stacking_classifier.pkl']