In [1]:
import pandas as pd


In [2]:
# Load the input CSV (the file name suggests it is the output of a
# missing-value imputation step).
df = pd.read_csv(filepath_or_buffer="/content/after_missing_value_imputation.csv")

In [3]:
# Inspect the column names of the loaded frame.
df.columns

Index(['name', 'price', 'brand', 'no_of_votes', 'rating', 'os', 'utility',
       'thickness', 'weight', 'warranty', 'display_size', 'ppi',
       'aspect_ratio', 'antiglare', 'touch_screen', 'ram', 'hdd', 'ssd',
       'graphic', 'cache', 'thread', 'core', 'hdmi', 'mcr', 'wifi',
       'bluetooth', 'backlit_keyboard', 'inbuilt_microphone', 'thunderbolt',
       'fingerprint_sensor', 'ethernet', 'display_port', 'camera',
       'num_of_cell', 'battery_capacity', 'pixel_width', 'pixel_height',
       'usb3', 'usb2', 'type_c', 'processor_brand', 'processor_model',
       'processor_gen'],
      dtype='object')

In [4]:
# "cache", "core" and "thread" are expected to be highly correlated with each other.
# To guard against multicollinearity we compute the variance inflation factor (VIF)
# of each one; a column whose VIF comes out greater than 6 is a candidate for removal.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# add_constant adds a 'const' column (it shows up as its own row in the printed table).
X = add_constant(df[['cache', 'core', 'thread']])
vif = pd.DataFrame({
    'Variable': X.columns,
    'VIF': [variance_inflation_factor(X.values, position) for position in range(len(X.columns))],
})
print(vif)


  Variable        VIF
0    const   6.832836
1    cache   4.207299
2     core   6.071841
3   thread  10.047743


In [5]:
# Re-display the column names before choosing which columns to drop.
df.columns

Index(['name', 'price', 'brand', 'no_of_votes', 'rating', 'os', 'utility',
       'thickness', 'weight', 'warranty', 'display_size', 'ppi',
       'aspect_ratio', 'antiglare', 'touch_screen', 'ram', 'hdd', 'ssd',
       'graphic', 'cache', 'thread', 'core', 'hdmi', 'mcr', 'wifi',
       'bluetooth', 'backlit_keyboard', 'inbuilt_microphone', 'thunderbolt',
       'fingerprint_sensor', 'ethernet', 'display_port', 'camera',
       'num_of_cell', 'battery_capacity', 'pixel_width', 'pixel_height',
       'usb3', 'usb2', 'type_c', 'processor_brand', 'processor_model',
       'processor_gen'],
      dtype='object')

In [6]:
# Remove 'thread' (the highest VIF in the check above) together with
# 'name', 'no_of_votes', 'rating' and 'camera'; df itself is left unchanged.
train_df = df.drop(
    ["thread", "name", "no_of_votes", "rating", "camera"],
    axis="columns",
)

In [7]:
# Confirm which columns remain after the drop.
train_df.columns

Index(['price', 'brand', 'os', 'utility', 'thickness', 'weight', 'warranty',
       'display_size', 'ppi', 'aspect_ratio', 'antiglare', 'touch_screen',
       'ram', 'hdd', 'ssd', 'graphic', 'cache', 'core', 'hdmi', 'mcr', 'wifi',
       'bluetooth', 'backlit_keyboard', 'inbuilt_microphone', 'thunderbolt',
       'fingerprint_sensor', 'ethernet', 'display_port', 'num_of_cell',
       'battery_capacity', 'pixel_width', 'pixel_height', 'usb3', 'usb2',
       'type_c', 'processor_brand', 'processor_model', 'processor_gen'],
      dtype='object')

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Work on a copy so train_df keeps its original string categories.
data_label_encoded = train_df.copy()

categorical_cols = train_df.select_dtypes(include=['object']).columns

# Ordinal-encode every categorical column with its own encoder. The fitted encoders
# are kept in `encoders` (column name -> encoder) so the integer codes can later be
# mapped back to category names or re-applied to new data.
# NOTE(review): the printed categories_ are in lexicographic order, so the codes do
# not reflect any natural ordering (e.g. 'medium' < 'slim' < 'thick').
# NOTE(review): the brand categories contain both 'ASUS' and 'Asus'; consider
# normalising the spelling upstream.
encoders = {}
for col in categorical_cols:
    oe = OrdinalEncoder()
    # fit_transform returns an (n_rows, 1) array; flatten it before assigning to a single column.
    data_label_encoded[col] = oe.fit_transform(data_label_encoded[[col]]).ravel()
    encoders[col] = oe
    print(oe.categories_)

# Separate the predictors from the target (price); no train/test split happens here.
X_label = data_label_encoded.drop('price', axis=1)
y_label = data_label_encoded['price']

[array(['ASUS', 'AXL', 'Acer', 'Apple', 'Asus', 'Avita', 'Chuwi', 'Dell',
       'Fujitsu', 'Gigabyte', 'HP', 'Honor', 'Huawei', 'Infinix', 'LG',
       'Lenovo', 'MSI', 'Ninkear', 'Primebook', 'Razer', 'Realme',
       'Samsung', 'Ultimus', 'Vaio', 'Walker', 'Wings', 'Xiaomi'],
      dtype=object)]
[array(['Mac', 'Others', 'Windows'], dtype=object)]
[array(['Business', 'Everyday Use', 'Gaming', 'Performance'], dtype=object)]
[array(['medium', 'slim', 'thick'], dtype=object)]
[array(['heavy', 'lite', 'medium'], dtype=object)]
[array(['large', 'medium', 'small'], dtype=object)]
[array(['16:10', '16:9', '3:2'], dtype=object)]
[array(['AMD', 'Integrated', 'NVIDIA'], dtype=object)]
[array(['AMD', 'Apple', 'Intel', 'Mediatek'], dtype=object)]
[array(['1255U', '3', '3045B', '3050U', '5', '7', '7040', '7120U',
       '7940HS', '9', 'M1', 'M2', 'MTK8788', 'N305', 'N4020', 'N4500',
       'N5100', 'N6000', 'i3', 'i5', 'i7', 'i9'], dtype=object)]


## Technique-1
# Correlation analysis

In [10]:
# Pearson correlation of every encoded feature with the target.
# The target's own row is removed by label rather than by position, so the result
# stays correct even if 'price' is not the first column of the frame.
fi_df1 = (
    data_label_encoded.corr()['price']
    .drop('price')
    .rename_axis('feature')
    .reset_index(name='corr_coeff')
)
fi_df1

Unnamed: 0,feature,corr_coeff
0,brand,0.005122
1,os,0.019464
2,utility,-0.050737
3,thickness,0.279172
4,weight,-0.342268
5,warranty,0.171335
6,display_size,-0.405146
7,ppi,0.512422
8,aspect_ratio,-0.191674
9,antiglare,0.082817


# Technique-2 Random Forest Feature Importance

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest (fixed seed) on the ordinal-encoded features.
rf_label = RandomForestRegressor(random_state=42, n_estimators=100)
rf_label.fit(X_label, y_label)

# One row per feature with the forest's importance score, largest first.
fi_df2 = pd.DataFrame({'feature': X_label.columns, 'rf_importance': rf_label.feature_importances_})
fi_df2 = fi_df2.sort_values(by='rf_importance', ascending=False)

fi_df2

Unnamed: 0,feature,rf_importance
28,battery_capacity,0.493674
11,ram,0.227653
35,processor_model,0.029954
27,num_of_cell,0.028777
13,ssd,0.027932
15,cache,0.027819
16,core,0.02557
7,ppi,0.021517
30,pixel_height,0.016319
29,pixel_width,0.015812


# Technique-3 Gradient Boosting Feature Importance

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

# Train a gradient boosting regressor on the ordinal-encoded data.
# random_state is pinned (same seed as the random forests in this notebook) so the
# importances below are reproducible after a kernel restart.
gb_label = GradientBoostingRegressor(random_state=42)
gb_label.fit(X_label, y_label)

# Pair every feature with its importance score and list the strongest first.
fi_df3 = pd.DataFrame({
    'feature': X_label.columns,
    'gb_importance': gb_label.feature_importances_
}).sort_values(by='gb_importance', ascending=False)

fi_df3

Unnamed: 0,feature,gb_importance
28,battery_capacity,0.416906
11,ram,0.200779
13,ssd,0.074487
27,num_of_cell,0.068817
16,core,0.046365
35,processor_model,0.038108
7,ppi,0.029492
29,pixel_width,0.028755
15,cache,0.02168
30,pixel_height,0.013214


# Technique-4 Permutation Importance

In [13]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the importances are measured on that held-out part.
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(
    X_label,
    y_label,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh forest on the training split only (this rebinds rf_label).
rf_label = RandomForestRegressor(random_state=42, n_estimators=100)
rf_label.fit(X_train_label, y_train_label)

# Permute each feature 30 times on the held-out split.
perm_importance = permutation_importance(
    rf_label,
    X_test_label,
    y_test_label,
    n_repeats=30,
    random_state=42,
)

# Mean importance per feature over the repeats, largest first.
fi_df4 = pd.DataFrame(
    {
        'feature': X_label.columns,
        'permutation_importance': perm_importance.importances_mean,
    }
)
fi_df4 = fi_df4.sort_values(by='permutation_importance', ascending=False)

fi_df4

Unnamed: 0,feature,permutation_importance
28,battery_capacity,0.308005
11,ram,0.048939
16,core,0.037357
21,backlit_keyboard,0.03396
27,num_of_cell,0.020911
15,cache,0.012873
13,ssd,0.012402
35,processor_model,0.011203
36,processor_gen,0.009659
2,utility,0.007057


# Technique-5 Lasso

In [15]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Standardize the features so the coefficient magnitudes are comparable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_label)

# L1-regularised linear model with regularisation strength alpha=0.01.
# NOTE(review): the coefficients displayed below are almost the same as the plain
# LinearRegression coefficients later in the notebook, so this alpha shrinks very little.
lasso = Lasso(random_state=42, alpha=0.01)
lasso.fit(X_scaled, y_label)

# One coefficient per feature, ordered by signed value (not by magnitude).
fi_df5 = pd.DataFrame({'feature': X_label.columns, 'lasso_coeff': lasso.coef_})
fi_df5 = fi_df5.sort_values(by='lasso_coeff', ascending=False)

fi_df5

Unnamed: 0,feature,lasso_coeff
11,ram,18177.095111
13,ssd,12728.156114
27,num_of_cell,12652.607939
35,processor_model,10497.352235
16,core,10053.931618
30,pixel_height,8171.95886
14,graphic,5448.228402
28,battery_capacity,4863.180331
15,cache,4170.393448
8,aspect_ratio,3654.861813


# Technique-6 RFE

In [17]:
from sklearn.feature_selection import RFE

# Base estimator; seeded so that rfe_score is reproducible across runs.
estimator = RandomForestRegressor(random_state=42)

# NOTE(review): n_features_to_select equals the total number of features, so RFE
# eliminates nothing here; it only fits the forest on every column of X_label
# (ordinal-encoded, not standardized). rfe_score is therefore another random-forest
# importance rather than an elimination ranking. It is kept this way because the
# merged importance table later in the notebook needs a score for every feature.
selector_label = RFE(estimator, n_features_to_select=X_label.shape[1], step=1)
selector_label = selector_label.fit(X_label, y_label)

# Features retained by RFE (all of them, given the setting above).
selected_features = X_label.columns[selector_label.support_]

# Feature importances of the forest that RFE fitted on the retained features.
selected_coefficients = selector_label.estimator_.feature_importances_

# Organize the results into a DataFrame, largest score first.
fi_df6 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

fi_df6

Unnamed: 0,feature,rfe_score
28,battery_capacity,0.457389
11,ram,0.238343
27,num_of_cell,0.040257
16,core,0.034763
15,cache,0.030369
13,ssd,0.028337
7,ppi,0.025534
35,processor_model,0.023126
30,pixel_height,0.018998
29,pixel_width,0.012585


# Technique-7 Linear Regression Weights

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Ordinary least squares on the standardized, ordinal-encoded features
# (fitted on all rows of X_scaled, not on a train split).
lin_reg = LinearRegression()
lin_reg.fit(X_scaled, y_label)

# One coefficient per feature, ordered by signed value.
fi_df7 = pd.DataFrame({'feature': X_label.columns, 'reg_coeffs': lin_reg.coef_})
fi_df7 = fi_df7.sort_values(by='reg_coeffs', ascending=False)

fi_df7

Unnamed: 0,feature,reg_coeffs
11,ram,18177.09
13,ssd,12728.14
27,num_of_cell,12652.58
35,processor_model,10497.95
16,core,10053.97
30,pixel_height,8171.953
14,graphic,5448.255
28,battery_capacity,4863.186
15,cache,4170.352
8,aspect_ratio,3654.879


In [21]:
# Join the seven per-technique tables on 'feature' (default inner join) and
# index the result by feature name.
final_fi_df = (
    fi_df1
    .merge(fi_df2, on='feature')
    .merge(fi_df3, on='feature')
    .merge(fi_df4, on='feature')
    .merge(fi_df5, on='feature')
    .merge(fi_df6, on='feature')
    .merge(fi_df7, on='feature')
    .set_index('feature')
)


In [22]:
# Display the merged table: one row per feature, one column per technique.
final_fi_df


Unnamed: 0_level_0,corr_coeff,rf_importance,gb_importance,permutation_importance,lasso_coeff,rfe_score,reg_coeffs
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
brand,0.005122,0.008298,0.001181,0.004931,1225.401031,0.007719,1225.409
os,0.019464,0.000651,0.001282,0.002184,-1119.902734,0.00062,-1119.911
utility,-0.050737,0.006434,0.004494,0.007057,-3646.531652,0.006824,-3646.543
thickness,0.279172,0.002404,0.001848,-0.007978,-918.526462,0.003278,-918.5609
weight,-0.342268,0.004442,0.002719,-0.006347,986.533937,0.003221,986.5415
warranty,0.171335,0.002438,0.001286,0.001541,2680.531397,0.002728,2680.53
display_size,-0.405146,0.002636,0.001527,0.000775,-2750.484301,0.003597,-2750.503
ppi,0.512422,0.021517,0.029492,0.000385,2952.753881,0.025534,2952.743
aspect_ratio,-0.191674,0.001177,0.000275,0.00232,3654.861813,0.002803,3654.879
antiglare,0.082817,0.001198,0.000164,0.001923,1516.315884,0.001272,1516.337


In [23]:
# normalize the score
final_fi_df = final_fi_df.divide(final_fi_df.sum(axis=0), axis=1)

In [25]:
# Average the four normalized importance scores per feature, highest first.
(
    final_fi_df[['rf_importance', 'gb_importance', 'permutation_importance', 'rfe_score']]
    .mean(axis='columns')
    .sort_values(ascending=False)
)


Unnamed: 0_level_0,0
feature,Unnamed: 1_level_1
battery_capacity,0.508494
ram,0.193149
core,0.046869
num_of_cell,0.045767
ssd,0.039393
processor_model,0.028853
cache,0.026926
backlit_keyboard,0.026095
ppi,0.019343
pixel_height,0.015362


In [26]:
# Baseline with all the columns: 5-fold cross-validated R^2 of a random forest.
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(random_state=42, n_estimators=100)

scores = cross_val_score(estimator=rf, X=X_label, y=y_label, scoring='r2', cv=5)

In [27]:
# Mean R^2 over the five folds when every feature is used.
scores.mean()

0.8436912981835512

In [28]:
# Same 5-fold evaluation with 'weight', 'thickness', 'usb3' and 'bluetooth' removed.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

scores = cross_val_score(
    rf,
    X_label.drop(["weight", "thickness", "usb3", "bluetooth"], axis="columns"),
    y_label,
    scoring='r2',
    cv=5,
)

In [29]:
# Mean R^2 over the five folds with weight, thickness, usb3 and bluetooth removed.
scores.mean()

0.8452334317921526

In [30]:
# Build the export frame: the encoded features minus the four dropped columns,
# with the target appended as the last column.
export_df = (
    X_label
    .drop(["weight", "thickness", "usb3", "bluetooth"], axis="columns")
    .assign(price=y_label)
)

In [31]:
# Write the selected features and the target to disk without the row index.
export_df.to_csv(path_or_buf="final_data.csv", index=False)