# Import Tools

In [1]:
import pandas as pd 
import numpy as np


In [2]:
import pickle
import xgboost as xgb # type: ignore

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# LOAD DATASET

In [3]:
df= pd.read_excel("Phone_Sales_Dataset.xlsx")

In [4]:
df

Unnamed: 0,brand_name,os,popularity,best_price,lowest_price,highest_price,sellers_amount,screen_size,memory_size,battery_size
0,ALCATEL,Android,422,1690,1529.0,1819.0,36,5.00,8.0,2000.0
1,ALCATEL,Android,323,1803,1659.0,2489.0,36,5.00,16.0,2000.0
2,ALCATEL,Android,299,1803,1659.0,2489.0,36,5.00,16.0,2000.0
3,ALCATEL,Android,287,1803,1659.0,2489.0,36,5.00,16.0,2000.0
4,Nokia,Android,1047,1999,,,10,5.71,16.0,3000.0
...,...,...,...,...,...,...,...,...,...,...
1219,Apple,iOS,1101,22685,16018.0,27900.0,61,6.50,64.0,3174.0
1220,Apple,iOS,530,24600,21939.0,33720.0,28,6.50,64.0,3174.0
1221,HUAWEI,Android,1174,8804,7999.0,9999.0,18,6.26,128.0,3750.0
1222,ZTE,Android,752,18755,18500.0,19010.0,2,6.65,128.0,4500.0


# Data Cleaning Process

In [5]:
df.shape

(1224, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand_name      1224 non-null   object 
 1   os              1027 non-null   object 
 2   popularity      1224 non-null   int64  
 3   best_price      1224 non-null   int64  
 4   lowest_price    964 non-null    float64
 5   highest_price   964 non-null    float64
 6   sellers_amount  1224 non-null   int64  
 7   screen_size     1222 non-null   float64
 8   memory_size     1112 non-null   float64
 9   battery_size    1214 non-null   float64
dtypes: float64(5), int64(3), object(2)
memory usage: 95.8+ KB


In [7]:
df.isnull().sum()

brand_name          0
os                197
popularity          0
best_price          0
lowest_price      260
highest_price     260
sellers_amount      0
screen_size         2
memory_size       112
battery_size       10
dtype: int64

In [8]:
df['memory_size'] = df['memory_size'].fillna(df['memory_size'].median())

df['battery_size'] = df['battery_size'].fillna(df['battery_size'].median())

df['screen_size'] = df['screen_size'].fillna(df['screen_size'].median())

df['lowest_price'] = df['lowest_price'].fillna(df['best_price'])

df['highest_price'] = df['highest_price'].fillna(df['best_price'])

df['os'] = df['os'].fillna('Android')

In [9]:
df

Unnamed: 0,brand_name,os,popularity,best_price,lowest_price,highest_price,sellers_amount,screen_size,memory_size,battery_size
0,ALCATEL,Android,422,1690,1529.0,1819.0,36,5.00,8.0,2000.0
1,ALCATEL,Android,323,1803,1659.0,2489.0,36,5.00,16.0,2000.0
2,ALCATEL,Android,299,1803,1659.0,2489.0,36,5.00,16.0,2000.0
3,ALCATEL,Android,287,1803,1659.0,2489.0,36,5.00,16.0,2000.0
4,Nokia,Android,1047,1999,1999.0,1999.0,10,5.71,16.0,3000.0
...,...,...,...,...,...,...,...,...,...,...
1219,Apple,iOS,1101,22685,16018.0,27900.0,61,6.50,64.0,3174.0
1220,Apple,iOS,530,24600,21939.0,33720.0,28,6.50,64.0,3174.0
1221,HUAWEI,Android,1174,8804,7999.0,9999.0,18,6.26,128.0,3750.0
1222,ZTE,Android,752,18755,18500.0,19010.0,2,6.65,128.0,4500.0


In [10]:
df.isnull().sum()

brand_name        0
os                0
popularity        0
best_price        0
lowest_price      0
highest_price     0
sellers_amount    0
screen_size       0
memory_size       0
battery_size      0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
df.dtypes

brand_name         object
os                 object
popularity          int64
best_price          int64
lowest_price      float64
highest_price     float64
sellers_amount      int64
screen_size       float64
memory_size       float64
battery_size      float64
dtype: object

# Machine Learning Model To Use Get Right Accuracy

In [13]:
ndf=df.iloc[:,[0,2,3,6,7,8,9]]
ndf

Unnamed: 0,brand_name,popularity,best_price,sellers_amount,screen_size,memory_size,battery_size
0,ALCATEL,422,1690,36,5.00,8.0,2000.0
1,ALCATEL,323,1803,36,5.00,16.0,2000.0
2,ALCATEL,299,1803,36,5.00,16.0,2000.0
3,ALCATEL,287,1803,36,5.00,16.0,2000.0
4,Nokia,1047,1999,10,5.71,16.0,3000.0
...,...,...,...,...,...,...,...
1219,Apple,1101,22685,61,6.50,64.0,3174.0
1220,Apple,530,24600,28,6.50,64.0,3174.0
1221,HUAWEI,1174,8804,18,6.26,128.0,3750.0
1222,ZTE,752,18755,2,6.65,128.0,4500.0


In [14]:
le_brandname = LabelEncoder()
ndf['brand_name'] = le_brandname.fit_transform(df['brand_name'])
ndf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['brand_name'] = le_brandname.fit_transform(df['brand_name'])


Unnamed: 0,brand_name,popularity,best_price,sellers_amount,screen_size,memory_size,battery_size
0,2,422,1690,36,5.0,8.0,2000.0
1,2,323,1803,36,5.0,16.0,2000.0
2,2,299,1803,36,5.0,16.0,2000.0
3,2,287,1803,36,5.0,16.0,2000.0
4,38,1047,1999,10,5.71,16.0,3000.0


In [15]:
x=ndf.drop(["best_price"],axis=1)
y=ndf["best_price"]

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


### Random Forest Regresion

In [17]:
Rfr=RandomForestRegressor(n_estimators=300,random_state=42,criterion='squared_error')


In [18]:
Rfr.fit(x_train,y_train)

In [19]:
y_pred=Rfr.predict(x_test)

In [20]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)

MAE: 1967.0814965986394
MSE: 13624949.808147665
RMSE: 3691.1989662097144
R2 Score: 0.7659517684435825


In [21]:
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [22]:
xgb_model.fit(x_train, y_train)
y_ped = xgb_model.predict(x_test)

In [23]:
mae = mean_absolute_error(y_test, y_ped)
mse = mean_squared_error(y_test, y_ped)
rms = np.sqrt(mse)
r2 = r2_score(y_test, y_ped)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rms)
print("R2 Score:", r2)

MAE: 1933.789931628169
MSE: 11208316.462896015
RMSE: 3347.882384865994
R2 Score: 0.8074644804000854


In [24]:
import pickle

model_path = r"phone_sales_model.pkl"

with open(model_path, "wb") as file:
    pickle.dump(Rfr, file)

print("Model saved successfully at:", model_path)

Model saved successfully at: phone_sales_model.pkl
