In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.pandas.set_option('display.max_columns',None)
#pd.pandas.set_option('display.max_rows',None)

In [2]:
df=pd.read_csv('smartphones_cleaned_v6.csv')

In [3]:
df.head()

Unnamed: 0,brand_name,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
0,oneplus,OnePlus 11 5G,54999,89.0,True,True,False,snapdragon,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,android,50.0,16.0,0,,1440,3216
1,oneplus,OnePlus Nord CE 2 Lite 5G,19989,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,android,64.0,16.0,1,1024.0,1080,2412
2,samsung,Samsung Galaxy A14 5G,16499,75.0,True,False,False,exynos,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,android,50.0,13.0,1,1024.0,1080,2408
3,motorola,Motorola Moto G62 5G,14999,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,,6.0,128.0,6.55,120,3,1.0,android,50.0,16.0,1,1024.0,1080,2400
4,realme,Realme 10 Pro Plus,24999,82.0,True,False,False,dimensity,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,android,108.0,16.0,0,,1080,2412


In [4]:
print(df.shape)

(980, 26)


In [5]:
features_with_null=[features for features in df.columns if df[features].isnull().sum()>0]

for feature in features_with_null:
    print(feature, df[feature].isnull().mean() ,  ' % missing values')

rating 0.10306122448979592  % missing values
processor_brand 0.02040816326530612  % missing values
num_cores 0.006122448979591836  % missing values
processor_speed 0.04285714285714286  % missing values
battery_capacity 0.011224489795918367  % missing values
fast_charging 0.2153061224489796  % missing values
num_front_cameras 0.004081632653061225  % missing values
os 0.014285714285714285  % missing values
primary_camera_front 0.00510204081632653  % missing values
extended_upto 0.4897959183673469  % missing values


In [6]:
features_nan=[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes=='O']

for feature in features_nan:
    print("{}: {}% missing values".format(feature,df[feature].isnull().mean()))

processor_brand: 0.02040816326530612% missing values
os: 0.014285714285714285% missing values


In [7]:
def replace_cat_feature(dataset,features_nan):
    data=df.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

df=replace_cat_feature(df,features_nan)

df[features_nan].isnull().sum()

processor_brand    0
os                 0
dtype: int64

In [8]:
## Now lets check for numerical variables the contains missing values
numerical_with_nan=[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes!='O']

## We will print the numerical nan variables and percentage of missing values

for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,df[feature].isnull().mean()))

rating: 0.10306122448979592% missing value
num_cores: 0.006122448979591836% missing value
processor_speed: 0.04285714285714286% missing value
battery_capacity: 0.011224489795918367% missing value
fast_charging: 0.2153061224489796% missing value
num_front_cameras: 0.004081632653061225% missing value
primary_camera_front: 0.00510204081632653% missing value
extended_upto: 0.4897959183673469% missing value


In [9]:
for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=df[feature].median()
    
    ## create a new feature to capture nan values
    #df[feature+'nan']=np.where(df[feature].isnull(),1,0)
    df[feature].fillna(median_value,inplace=True)
    
df[numerical_with_nan].isnull().sum()

rating                  0
num_cores               0
processor_speed         0
battery_capacity        0
fast_charging           0
num_front_cameras       0
primary_camera_front    0
extended_upto           0
dtype: int64

In [10]:
df.head()

Unnamed: 0,brand_name,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
0,oneplus,OnePlus 11 5G,54999,89.0,True,True,False,snapdragon,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,android,50.0,16.0,0,1024.0,1440,3216
1,oneplus,OnePlus Nord CE 2 Lite 5G,19989,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,android,64.0,16.0,1,1024.0,1080,2412
2,samsung,Samsung Galaxy A14 5G,16499,75.0,True,False,False,exynos,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,android,50.0,13.0,1,1024.0,1080,2408
3,motorola,Motorola Moto G62 5G,14999,81.0,True,False,False,snapdragon,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.55,120,3,1.0,android,50.0,16.0,1,1024.0,1080,2400
4,realme,Realme 10 Pro Plus,24999,82.0,True,False,False,dimensity,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,android,108.0,16.0,0,1024.0,1080,2412


In [11]:
bn=pd.get_dummies(df['brand_name'])
pb=pd.get_dummies(df['processor_brand'])
os_=pd.get_dummies(df['os'])

df.drop(['brand_name','processor_brand','model','os'],axis=1,inplace=True)


In [12]:
df.head()

Unnamed: 0,price,rating,has_5g,has_nfc,has_ir_blaster,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
0,54999,89.0,True,True,False,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,50.0,16.0,0,1024.0,1440,3216
1,19989,81.0,True,False,False,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,64.0,16.0,1,1024.0,1080,2412
2,16499,75.0,True,False,False,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,50.0,13.0,1,1024.0,1080,2408
3,14999,81.0,True,False,False,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.55,120,3,1.0,50.0,16.0,1,1024.0,1080,2400
4,24999,82.0,True,False,False,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,108.0,16.0,0,1024.0,1080,2412


In [13]:
df=pd.concat([df,bn,pb,os_],axis=1)

In [14]:
df.head()

Unnamed: 0,price,rating,has_5g,has_nfc,has_ir_blaster,num_cores,processor_speed,battery_capacity,fast_charging_available,fast_charging,ram_capacity,internal_memory,screen_size,refresh_rate,num_rear_cameras,num_front_cameras,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height,apple,asus,blackview,blu,cat,cola,doogee,duoqin,gionee,google,honor,huawei,ikall,infinix,iqoo,itel,jio,lava,leeco,leitz,lenovo,letv,lg,lyf,micromax,motorola,nokia,nothing,nubia,oneplus,oppo,oukitel,poco,realme,redmi,royole,samsung,sharp,sony,tcl,tecno,tesla,vertu,vivo,xiaomi,zte,Missing,bionic,dimensity,exynos,fusion,google.1,helio,kirin,mediatek,sc9863a,snapdragon,spreadtrum,tiger,unisoc,Missing.1,android,ios,other
0,54999,89.0,True,True,False,8.0,3.2,5000.0,1,100.0,12.0,256.0,6.7,120,3,1.0,50.0,16.0,0,1024.0,1440,3216,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False
1,19989,81.0,True,False,False,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.59,120,3,1.0,64.0,16.0,1,1024.0,1080,2412,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False
2,16499,75.0,True,False,False,8.0,2.4,5000.0,1,15.0,4.0,64.0,6.6,90,3,1.0,50.0,13.0,1,1024.0,1080,2408,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
3,14999,81.0,True,False,False,8.0,2.2,5000.0,1,33.0,6.0,128.0,6.55,120,3,1.0,50.0,16.0,1,1024.0,1080,2400,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False
4,24999,82.0,True,False,False,8.0,2.6,5000.0,1,67.0,6.0,128.0,6.7,120,3,1.0,108.0,16.0,0,1024.0,1080,2412,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False


In [15]:
x=df.iloc[:,1:]

In [16]:
y=df.iloc[:,0]

In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(x)

In [18]:
scaler.transform(x)

array([[1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.72413793, 1.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.51724138, 1.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.86206897, 1.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.68965517, 1.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.48275862, 1.        , 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=0)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
model=RandomForestClassifier()

In [22]:
model.fit(x_train,y_train)

In [23]:
pred=model.predict(x_test)

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [25]:
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 266801748.1734694
Mean Absolute Error: 6992.336734693878
R-squared: 0.7662332873865066
