# Load libraries and datasets

Loading all the required libraries and datasets for the classification of mobile price range.

In [None]:
# Data Preprocessing
import pandas as pd 
import numpy as np 

# Data Visualization 
import seaborn as sns 
import matplotlib.pyplot as plt

# ML Models 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 
from sklearn.linear_model import LogisticRegression 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Miscellaneous
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices raised by pandas / seaborn / scikit-learn calls below.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline 

In [None]:
# Read the train and test CSVs of the mobile-price dataset into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/mobile-price-classification/train.csv',
        '../input/mobile-price-classification/test.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Keep untouched snapshots of both frames so that later changes to
# `train` / `test` can never lose the data as it was loaded.
train_original, test_original = train.copy(), test.copy()

In [None]:
# Column names of the training frame.
train.columns

In [None]:
# Remove the `id` column from the test frame (presumably a row identifier that
# train lacks - the models below predict on `test` using the train feature columns).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: the original raised KeyError on a second execution.
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Column names of the test frame after dropping `id`.
test.columns

In [None]:
# Show the dtype of every column in the training frame.
train.dtypes

In [None]:
# Shapes (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Number of distinct values in the `price_range` label column.
train['price_range'].nunique()    

In [None]:
# Frequency table: number of training rows per `price_range` value.
train['price_range'].value_counts()

In [None]:
# normalize=True reports each label's share of the rows (proportions) instead of raw counts.
train['price_range'].value_counts(normalize=True)    

In [None]:
# Bar chart of the number of training rows in each price class.
price_range_counts = train['price_range'].value_counts()
price_range_counts.plot(kind='bar')

# Visualize features

In [None]:
# Relative frequency of every distinct battery_power value, drawn in the
# left cell of a 1x2 subplot grid on figure 1.
plt.figure(1)
plt.subplot(1, 2, 1)
battery_power_share = train['battery_power'].value_counts(normalize=True)
battery_power_share.plot(kind='bar', figsize=(30, 20), title='battery_power');

In [None]:
# Distribution (histogram + KDE) and box plot of battery_power, side by side.
# sns.distplot is deprecated in seaborn and slated for removal;
# histplot(kde=True, stat='density') is its documented replacement.
# Explicit axes replace the pyplot state machine so each plot's target is unambiguous.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(train['battery_power'], kde=True, stat='density', ax=ax_dist)
train['battery_power'].plot.box(ax=ax_box)
plt.show()

In [None]:
# Distribution (histogram + KDE) and box plot of four_g, side by side.
# sns.distplot is deprecated in seaborn and slated for removal;
# histplot(kde=True, stat='density') is its documented replacement.
# Explicit axes replace the pyplot state machine so each plot's target is unambiguous.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(train['four_g'], kde=True, stat='density', ax=ax_dist)
train['four_g'].plot.box(ax=ax_box)
plt.show()

In [None]:
# Distribution (histogram + KDE) and box plot of int_memory, side by side.
# sns.distplot is deprecated in seaborn and slated for removal;
# histplot(kde=True, stat='density') is its documented replacement.
# Explicit axes replace the pyplot state machine so each plot's target is unambiguous.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(train['int_memory'], kde=True, stat='density', ax=ax_dist)
train['int_memory'].plot.box(ax=ax_box)
plt.show()

# Independent Variables vs. Target Variable

In [None]:
# For each discrete feature, cross-tabulate it against price_range and plot the
# share of every price class within each feature value as a stacked bar chart
# (each crosstab row is divided by its row total, so every bar sums to 1).
# One loop replaces nine copy-pasted crosstab/plot pairs; the title tells the
# otherwise identical-looking charts apart.
discrete_features = [
    'blue', 'dual_sim', 'fc', 'four_g', 'n_cores',
    'sc_h', 'three_g', 'wifi', 'touch_screen',
]

for feature in discrete_features:
    counts = pd.crosstab(train[feature], train['price_range'])
    shares = counts.div(counts.sum(axis=1).astype(float), axis=0)
    shares.plot(kind='bar', stacked=True, figsize=(4, 4), title=feature)



In [None]:
# Bar chart of the mean `ram` within each price class.
mean_ram_by_class = train.groupby('price_range')['ram'].mean()
mean_ram_by_class.plot(kind='bar')

In [None]:
# Bar chart of the mean `battery_power` within each price class.
mean_battery_by_class = train.groupby('price_range')['battery_power'].mean()
mean_battery_by_class.plot(kind='bar')

In [None]:
# Bar chart of the mean `int_memory` within each price class.
mean_memory_by_class = train.groupby('price_range')['int_memory'].mean()
mean_memory_by_class.plot(kind='bar')

In [None]:
# Heat map of the pairwise correlations between the columns of `train`.
# The colour scale is capped at 0.8 and the cells are drawn square.
matrix = train.corr()
f, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    matrix,
    vmax=0.8,
    square=True,
    cmap='BuPu',
    ax=ax,
)

# Finding missing values

In [None]:
# Count of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Count of missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Split the training frame into the feature matrix X and the target vector y.
# `columns=` replaces the positional axis argument (`drop('price_range', 1)`),
# which pandas 2.0 no longer accepts.
X = train.drop(columns='price_range')
y = train['price_range']

In [None]:
# Summary statistics of the rows whose price_range is 0.
is_class0 = train_original['price_range'] == 0
class0 = train_original.loc[is_class0]
class0.describe()

In [None]:
# Summary statistics of the rows whose price_range is 1.
is_class1 = train_original['price_range'] == 1
class1 = train_original.loc[is_class1]
class1.describe()

In [None]:
# Summary statistics of the rows whose price_range is 2.
is_class2 = train_original['price_range'] == 2
class2 = train_original.loc[is_class2]
class2.describe()

In [None]:
# Summary statistics of the rows whose price_range is 3.
is_class3 = train_original['price_range'] == 3
class3 = train_original.loc[is_class3]
class3.describe()

In [None]:
# Shapes of the four per-class subsets, in class order 0..3.
tuple(subset.shape for subset in (class0, class1, class2, class3))

# Stratified k-fold cross validation Decision tree model

In [None]:
# 5-fold stratified cross-validation of a decision tree.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
l = 0  # running sum of the per-fold accuracies

# kf.split yields positional row numbers, so rows are selected with .iloc;
# the label-based .loc / y[...] lookups only worked while the index was 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    l += score

# `model`, `xvl` and `yvl` below are the ones left over from the last fold only.
pred_test = model.predict(test)
# Second probability column (index 1) on the last validation fold.
pred = model.predict_proba(xvl)[:, 1]
# Average over the actual number of folds instead of a hard-coded 5.
print('\n accuracy :', l / kf.n_splits)

In [None]:

# One-vs-rest ROC curves for the model and validation fold left by the previous cell.
# Each class has to be scored with its own probability column: feeding the
# class-1 column to roc_curve for every pos_label produced wrong curves for
# classes 0, 2 and 3. Column i is class i here because the labels are 0..3
# (predict_proba columns follow model.classes_).
class_proba = model.predict_proba(xvl)

fpr = {}
tpr = {}
thresh = {}
n_classes = 4
for i in range(n_classes):
    fpr[i], tpr[i], thresh[i] = roc_curve(yvl, class_proba[:, i], pos_label=i)

plt.figure(figsize=(12, 8))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label="Class {}".format(i))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# Stratified k-fold cross validation Random forest model

In [None]:
# 5-fold stratified cross-validation of a random forest limited to depth 10.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
l = 0  # running sum of the per-fold accuracies

# kf.split yields positional row numbers, so rows are selected with .iloc;
# the label-based .loc / y[...] lookups only worked while the index was 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(random_state=1, max_depth=10)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    l += score

# `model`, `xvl` and `yvl` below are the ones left over from the last fold only.
pred_test = model.predict(test)
# Second probability column (index 1) on the last validation fold.
pred = model.predict_proba(xvl)[:, 1]
# Average over the actual number of folds instead of a hard-coded 5.
print('\n accuracy :', l / kf.n_splits)

In [None]:

# One-vs-rest ROC curves for the model and validation fold left by the previous cell.
# Each class has to be scored with its own probability column: feeding the
# class-1 column to roc_curve for every pos_label produced wrong curves for
# classes 0, 2 and 3. Column i is class i here because the labels are 0..3
# (predict_proba columns follow model.classes_).
class_proba = model.predict_proba(xvl)

fpr = {}
tpr = {}
thresh = {}
n_classes = 4
for i in range(n_classes):
    fpr[i], tpr[i], thresh[i] = roc_curve(yvl, class_proba[:, i], pos_label=i)

plt.figure(figsize=(12, 8))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label="Class {}".format(i))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# Hyperparameter tuning

In [None]:
# Grid of candidate hyperparameters: odd depths 1..19 and
# tree counts 1, 21, 41, ..., 181.
paramgrid = dict(
    max_depth=list(range(1, 20, 2)),
    n_estimators=list(range(1, 200, 20)),
)

In [None]:
# Grid search over `paramgrid` for a random forest with a fixed seed.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=paramgrid,
)

In [None]:
# Hold out 30% of the rows as x_cv / y_cv; the grid search is fitted on the other 70%.
# train_test_split is already imported in the notebook's first cell, so the
# redundant mid-notebook import was removed.
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Fit every max_depth / n_estimators combination from `paramgrid`
# (10 x 10 = 100 candidates) on the 70% training split.
grid_search.fit(x_train, y_train)

In [None]:
# Display the estimator that the grid search selected as best.
grid_search.best_estimator_

In [None]:
#  Now let’s build the model using these optimized values.

In [None]:
# 5-fold stratified cross-validation of a random forest built with the
# hyperparameters selected by `grid_search`.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
l = 0  # running sum of the per-fold accuracies

# Read the tuned values from the fitted search rather than hard-coding them
# (this cell previously pinned max_depth=17, n_estimators=161), so the model
# always matches what the grid search actually selected.
best_params = grid_search.best_params_

# kf.split yields positional row numbers, so rows are selected with .iloc;
# the label-based .loc / y[...] lookups only worked while the index was 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(random_state=1, **best_params)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    l += score

# `model`, `xvl` and `yvl` below are the ones left over from the last fold only.
pred_test = model.predict(test)
# Second probability column (index 1) on the last validation fold.
pred2 = model.predict_proba(xvl)[:, 1]
# Average over the actual number of folds instead of a hard-coded 5.
print('\n score', l / kf.n_splits)

In [None]:
# One-vs-rest ROC curves for the model and validation fold left by the previous cell.
# Each class has to be scored with its own probability column: feeding the
# class-1 column to roc_curve for every pos_label produced wrong curves for
# classes 0, 2 and 3. Column i is class i here because the labels are 0..3
# (predict_proba columns follow model.classes_).
class_proba = model.predict_proba(xvl)

fpr = {}
tpr = {}
thresh = {}
n_classes = 4
for i in range(n_classes):
    fpr[i], tpr[i], thresh[i] = roc_curve(yvl, class_proba[:, i], pos_label=i)

plt.figure(figsize=(12, 8))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label="Class {}".format(i))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

#  Stratified k-fold cross validation XGBOOST model

In [None]:
# 5-fold stratified cross-validation of an XGBoost classifier
# (50 trees, maximum depth 4).
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
l = 0  # running sum of the per-fold accuracies

# kf.split yields positional row numbers, so rows are selected with .iloc;
# the label-based .loc / y[...] lookups only worked while the index was 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = XGBClassifier(n_estimators=50, max_depth=4)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    l += score

# `model`, `xvl` and `yvl` below are the ones left over from the last fold only.
pred_test = model.predict(test)
# Second probability column (index 1) on the last validation fold.
pred3 = model.predict_proba(xvl)[:, 1]
# Average over the actual number of folds instead of a hard-coded 5.
print('\n score', l / kf.n_splits)

In [None]:
# One-vs-rest ROC curves for the model and validation fold left by the previous cell.
# Each class has to be scored with its own probability column: feeding the
# class-1 column to roc_curve for every pos_label produced wrong curves for
# classes 0, 2 and 3. Column i is class i here because the labels are 0..3
# (predict_proba columns follow model.classes_).
class_proba = model.predict_proba(xvl)

fpr = {}
tpr = {}
thresh = {}
n_classes = 4
for i in range(n_classes):
    fpr[i], tpr[i], thresh[i] = roc_curve(yvl, class_proba[:, i], pos_label=i)

plt.figure(figsize=(12, 8))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label="Class {}".format(i))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# Feature importance

In [None]:
# 80/20 train/test split used by the SVC permutation-importance experiment below.
# random_state is fixed so the split (and every score derived from it) is
# reproducible on Restart & Run All; the original split was unseeded.
# Note: this rebinds x_train / y_train from the earlier grid-search split.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Fit an SVC (gamma='scale') and report its accuracy on the training
# and held-out splits as a (train, test) pair.
clf = SVC(gamma='scale')
clf.fit(x_train, y_train)
train_accuracy = clf.score(x_train, y_train)
test_accuracy = clf.score(x_test, y_test)
train_accuracy, test_accuracy

In [None]:
# Permutation importance: shuffle one feature column at a time and record the
# SVC's training accuracy with that column scrambled. The lower the score,
# the more the model relied on that feature.
# A seeded generator makes the shuffles - and therefore the ranking -
# reproducible; the original used the unseeded global NumPy RNG.
rng = np.random.default_rng(1)

results = {}
for name, col in x_train.items():
    temp_x_train = x_train.copy()
    temp_x_train[name] = rng.permutation(col.to_numpy())
    results[name] = clf.score(temp_x_train, y_train)


In [None]:
# Rank the features by accuracy-after-shuffling, lowest first
# (i.e. the features whose shuffling hurt the score most come first).
permuted_scores = pd.Series(results)
feature_imp = permuted_scores.sort_values(ascending=True)
feature_imp

In [None]:
# Horizontal bar chart of the accuracy obtained after shuffling each feature.
feature_imp.plot(kind='barh')

In [None]:
# Remove the `ram` column from both the training features and the test frame.
# NOTE(review): if `ram` is the feature with the LOWEST shuffled score above, it
# is the most important one, and dropping it is expected to hurt - confirm intent.
X_after_drop = X.drop(columns=['ram'])
test_after_drop = test.drop(columns=['ram'])

In [None]:
# 5-fold stratified cross-validation of the same XGBoost configuration
# (50 trees, maximum depth 4) on the feature set without `ram`.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
l = 0  # running sum of the per-fold accuracies

# kf.split yields positional row numbers, so rows are selected with .iloc;
# the label-based .loc / y[...] lookups only worked while the index was 0..n-1.
for i, (train_index, test_index) in enumerate(kf.split(X_after_drop, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X_after_drop.iloc[train_index], X_after_drop.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    model = XGBClassifier(n_estimators=50, max_depth=4)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    l += score

# `model`, `xvl` and `yvl` below are the ones left over from the last fold only.
pred_test = model.predict(test_after_drop)
# Second probability column (index 1) on the last validation fold.
pred3 = model.predict_proba(xvl)[:, 1]
# Average over the actual number of folds instead of a hard-coded 5.
print('\n score', l / kf.n_splits)

In [None]:
# Preview the reduced test frame instead of dumping the whole DataFrame.
test_after_drop.head()

In [None]:
# Preview the reduced feature matrix instead of dumping the whole DataFrame.
X_after_drop.head()

# Conclusion

- After evaluating a decision tree, a random forest, a random forest with tuned hyperparameters, and XGBoost, we found that XGBoost achieves a higher accuracy score than all of the other models.
- After applying the feature-importance technique and removing a feature, the performance of the model is reduced, so XGBoost with the full feature set gives the best accuracy.


# If you like this notebook please do upvote!!