## Imports

In [18]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


## Data Preprocessing

In [19]:
# Read the training and test CSV files into DataFrames, then preview the
# first five training rows as a quick sanity check on the load.
train_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\DELL\maids\python\data\train.csv"
)
test_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\DELL\maids\python\data\test.csv"
)
print(train_df.head(n=5))


   battery_power  blue  clock_speed  dual_sim    fc  four_g  int_memory  \
0            842     0          2.2         0   1.0     0.0         7.0   
1           1021     1          0.5         1   0.0     1.0        53.0   
2            563     1          0.5         1   2.0     1.0        41.0   
3            615     1          2.5         0   0.0     0.0        10.0   
4           1821     1          1.2         0  13.0     1.0        44.0   

   m_dep  mobile_wt  n_cores  ...  px_height  px_width     ram  sc_h  sc_w  \
0    0.6      188.0      2.0  ...       20.0     756.0  2549.0   9.0   7.0   
1    0.7      136.0      3.0  ...      905.0    1988.0  2631.0  17.0   3.0   
2    0.9      145.0      5.0  ...     1263.0    1716.0  2603.0  11.0   2.0   
3    0.8      131.0      6.0  ...     1216.0    1786.0  2769.0  16.0   8.0   
4    0.6      141.0      2.0  ...     1208.0    1212.0  1411.0   8.0   2.0   

   talk_time  three_g  touch_screen  wifi  price_range  
0         19        0  

In [20]:

# Every column name this notebook refers to, including the 'id' key and the
# 'price_range' target.
columns = [
    'id',
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc',
    'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time',
    'three_g', 'touch_screen', 'wifi',
    'price_range',
]

# Columns handled as categorical during preprocessing.
categorical_cols = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

# Everything that is neither the id, the target, nor categorical is numeric.
# The order of `columns` is preserved.
numerical_cols = [
    name
    for name in columns
    if name not in ('id', 'price_range', *categorical_cols)
]

print("Categorical columns:", categorical_cols, sep="\n")
print("\nNumerical columns:", numerical_cols, sep="\n")


Categorical columns:
['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

Numerical columns:
['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time']


Handling missing values

In [21]:
# Show per-column non-null counts and dtypes to locate missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             1995 non-null   float64
 5   four_g         1995 non-null   float64
 6   int_memory     1995 non-null   float64
 7   m_dep          1995 non-null   float64
 8   mobile_wt      1996 non-null   float64
 9   n_cores        1996 non-null   float64
 10  pc             1995 non-null   float64
 11  px_height      1996 non-null   float64
 12  px_width       1998 non-null   float64
 13  ram            1998 non-null   float64
 14  sc_h           1999 non-null   float64
 15  sc_w           1999 non-null   float64
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [22]:
# Numeric features: fill missing values with the column mean. The imputer is
# fitted on the training set only and the same statistics are reused for the
# test set.
numerical_imputer = SimpleImputer(strategy='mean')
train_df[numerical_cols] = numerical_imputer.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = numerical_imputer.transform(test_df[numerical_cols])
# Adds a constant placeholder target column to the test frame.
# NOTE(review): presumably so test_df mirrors train_df's columns — confirm it is needed.
test_df['price_range'] = 0

# Categorical flags: fill missing values with the most frequent value of each
# column. The guard used to read `if not categorical_cols:`, which skipped this
# step whenever the list was non-empty, so gaps such as the ones info() reports
# for 'four_g' reached the LabelEncoder without being imputed.
if categorical_cols:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    train_df[categorical_cols] = categorical_imputer.fit_transform(train_df[categorical_cols])
    test_df[categorical_cols] = categorical_imputer.transform(test_df[categorical_cols])

# Encode each categorical column as integer labels. The encoder is re-fitted
# on the training values of every column and then applied to the matching test
# column; after the loop it holds the fit for the last column only.
label_encoder = LabelEncoder()
for col in categorical_cols:
    train_df[col] = label_encoder.fit_transform(train_df[col])
    test_df[col] = label_encoder.transform(test_df[col])

# info() prints its own report and returns None, so it is not wrapped in print().
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   float64
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   float64
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   float64
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   float64
 9   n_cores        2000 non-null   float64
 10  pc             2000 non-null   float64
 11  px_height      2000 non-null   float64
 12  px_width       2000 non-null   float64
 13  ram            2000 non-null   float64
 14  sc_h           2000 non-null   float64
 15  sc_w           2000 non-null   float64
 16  talk_time      2000 non-null   float64
 17  three_g        2000 non-null   int64  
 18  touch_sc

## Training Possible Models

In [23]:
# Separate the target from the features, then hold out 20% of the rows for
# validation. The seed is fixed so the split is reproducible.
y = train_df['price_range']
X = train_df.drop('price_range', axis=1)

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Random Forest

In [24]:
# Fit a random forest on the training split and score it on the validation
# split. fit() returns the estimator itself, so construction and fitting are
# chained.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_val_preds = rf_classifier.predict(X_val)
rf_val_accuracy = accuracy_score(y_true=y_val, y_pred=rf_val_preds)
print("Random Forest Validation Accuracy:", rf_val_accuracy)



Random Forest Validation Accuracy: 0.8925


Gradient Boosting

In [25]:
# Fit a gradient-boosting classifier on the training split and score it on the
# validation split. fit() returns the estimator itself, so construction and
# fitting are chained.
gb_classifier = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_val_preds = gb_classifier.predict(X_val)
gb_val_accuracy = accuracy_score(y_true=y_val, y_pred=gb_val_preds)
print("Gradient Boosting Validation Accuracy:", gb_val_accuracy)

Gradient Boosting Validation Accuracy: 0.9025


SVM

In [26]:
# Fit a support-vector classifier (default settings apart from the seed) on
# the training split and score it on the validation split. fit() returns the
# estimator itself, so construction and fitting are chained.
svm_classifier = SVC(random_state=42).fit(X_train, y_train)
svm_val_preds = svm_classifier.predict(X_val)
svm_val_accuracy = accuracy_score(y_true=y_val, y_pred=svm_val_preds)
print("SVM Validation Accuracy:", svm_val_accuracy)


SVM Validation Accuracy: 0.965


## Saving the Model

In [27]:
from joblib import dump

# Persist the fitted SVM to disk so the prediction API can load it later.
# The call is left as the cell's last expression so the list of written file
# names is displayed as the cell output.
dump(value=svm_classifier, filename='svm_classifier.pkl')

['svm_classifier.pkl']

## Prediction API

In [38]:
from flask import Flask, request, jsonify
import joblib
import numpy as np

app = Flask(__name__)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files produced by this notebook or another trusted source.
model = joblib.load('svm_classifier.pkl')

# Feature order used at training time. Estimators fitted on a DataFrame expose
# it as `feature_names_in_`; the literal list is a fallback that mirrors the
# training columns (train.csv without 'price_range').
FEATURE_NAMES = [
    str(name)
    for name in getattr(model, 'feature_names_in_', [
        'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
        'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
        'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
        'touch_screen', 'wifi',
    ])
]


def _specs_to_row(specs, feature_names):
    """Build one feature row from a JSON object, ordered like the training data.

    Parameters:
        specs: dict mapping feature name to a numeric value. Extra keys are
            ignored.
        feature_names: feature names in the order the model expects.

    Returns:
        A list of floats, one per entry of ``feature_names``.

    Raises:
        ValueError: if a feature is missing, or a value is not a finite number.
    """
    missing = [name for name in feature_names if name not in specs]
    if missing:
        raise ValueError('missing fields: ' + ', '.join(missing))
    try:
        row = [float(specs[name]) for name in feature_names]
    except (TypeError, ValueError):
        raise ValueError('all feature values must be numeric')
    if not np.all(np.isfinite(row)):
        raise ValueError('all feature values must be finite numbers')
    return row


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the price range for one device described by a JSON object.

    Expects a JSON object containing every training feature by name. Keys that
    are not model features (for example 'id', 'price_range', 'mdep', 'ncores')
    are ignored. Responds with ``{"predictedPrice": [...]}`` on success, or
    with ``{"error": ...}`` and HTTP 400 when the body is not a JSON object or
    a feature is missing or non-numeric.
    """
    # silent=True yields None instead of raising on a missing or invalid body.
    specs = request.get_json(silent=True)
    if not isinstance(specs, dict):
        return jsonify({'error': 'request body must be a JSON object'}), 400

    # Select features by name rather than by the order of the JSON keys, so
    # the prediction does not depend on how the client serialises the object.
    try:
        row = _specs_to_row(specs, FEATURE_NAMES)
    except ValueError as exc:
        return jsonify({'error': str(exc)}), 400

    specs_array = np.array([row])
    predicted_price = model.predict(specs_array)
    predicted_price_list = predicted_price.tolist()
    return jsonify({'predictedPrice': predicted_price_list})

if __name__ == '__main__':
    app.run(port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [05/May/2024 00:16:54] "POST /predict HTTP/1.1" 200 -
