In [121]:
#COSC 522 UTK
#Project 2
#Student Name: Jacob Mendez

In [54]:
#load in data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
#Show fewer digits to improve readability
np.set_printoptions(precision=3, suppress=True)


In [40]:
dataset = pd.read_csv("ndtv_data_final.csv")
print(dataset.columns)

Index(['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)',
       'Screen size (inches)', 'Touchscreen', 'Resolution x', 'Resolution y',
       'Processor', 'RAM (MB)', 'Internal storage (GB)', 'Rear camera',
       'Front camera', 'Operating system', 'Wi-Fi', 'Bluetooth', 'GPS',
       'Number of SIMs', '3G', '4G/ LTE', 'Price'],
      dtype='object')


In [41]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,Name,Brand,Model,Battery capacity (mAh),Screen size (inches),Touchscreen,Resolution x,Resolution y,Processor,...,Rear camera,Front camera,Operating system,Wi-Fi,Bluetooth,GPS,Number of SIMs,3G,4G/ LTE,Price
1354,1354,Intex Aqua A2,Intex,Aqua A2,1500,4.0,Yes,480,800,4,...,5.0,0.3,Android,Yes,Yes,Yes,2,Yes,No,2599
1355,1355,Videocon Infinium Z51 Nova+,Videocon,Infinium Z51 Nova+,2000,5.0,Yes,480,854,4,...,8.0,5.0,Android,Yes,Yes,Yes,2,Yes,No,2940
1356,1356,Intex Aqua Y4,Intex,Aqua Y4,1700,4.5,Yes,480,854,2,...,5.0,2.0,Android,Yes,Yes,No,2,Yes,No,2999
1357,1357,iBall Andi4 B20,iBall,Andi4 B20,1250,4.0,Yes,480,800,1,...,2.0,0.3,Android,Yes,Yes,Yes,2,Yes,No,2498
1358,1358,iBall Andi Avonte 5,iBall,Andi Avonte 5,2150,5.0,Yes,480,854,4,...,8.0,0.0,Android,Yes,Yes,Yes,2,Yes,No,3999


In [101]:
### Your code goes here and below

## Data Formation

### One-hot encoding

The feature I chose to one-hot encode was Operating systems because I felt it was the categorical variable that could be encoded the easiest. Name is made up of model and brand, and each one of those would be too complex due to unique entries, whereas OS has 5 distinct values. OS I felt was important enough of a categorical variable to one-hot encode that wouldn't overcomplicate the structure of the data and training of the model.

In [42]:
dataset = pd.get_dummies(dataset, columns=['Operating system'], prefix='', prefix_sep='')
dataset.tail()

Unnamed: 0.1,Unnamed: 0,Name,Brand,Model,Battery capacity (mAh),Screen size (inches),Touchscreen,Resolution x,Resolution y,Processor,...,3G,4G/ LTE,Price,Android,BlackBerry,Cyanogen,Sailfish,Tizen,Windows,iOS
1354,1354,Intex Aqua A2,Intex,Aqua A2,1500,4.0,Yes,480,800,4,...,Yes,No,2599,True,False,False,False,False,False,False
1355,1355,Videocon Infinium Z51 Nova+,Videocon,Infinium Z51 Nova+,2000,5.0,Yes,480,854,4,...,Yes,No,2940,True,False,False,False,False,False,False
1356,1356,Intex Aqua Y4,Intex,Aqua Y4,1700,4.5,Yes,480,854,2,...,Yes,No,2999,True,False,False,False,False,False,False
1357,1357,iBall Andi4 B20,iBall,Andi4 B20,1250,4.0,Yes,480,800,1,...,Yes,No,2498,True,False,False,False,False,False,False
1358,1358,iBall Andi Avonte 5,iBall,Andi Avonte 5,2150,5.0,Yes,480,854,4,...,Yes,No,3999,True,False,False,False,False,False,False


### Remove unwanted features.
I chose to remove most categorical features as they are unnecessary such as name being composed of brand and model. 

In [43]:
dataset.pop('Unnamed: 0')
dataset.pop('Name')
dataset.pop('Brand')
dataset.pop('Model')
dataset.pop('Touchscreen')
dataset.pop('Wi-Fi')
dataset.pop('Bluetooth')
dataset.pop('GPS')
dataset.pop('3G')
dataset.pop('4G/ LTE')

0       Yes
1       Yes
2       Yes
3       Yes
4        No
       ... 
1354     No
1355     No
1356     No
1357     No
1358     No
Name: 4G/ LTE, Length: 1359, dtype: object

In [44]:
dataset.tail()

Unnamed: 0,Battery capacity (mAh),Screen size (inches),Resolution x,Resolution y,Processor,RAM (MB),Internal storage (GB),Rear camera,Front camera,Number of SIMs,Price,Android,BlackBerry,Cyanogen,Sailfish,Tizen,Windows,iOS
1354,1500,4.0,480,800,4,512,8.0,5.0,0.3,2,2599,True,False,False,False,False,False,False
1355,2000,5.0,480,854,4,1000,8.0,8.0,5.0,2,2940,True,False,False,False,False,False,False
1356,1700,4.5,480,854,2,512,4.0,5.0,2.0,2,2999,True,False,False,False,False,False,False
1357,1250,4.0,480,800,1,256,0.512,2.0,0.3,2,2498,True,False,False,False,False,False,False
1358,2150,5.0,480,854,4,1000,8.0,8.0,0.0,2,3999,True,False,False,False,False,False,False


## Building Model

### Overtrained Model

A simple way to overtrain the model is to provide more test data than training data. A higher score with training data over test data indicates the model is remembering the features instead of learning.

#### Split Training and Test Data

In [45]:
train_data = dataset.sample(frac=0.8, random_state=0)
test_data = dataset.drop(train_data.index)

In [46]:
train_features = train_data.copy()
test_features = test_data.copy()

train_labels = train_features.pop('Price')
test_labels = test_features.pop('Price')

### Normalization

In [56]:
mean = train_data.mean(axis=0)
train_data -= mean
std = train_data.std(axis=0)
train_data /= std
test_data -= mean
test_data /= std

The overtrained model

In [57]:
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(17,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [59]:
model.fit(train_features, train_labels, epochs=100, batch_size=8, verbose=1)

Epoch 1/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 111273688.0000 - mae: 5658.0356
Epoch 2/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 109458432.0000 - mae: 5488.9082
Epoch 3/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 67767328.0000 - mae: 4584.6826
Epoch 4/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 96666648.0000 - mae: 5434.4790
Epoch 5/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92392904.0000 - mae: 5093.8257
Epoch 6/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 102190368.0000 - mae: 5551.0635
Epoch 7/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 125284296.0000 - mae: 5369.7134
Epoch 8/100
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 104681608.0000 - ma

<keras.src.callbacks.history.History at 0x7fb47015c530>

loss: 93203440.0000 - mae: 4724.7637

In [60]:
loss, mae = model.evaluate(test_features, test_labels)
print(f"Test Loss: {loss}, Test MAE: {mae}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 142821536.0000 - mae: 6483.9375
Test Loss: 111873080.0, Test MAE: 5068.43115234375


### Model implemented with early stopping

To implement early stopping, SGDRegressor is needed since LinearRegression does not have earlystopping built in.

In [61]:

model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(17,)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

history = model.fit(train_features, train_labels,
                    epochs = 100,
                    batch_size = 512,
                    validation_data = (test_features, test_labels),
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
                    verbose=1
)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 259ms/step - loss: 319325888.0000 - mae: 11526.1045 - val_loss: 362685856.0000 - val_mae: 11548.9053
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 324264832.0000 - mae: 11464.5752 - val_loss: 356862976.0000 - val_mae: 11352.4805
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 299230656.0000 - mae: 10947.0107 - val_loss: 351107136.0000 - val_mae: 11160.4072
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 292127200.0000 - mae: 10879.6055 - val_loss: 346814336.0000 - val_mae: 11008.4600
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 285295936.0000 - mae: 10556.9277 - val_loss: 343010816.0000 - val_mae: 10872.1914
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 294142208.0000 - mae: 10630.4346 - val_loss: 3392

In [62]:
loss, mae = model.evaluate(test_features, test_labels)
print(f"Test Loss: {loss}, Test MAE: {mae}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 195217072.0000 - mae: 7623.2573 
Test Loss: 152205232.0, Test MAE: 5943.041015625
