<a href="https://colab.research.google.com/github/kirat89/Google_colab/blob/main/Audiobook_train_test_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## import libraries

In [1]:
import numpy as np
import pandas as pd

## loading dataset

In [2]:
raw_data=pd.read_csv('Audiobooks_data.csv')
raw_inputs=raw_data.iloc[:,1:-1].values
targets=raw_data.iloc[:,-1].values

## shuffle

In [3]:
## since data maybe date biased so we try to randomise the data
indices=np.arange(raw_inputs.shape[0])
np.random.shuffle(indices)

shuffled_inputs=raw_inputs[indices]
shuffled_targets=targets[indices]

## balancing

In [4]:
#the customer not purchasing(0) are much more than customer purchasing(1)
#so we balance the dataset to keep the learning effecient

#getting the no of purchases(1)
num_one_target= int(np.sum(targets))
#setting a zero counter
zero_counter=0
#after balance the extra data to be removed will be removed using this list
indices_to_remove=[]

for i in range(shuffled_targets.shape[0]):
  if shuffled_targets[i]==0:
    zero_counter+=1
    if zero_counter>num_one_target:
      indices_to_remove.append(i)

balanced_inputs=np.delete(shuffled_inputs,indices_to_remove,axis=0)
balanced_targets=np.delete(shuffled_targets,indices_to_remove,axis=0)     

In [5]:
#Balancing yet again before train test validation split
indices=np.arange(balanced_inputs.shape[0])
np.random.shuffle(indices)

balanced_inputs=balanced_inputs[indices]
balanced_targets=balanced_targets[indices]

## train_test_validation_split

In [6]:
samples_count=(balanced_inputs.shape[0])
train_count= int(.8*samples_count)
validation_count= int(.1*samples_count)
test_count=samples_count-(train_count+validation_count)

train_inputs=balanced_inputs[:train_count]
train_targets=balanced_targets[:train_count]

validation_inputs=balanced_inputs[train_count:train_count+validation_count]
validation_targets=balanced_targets[train_count:train_count+validation_count]

test_inputs=balanced_inputs[train_count+validation_count:]
test_targets=balanced_targets[train_count+validation_count:]

In [7]:
print(np.sum(train_targets),train_count,np.sum(train_targets)/train_count)
print(np.sum(validation_targets),validation_count,np.sum(validation_targets)/validation_count)
print(np.sum(test_targets),test_count,np.sum(test_targets)/test_count)

1793 3579 0.5009779267951942
222 447 0.4966442953020134
222 448 0.4955357142857143


## Standarizing the data

In [8]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
train_inputs=ss.fit_transform(train_inputs)
test_inputs=ss.transform(test_inputs)
validation_inputs=ss.transform(validation_inputs)

## deep learning model

In [9]:
import tensorflow as tf
ann=tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=50,activation='relu'))
ann.add(tf.keras.layers.Dense(units=50,activation='relu'))
ann.add(tf.keras.layers.Dense(units=1,activation='sigmoid'))


In [10]:
ann.compile(optimizer='adam' ,loss='binary_crossentropy' ,metrics=['accuracy'])

In [11]:
ann.fit(train_inputs,train_targets,batch_size=32,epochs=100, validation_data=(validation_inputs,validation_targets),verbose=2)

Epoch 1/100
112/112 - 1s - loss: 0.5324 - accuracy: 0.7234 - val_loss: 0.4276 - val_accuracy: 0.7919
Epoch 2/100
112/112 - 0s - loss: 0.4159 - accuracy: 0.7840 - val_loss: 0.3843 - val_accuracy: 0.7987
Epoch 3/100
112/112 - 0s - loss: 0.3844 - accuracy: 0.8011 - val_loss: 0.3681 - val_accuracy: 0.8098
Epoch 4/100
112/112 - 0s - loss: 0.3726 - accuracy: 0.8022 - val_loss: 0.3614 - val_accuracy: 0.8233
Epoch 5/100
112/112 - 0s - loss: 0.3648 - accuracy: 0.8027 - val_loss: 0.3645 - val_accuracy: 0.8143
Epoch 6/100
112/112 - 0s - loss: 0.3581 - accuracy: 0.8069 - val_loss: 0.3513 - val_accuracy: 0.8277
Epoch 7/100
112/112 - 0s - loss: 0.3586 - accuracy: 0.8080 - val_loss: 0.3538 - val_accuracy: 0.8166
Epoch 8/100
112/112 - 0s - loss: 0.3506 - accuracy: 0.8108 - val_loss: 0.3485 - val_accuracy: 0.8188
Epoch 9/100
112/112 - 0s - loss: 0.3474 - accuracy: 0.8167 - val_loss: 0.3520 - val_accuracy: 0.8277
Epoch 10/100
112/112 - 0s - loss: 0.3462 - accuracy: 0.8108 - val_loss: 0.3442 - val_accura

<keras.callbacks.History at 0x7fd397dd2c10>

In [12]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

pred=ann.predict(test_inputs)

pred

array([[6.68531299e-01],
       [1.00000000e+00],
       [8.17738771e-01],
       [3.40458333e-01],
       [8.31088364e-01],
       [3.69836271e-01],
       [9.99232292e-01],
       [9.13787425e-01],
       [1.00000000e+00],
       [9.99984264e-01],
       [3.92430604e-01],
       [1.00000000e+00],
       [1.33113758e-37],
       [5.74370003e-31],
       [5.33663988e-01],
       [0.00000000e+00],
       [8.51418614e-01],
       [8.63279343e-01],
       [4.46160284e-19],
       [2.62666017e-12],
       [3.94846916e-01],
       [9.99956727e-01],
       [8.51631165e-04],
       [4.02151704e-01],
       [1.93714985e-13],
       [3.01810414e-01],
       [4.25433338e-01],
       [4.07345235e-01],
       [3.42351794e-01],
       [2.35302001e-26],
       [3.85770828e-01],
       [5.00937089e-20],
       [3.91900718e-01],
       [3.50189060e-01],
       [3.74423623e-01],
       [3.82849842e-01],
       [3.31799567e-01],
       [4.67141340e-38],
       [9.98606205e-01],
       [7.00920820e-04],


In [14]:
pred=pred>.5
cm=confusion_matrix(test_targets,pred)
print(cm)

[[203  23]
 [ 56 166]]


In [17]:
score=accuracy_score(test_targets,pred)
print(score)

0.8236607142857143
