## AudioBook Customer Analysis

* Predicting whether the customer will again buy another audiobook or not!!!!

It will help the company to focus more on the customers who are more likey to buy the audiobook next time.

In [55]:
import pandas as pd
import numpy as np

In [56]:
data  = pd.read_csv('original (1).csv',header=None)

In [57]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,994,1620.0,1620,19.73,19.73,1,10.0,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0


In [58]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,189.888983,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,371.08401,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,194.4,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2160.0,30.0,464.0,1.0


Here 1st col is ID of customer and last col is Target.

* 1: Customer will buy.

* 0: Not buy

In [59]:
unscaled_data = data.values   # Converting dataframe to np.arrays

In [60]:
unscaled_inputs_all = unscaled_data[:,1:-1]
targets_all = unscaled_data[:,-1]
unscaled_inputs_all.shape

(14084, 10)

## Shuffling data

In [61]:
shuffled_indicies = np.arange(unscaled_inputs_all.shape[0])

In [62]:
np.random.shuffle(shuffled_indicies)

In [63]:
unscaled_inputs_all = unscaled_inputs_all[shuffled_indicies]
unscaled_inputs_all.shape

(14084, 10)

In [64]:
targets_all = targets_all[shuffled_indicies]

In [65]:
sum(targets_all)

2237.0

## Balancing Data

Since out data is unbaalanced... lots of 0's. So we need to balance data.

In [66]:
target_ones = int(np.sum(targets_all))
target_ones

2237

In [67]:
indices_to_remove = []
zero_index_counter = 0

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_index_counter = zero_index_counter +1
        if zero_index_counter > target_ones:
            indices_to_remove.append(i)
zero_index_counter 

11847

In [68]:
len(indices_to_remove)

9610

In [69]:
unscaled_balanced_input = np.delete(unscaled_inputs_all,indices_to_remove,axis=0 )

In [70]:
target_balanced  = np.delete(targets_all, indices_to_remove, axis=0)

In [71]:
sum(target_balanced)

2237.0

## Again Shuffling the data

In [72]:
shuffled_indicies = np.arange(unscaled_balanced_input.shape[0])

In [73]:
np.random.shuffle(shuffled_indicies)

In [74]:
shuffled_balanced_input = unscaled_balanced_input[shuffled_indicies]

In [75]:
shuffled_balanced_target  = target_balanced[shuffled_indicies]

In [76]:
sum(shuffled_balanced_target)

2237.0

## Standardising Data

In [77]:
from sklearn.preprocessing import StandardScaler

In [78]:
scalar = StandardScaler()

In [79]:
scaled_input = scalar.fit_transform(shuffled_balanced_input)

## Splitting into train.test,validation set

* 80,10,10% split

In [82]:
sample_count = scaled_input.shape[0]

In [83]:
train_count = int(0.8*sample_count)
val_count = int(0.1*sample_count)
test_count = sample_count-train_count-val_count

In [84]:
X_train = scaled_input[ : train_count]

In [85]:
y_train = shuffled_balanced_target[ : train_count]

In [86]:
X_val = scaled_input[train_count : train_count+val_count]
y_val = shuffled_balanced_target[train_count : train_count+val_count]

In [87]:
X_test = scaled_input[train_count+val_count : ]
y_test = shuffled_balanced_target[train_count+val_count : ]

#### Making sure each set is balanced

In [88]:
print(np.sum(y_train), train_count , np.sum(y_train)/train_count)
print(np.sum(y_val), val_count , np.sum(y_val)/val_count)
print(np.sum(y_test), test_count , np.sum(y_test)/test_count)

1774.0 3579 0.4956691813355686
234.0 447 0.5234899328859061
229.0 448 0.5111607142857143


## Saving our preprocessed data in NPZ files

Just to directly load data and build our model.

In [89]:
np.savez('Audiobooks_data_train', inputs=X_train, targets=y_train)
np.savez('Audiobooks_data_validation', inputs=X_val, targets=y_val)
np.savez('Audiobooks_data_test', inputs=X_train, targets=y_test)

Now we can directly load these files using np.load() without performing the above preprocessing steps.

## Preparing data for model

In [90]:
X_train = X_train.astype(np.float)
X_train[7]

array([-0.74815472, -0.74104436, -0.12667601, -0.30233285, -0.44047915,
       -0.01321701, -0.37202696, -0.36570342, -0.18463416, -0.75478296])

In [91]:
y_train = y_train.astype(np.int)
sum(y_train)

1774

In [92]:
X_val =X_val.astype(np.float)

In [93]:
y_val =y_val.astype(np.int)

In [94]:
X_test = X_test.astype(np.float)

In [95]:
y_test =y_test.astype(np.int)

## Model training

In [96]:
import tensorflow as tf

In [105]:
model = tf.keras.Sequential([tf.keras.layers.Dense(50,activation='relu'),
                            tf.keras.layers.Dense(50,activation='relu'),
                            tf.keras.layers.Dense(1,activation='sigmoid')])                            

In [106]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [107]:
earlyStopping = tf.keras.callbacks.EarlyStopping(patience=2)  # It will ckeck previous 2 val_loss and if they are consucutively increasing, it will stop the training to prevent overfitting

In [108]:
model.fit(X_train,y_train,
          epochs=100,
          batch_size=100,
          validation_data=(X_val,y_val),
          callbacks=[earlyStopping])

Train on 3579 samples, validate on 447 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


<tensorflow.python.keras.callbacks.History at 0x7f77af6f8410>