In [2]:
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

## Data

In [3]:
# Loading the data
raw_csv_data = np.loadtxt("Audiobooks_data.csv", delimiter = ",")
unscaled_inputs_all = raw_csv_data[:, 1:-1]
targets_all = raw_csv_data[:,-1:]
targets_all

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

### Balancing the dataset

In [4]:
num_of_one_targets = targets_all.sum()
zero_targets_counter = 0
balanced_data = []
for i in raw_csv_data:
    if i[-1] == 1:
        balanced_data.append(i)
        #print(i)
        
    else:
        if zero_targets_counter != num_of_one_targets:
            balanced_data.append(i)
            zero_targets_counter += 1
balanced_data = np.array(balanced_data)
unscaled_inputs_with_equal_priors = balanced_data[:, 1:-1]
targets_with_equal_priors = balanced_data[:,-1:]
unscaled_inputs_with_equal_priors

array([[1620.  , 1620.  ,   19.73, ..., 1603.8 ,    5.  ,   92.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,    0.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,  388.  ],
       ...,
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,    6.  ],
       [1674.  , 3348.  ,    7.99, ...,    0.  ,    0.  ,    0.  ],
       [1674.  , 3348.  ,    5.33, ...,    0.  ,    0.  ,    0.  ]])

## Standardize the inputs

In [5]:
scaled_inputs = preprocessing.scale(unscaled_inputs_with_equal_priors)
scaled_inputs

array([[ 0.21053387, -0.18888517,  1.97823887, ...,  4.80955413,
        11.83828419,  0.09415043],
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481,  2.979214  ],
       ...,
       [ 1.27894497,  0.41646744, -0.39082475, ..., -0.41569922,
        -0.20183481, -0.7440775 ],
       [ 0.31737498,  1.7482432 ,  0.04679395, ..., -0.41569922,
        -0.20183481, -0.80255852],
       [ 0.31737498,  1.7482432 , -0.39082475, ..., -0.41569922,
        -0.20183481, -0.80255852]])

## Shuffling the data

In [6]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_with_equal_priors[shuffled_indices]
len(shuffled_targets)

4474

## Split the dataset into train, validation and test

In [7]:
train_samples_num = int(0.8 *shuffled_inputs.shape[0])
print(train_samples_num)
validation_samples_num = int(0.1 *shuffled_inputs.shape[0])
print(validation_samples_num)

train_data = shuffled_inputs[:train_samples_num]
train_targets = shuffled_targets[:train_samples_num]
print(len(train_data))

validation_data = shuffled_inputs[train_samples_num:train_samples_num + validation_samples_num]
validation_targets = shuffled_targets[train_samples_num:train_samples_num + validation_samples_num]

print(len(validation_targets))

test_data = shuffled_inputs[train_samples_num + validation_samples_num:]
test_targets = shuffled_targets[train_samples_num + validation_samples_num:]
print(len(test_targets))

3579
447
3579
447
448


## Audiobooks save data

In [14]:
np.savez("Audiobook_train_data", inputs = train_data, targets = train_targets)
np.savez("Audiobook_validation_data", inputs = validation_data, targets = validation_targets)
np.savez("Audiobook_test_data", inputs = test_data, targets = test_targets)

In [15]:
train_targets.sum()

1780.0

In [9]:
model = LogisticRegression()
model.fit(train_data, train_targets)

  y = column_or_1d(y, warn=True)


In [28]:
model.predict(test_data)


array([0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
       1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1.,
       0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 1., 1., 1.

In [29]:
model.score(test_data, test_targets)

0.7924107142857143

In [24]:
a = []
for i in train_targets:
    a.append(i[0])
a = np.array(a)