In [None]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that the np.random.choice draws in
# the cells below give the same rows on every run (when cells run in order).
np.random.seed(10)

# Reduce the number of rows by random sampling

To reduce the number of rows, we select random indices of the original data frame and delete the corresponding rows.

In [None]:
# Down-sample the full data set to 10000 rows by discarding a random subset.
df = pd.read_csv("bank-additional-full.csv", sep=";", na_values=".")

# Every row beyond the 10000 we want to keep has to be removed.
nRowsToRemove = df.shape[0] - 10000

# Pick the row labels to discard; replace=False so no label is drawn twice.
rowsToDropIndices = np.random.choice(df.index, size=nRowsToRemove, replace=False)

df_reduced = df.drop(index=rowsToDropIndices)

# NOTE: written with pandas defaults, i.e. ',' as separator and the row index
# as the first column (unlike the ';'-separated input file).
df_reduced.to_csv("bank-additional-reduced.csv")

# Study if the result is representative
After sampling the original data set, we need to check that the result is representative, i.e. that each column has a statistical structure similar to
that of the original data set.

## Description of all the features 
From these descriptions we can already see, for the numerical variables, whether the mean, std, min, max and other statistics have changed. If the values are very similar, we can accept the previous procedure for reducing the number of rows and proceed with the project.

In [None]:
# Summary statistics for every column (numeric and categorical alike) of the
# ORIGINAL data set; compare with the same summary of the reduced data set.
df.describe(include='all')

In [None]:
# Summary statistics for every column (numeric and categorical alike) of the
# REDUCED data set, to be compared with the summary of the original one.
df_reduced.describe(include='all')

To validate that the values of the categorical variables have the same representation in both the original and the reduced data sets, observing the previous description is not enough. We must also check that the percentage of each feature value has not changed much.

In [None]:
# Relative frequency of every 'job' category, shown first for the original
# data set and then for the reduced one so they can be compared by eye.
print("Job feature percentatges in the original data set")
print(df["job"].value_counts(normalize=True))
print("Job feature percentatges in the reduced data set")
print(df_reduced["job"].value_counts(normalize=True))

In [None]:
# Share of each 'job' category in the original data minus its share in the
# reduced data. Values near zero mean the category is equally represented;
# a category missing from one of the two sides shows up as NaN.
print("Difference between the percentage of each 'job' value in the original data set and the reduced data set")
print(
    df["job"].value_counts(normalize=True).sub(
        df_reduced["job"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'marital' category in the original data minus its share in
# the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'marital' value in the original data set and the reduced data set")
print(
    df["marital"].value_counts(normalize=True).sub(
        df_reduced["marital"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'education' category in the original data minus its share in
# the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'education' value in the original data set and the reduced data set")
print(
    df["education"].value_counts(normalize=True).sub(
        df_reduced["education"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'default' category in the original data minus its share in
# the reduced data. A category absent from one side is reported as NaN.
print("Difference between the percentage of each 'default' value in the original data set and the reduced data set")
print(
    df["default"].value_counts(normalize=True).sub(
        df_reduced["default"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'housing' category in the original data minus its share in
# the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'housing' value in the original data set and the reduced data set")
print(
    df["housing"].value_counts(normalize=True).sub(
        df_reduced["housing"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'loan' category in the original data minus its share in the
# reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'loan' value in the original data set and the reduced data set")
print(
    df["loan"].value_counts(normalize=True).sub(
        df_reduced["loan"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'contact' category in the original data minus its share in
# the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'contact' value in the original data set and the reduced data set")
print(
    df["contact"].value_counts(normalize=True).sub(
        df_reduced["contact"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'month' category in the original data minus its share in the
# reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'month' value in the original data set and the reduced data set")
print(
    df["month"].value_counts(normalize=True).sub(
        df_reduced["month"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'day_of_week' category in the original data minus its share
# in the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'day_of_week' value in the original data set and the reduced data set")
print(
    df["day_of_week"].value_counts(normalize=True).sub(
        df_reduced["day_of_week"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'poutcome' category in the original data minus its share in
# the reduced data (values near zero = same representation in both).
print("Difference between the percentage of each 'poutcome' value in the original data set and the reduced data set")
print(
    df["poutcome"].value_counts(normalize=True).sub(
        df_reduced["poutcome"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each target class 'y' in the original data minus its share in the
# reduced data (values near zero = class proportions were preserved).
print("Difference between the percentage of each 'y' value in the original data set and the reduced data set")
print(
    df["y"].value_counts(normalize=True).sub(
        df_reduced["y"].value_counts(normalize=True)
    )
)

We can conclude that the sample obtained is representative and we can use it for our project.

# Reduce the number of rows to balance the classes
The previous row reduction does not take into account whether the data set is balanced. As our data set is not balanced, we can reduce it by removing only rows of the class with the larger proportion.

In [None]:
# Proportion of each target class 'y' in the original data set.
df["y"].value_counts(normalize=True)

As can be observed, the rows with y=no represent 88.73% (36548 rows) of the total and the rows with y=yes represent only 11.27% (4640 rows). Our data set is very unbalanced, so, to solve this problem, we can remove rows that have y=no until 10k rows remain in total.

In [None]:
# Shrink the data set to 10000 rows while balancing the classes: only rows of
# the majority class (y == 'no') are candidates for removal, so every other
# row is kept. Removing more rows (ending with 9280 in total) would give
# exactly the same proportion of 'yes' and 'no'.
nRowsToRemove = df.shape[0] - 10000

# Draw the labels to discard among the y == 'no' rows, without replacement.
rowsToDropIndices = np.random.choice(
    df.loc[df["y"] == "no"].index, size=nRowsToRemove, replace=False
)

df_balanced = df.drop(index=rowsToDropIndices)

# NOTE: written with pandas defaults, i.e. ',' as separator and the row index
# as the first column.
df_balanced.to_csv("bank-additional-reduced-balanced.csv")

In [None]:
# Proportion of each target class 'y' after the class-balancing reduction.
df_balanced["y"].value_counts(normalize=True)

In [None]:
# Summary statistics for every column (numeric and categorical alike) of the
# ORIGINAL data set; compare with the same summary of the balanced data set.
df.describe(include='all')

In [None]:
# Summary statistics for every column (numeric and categorical alike) of the
# BALANCED data set, to be compared with the summary of the original one.
df_balanced.describe(include='all')

In [None]:
# Share of each 'job' category in the original data minus its share in the
# balanced data. Values near zero mean the category is equally represented;
# a category missing from one of the two sides shows up as NaN.
print("Difference between the percentage of each 'job' value in the original data set and the balanced data set")
print(
    df["job"].value_counts(normalize=True).sub(
        df_balanced["job"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'marital' category in the original data minus its share in
# the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'marital' value in the original data set and the balanced data set")
print(
    df["marital"].value_counts(normalize=True).sub(
        df_balanced["marital"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'education' category in the original data minus its share in
# the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'education' value in the original data set and the balanced data set")
print(
    df["education"].value_counts(normalize=True).sub(
        df_balanced["education"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'default' category in the original data minus its share in
# the balanced data. A category absent from one side is reported as NaN.
print("Difference between the percentage of each 'default' value in the original data set and the balanced data set")
print(
    df["default"].value_counts(normalize=True).sub(
        df_balanced["default"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'housing' category in the original data minus its share in
# the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'housing' value in the original data set and the balanced data set")
print(
    df["housing"].value_counts(normalize=True).sub(
        df_balanced["housing"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'loan' category in the original data minus its share in the
# balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'loan' value in the original data set and the balanced data set")
print(
    df["loan"].value_counts(normalize=True).sub(
        df_balanced["loan"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'contact' category in the original data minus its share in
# the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'contact' value in the original data set and the balanced data set")
print(
    df["contact"].value_counts(normalize=True).sub(
        df_balanced["contact"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'month' category in the original data minus its share in the
# balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'month' value in the original data set and the balanced data set")
print(
    df["month"].value_counts(normalize=True).sub(
        df_balanced["month"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'day_of_week' category in the original data minus its share
# in the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'day_of_week' value in the original data set and the balanced data set")
print(
    df["day_of_week"].value_counts(normalize=True).sub(
        df_balanced["day_of_week"].value_counts(normalize=True)
    )
)

In [None]:
# Share of each 'poutcome' category in the original data minus its share in
# the balanced data (values near zero = same representation in both).
print("Difference between the percentage of each 'poutcome' value in the original data set and the balanced data set")
print(
    df["poutcome"].value_counts(normalize=True).sub(
        df_balanced["poutcome"].value_counts(normalize=True)
    )
)

With this solution, statistics such as the means, std, frequencies and others have changed a bit more than they did with the previous way of reducing the number of rows. One problem is that for the feature "default" we lose the modality 'yes'. This modality had very low representation in the original data set: only 3 rows had default = 'yes'.
Despite that, all the other statistical changes are not that drastic.

Although we now have a balanced data set, we should consider other ways to balance the data set and pick the one that gives the best sample and best represents the original data set.
