In [1]:
import pandas as pd
import numpy as np
# Fix NumPy's global random seed so the random row selections made with
# np.random.choice further down are reproducible from a fresh kernel.
np.random.seed(10)

# Reduce the number of rows

Since our data set is too big for the scope of our project, we need to reduce the number of rows in both the training data set and the test data set. However, we are not going to use the same method for the two: in the training data set we want a balanced data frame, whereas in the test data set we want to maintain the proportion of the feature to be predicted.

## Reducing the test data set: Random sampling

In order to reduce the test data set, we select random indices of the original test data frame and drop the corresponding rows.

In [209]:
# Down-sample the test set to 1/3 of the 10000-row budget (training gets 2/3,
# test gets 1/3). Rows are removed uniformly at random, without looking at `y`.
TEST_TARGET_ROWS = 3333

df = pd.read_csv(r'../Datasets/testing-dataset.csv')
# Keep only the real feature columns; any "Unnamed..." column is dropped
# (presumably an index that was saved to the CSV by an earlier step).
df = df[df.filter(regex='^(?!Unnamed)').columns]

nRowsToRemove = len(df) - TEST_TARGET_ROWS
if nRowsToRemove < 0:
    raise ValueError(
        f"Test data set has only {len(df)} rows; cannot reduce it to {TEST_TARGET_ROWS}."
    )

# Pick the row labels to discard; the remaining rows form the reduced sample.
rowsToDropIndices = np.random.choice(df.index, nRowsToRemove, replace=False)
df_reduced = df.drop(rowsToDropIndices)
assert len(df_reduced) == TEST_TARGET_ROWS

# index=False: do not write the row index, otherwise it comes back as an
# "Unnamed: 0" column the next time the file is read with pd.read_csv.
df_reduced.to_csv(r'../Datasets/testing-dataset-reduced.csv', index=False)

# Study if the result is representative
After sampling the original test data set we need to check that the result is representative, i.e. that each column has a statistical structure similar to that of
the original dataset.

## Description of all the features 
With these descriptions we can already observe for the numerical variables if the mean, std, min and max values or others have changed or not. If the values are very similar, we can accept the previous algorithm to reduce the number of rows and proceed with the project.

In [210]:
# Summary statistics of every feature in the original test data set (`df`),
# to be compared with the row-reduced version in the next cell.
# Show all columns instead of letting pandas truncate the wide table.
pd.options.display.max_columns = None
df.describe(include="all")

Unnamed: 0,age,campaign,cons.conf.idx,cons.price.idx,contact,day_of_week,default_unknown,default_yes,duration,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,emp.var.rate,euribor3m,housing,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,loan,marital_married,marital_single,marital_unknown,month,nr.employed,pdays,poutcome_nonexistent,poutcome_success,previous,y
count,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0,13593.0
mean,0.284115,0.028533,0.431984,0.537079,0.367469,0.49165,0.21055,0.0,0.608773,0.052674,0.150372,0.235195,0.000294,0.121754,0.29655,0.04208,0.727312,0.679314,0.531781,0.223277,0.034871,0.026852,0.06908,0.044214,0.034356,0.098433,0.023689,0.162804,0.02413,0.006915,0.160119,0.604134,0.285294,0.001913,0.400549,0.770192,0.932907,0.86574,0.03362,0.024424,0.113588
std,0.12864,0.049187,0.193422,0.225537,0.482133,0.351563,0.407714,0.0,0.108204,0.22339,0.357448,0.424136,0.017152,0.327013,0.456753,0.20078,0.326518,0.392728,0.493615,0.416458,0.183459,0.161657,0.253599,0.205578,0.182148,0.29791,0.152083,0.369201,0.153459,0.082873,0.359358,0.489054,0.451571,0.043695,0.225679,0.272984,0.198403,0.340944,0.180256,0.07112,0.317322
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.0,0.338912,0.340608,0.0,0.25,0.0,0.0,0.545207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.160961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.512287,1.0,1.0,0.0,0.0,0.0
50%,0.259259,0.018182,0.376569,0.603274,0.0,0.5,0.0,0.0,0.611526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9375,0.957379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.859735,1.0,1.0,0.0,0.0,0.0
75%,0.37037,0.036364,0.60251,0.698753,1.0,0.75,0.0,0.0,0.680381,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.980957,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.555556,1.0,1.0,1.0,0.0,0.0,0.0
max,0.925926,0.763636,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [211]:
# Same per-feature summary, now for the row-reduced test data set.
df_reduced.describe(include="all")

Unnamed: 0,age,campaign,cons.conf.idx,cons.price.idx,contact,day_of_week,default_unknown,default_yes,duration,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,emp.var.rate,euribor3m,housing,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,loan,marital_married,marital_single,marital_unknown,month,nr.employed,pdays,poutcome_nonexistent,poutcome_success,previous,y
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,0.281899,0.028399,0.433044,0.538135,0.377138,0.488899,0.209721,0.0,0.608181,0.049505,0.144014,0.249025,0.0,0.113111,0.29523,0.045905,0.730267,0.683572,0.537504,0.215122,0.035404,0.030003,0.062706,0.041404,0.034203,0.106811,0.021602,0.149415,0.028503,0.007201,0.159166,0.60336,0.279328,0.0021,0.398106,0.77266,0.936394,0.871887,0.031503,0.022802,0.105611
std,0.126132,0.049758,0.193121,0.224948,0.484743,0.350875,0.407171,0.0,0.109015,0.216952,0.351157,0.432513,0.0,0.316776,0.456214,0.20931,0.323689,0.390081,0.493904,0.410968,0.184825,0.170621,0.24247,0.199253,0.181779,0.308919,0.145402,0.35655,0.166429,0.084564,0.359368,0.489273,0.448736,0.045787,0.226453,0.268699,0.19314,0.334266,0.174699,0.067463,0.307385
min,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.0,0.338912,0.340608,0.0,0.25,0.0,0.0,0.544059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.163228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.512287,1.0,1.0,0.0,0.0,0.0
50%,0.259259,0.018182,0.376569,0.652767,0.0,0.5,0.0,0.0,0.612819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9375,0.957379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.859735,1.0,1.0,0.0,0.0,0.0
75%,0.37037,0.036364,0.60251,0.698753,1.0,0.75,0.0,0.0,0.678924,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.980957,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.555556,1.0,1.0,1.0,0.0,0.0,0.0
max,0.839506,0.563636,1.0,1.0,1.0,1.0,1.0,0.0,0.944811,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,1.0


In [212]:
# Share of each target class, first in the original test set and then in the
# reduced one, so the two distributions can be compared directly.
print("Percentage of each 'y' class in the original test data set and the reduced data set")
for frame in (df, df_reduced):
    class_counts = frame.y.value_counts(normalize=False)
    print(class_counts / len(frame))

Percentage of each 'y' class in the original test data set and the reduced data set
0.0    0.886412
1.0    0.113588
Name: y, dtype: float64
0.0    0.894389
1.0    0.105611
Name: y, dtype: float64


We can conclude that the sample obtained is representative and we can use it for our project.

# Reduce the number of rows to balance the classes
The previous reduction of rows does not take into account whether the data set is balanced. In the training data set the feature to predict should be balanced, and our data set is not, so we reduce the training data set by removing only rows of the majority class. This way we reduce the training data set and balance it at the same time.

In [213]:
# Class proportions of the target before balancing.
# NOTE: `df` still holds the test data set here; the training data set is only
# loaded in the next code cell.
df["y"].value_counts(normalize=True)

0.0    0.886412
1.0    0.113588
Name: y, dtype: float64

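In the complete original data set (training and test rows together, 41188 rows), the rows with y=no represent 88.73% (36548 rows) of the total and the rows with y=yes represent only 11.27% (4640 rows). The test data set shown above has almost the same proportions (88.64% and 11.36%).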

In [214]:
# Down-sample the training set to 2/3 of the 10000-row budget, removing rows
# only from the majority class (y == 0). This shrinks the data set and moves
# the classes towards balance at the same time; how close to 50/50 it gets
# depends on how many y == 1 rows there are relative to the fixed target size.
TRAIN_TARGET_ROWS = 6667

# NOTE: from here on `df` refers to the training data set (it held the test
# data set in the cells above).
df = pd.read_csv(r'../Datasets/training-dataset.csv')
# Keep only the real feature columns; any "Unnamed..." column is dropped
# (presumably an index that was saved to the CSV by an earlier step).
df = df[df.filter(regex='^(?!Unnamed)').columns]

nRowsToRemove = len(df) - TRAIN_TARGET_ROWS
majorityIndices = df[df.y == 0.0].index
if not 0 <= nRowsToRemove <= len(majorityIndices):
    raise ValueError(
        f"Cannot remove {nRowsToRemove} rows from the {len(majorityIndices)} "
        f"majority-class rows to reach {TRAIN_TARGET_ROWS} training rows."
    )

# Pick which majority-class rows to discard; every y == 1 row is kept.
rowsToDropIndices = np.random.choice(majorityIndices, nRowsToRemove, replace=False)
df_balanced = df.drop(rowsToDropIndices)
assert len(df_balanced) == TRAIN_TARGET_ROWS

# index=False: do not write the row index, otherwise it comes back as an
# "Unnamed: 0" column the next time the file is read with pd.read_csv.
df_balanced.to_csv(r'../Datasets/training-dataset-reduced.csv', index=False)

In [215]:
# Class proportions of the target in the reduced training data set.
df_balanced["y"].value_counts(normalize=True)

0.0    0.535623
1.0    0.464377
Name: y, dtype: float64

In [216]:
# Summary statistics of every feature in the original training data set
# (`df` was reloaded with the training data in the reduction cell).
df.describe(include="all")

Unnamed: 0,age,campaign,cons.conf.idx,cons.price.idx,contact,day_of_week,default_unknown,default_yes,duration,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,emp.var.rate,euribor3m,housing,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,loan,marital_married,marital_single,marital_unknown,month,nr.employed,pdays,poutcome_nonexistent,poutcome_success,previous,y
count,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0,27595.0
mean,0.284313,0.028486,0.430297,0.535054,0.36416,0.496494,0.207828,0.000109,0.608412,0.057112,0.14499,0.228955,0.000507,0.130024,0.294872,0.042,0.724447,0.676213,0.537869,0.225367,0.035586,0.025186,0.071933,0.040551,0.034571,0.095343,0.02004,0.16416,0.02486,0.008552,0.165483,0.605762,0.278674,0.001957,0.401039,0.768613,0.932584,0.862294,0.033194,0.024849,0.112194
std,0.128668,0.050934,0.193761,0.225603,0.481203,0.35352,0.40576,0.010426,0.107698,0.23206,0.352097,0.420168,0.022519,0.336335,0.455994,0.200594,0.327661,0.39345,0.492181,0.417831,0.185259,0.156692,0.258382,0.197251,0.182695,0.293694,0.140139,0.370428,0.1557,0.092084,0.363002,0.488695,0.448355,0.044194,0.227321,0.273254,0.196203,0.344598,0.179147,0.070494,0.315611
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.185185,0.0,0.338912,0.340608,0.0,0.25,0.0,0.0,0.545207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.160961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.512287,1.0,1.0,0.0,0.0,0.0
50%,0.259259,0.018182,0.376569,0.603274,0.0,0.5,0.0,0.0,0.610874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9375,0.957379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.859735,1.0,1.0,0.0,0.0,0.0
75%,0.37037,0.036364,0.60251,0.698753,1.0,0.75,0.0,0.0,0.67782,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.980957,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.555556,1.0,1.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.981411,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,1.0


In [217]:
# Summary statistics of every feature in the reduced, class-balanced training
# data set, for comparison with the original training data set.
df_balanced.describe(include="all")

Unnamed: 0,age,campaign,cons.conf.idx,cons.price.idx,contact,day_of_week,default_unknown,default_yes,duration,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,emp.var.rate,euribor3m,housing,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,loan,marital_married,marital_single,marital_unknown,month,nr.employed,pdays,poutcome_nonexistent,poutcome_success,previous,y
count,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0,6667.0
mean,0.288199,0.025379,0.442242,0.503519,0.293085,0.496325,0.154642,0.00015,0.64918,0.049048,0.126294,0.234738,0.0009,0.129294,0.317384,0.045598,0.617182,0.543304,0.543723,0.19379,0.029999,0.023849,0.068547,0.060597,0.035698,0.083396,0.033598,0.160792,0.028799,0.00885,0.166642,0.569522,0.314684,0.0024,0.413429,0.661377,0.864857,0.783261,0.097345,0.043562,0.464377
std,0.147149,0.046374,0.220976,0.247241,0.455211,0.348862,0.36159,0.012247,0.113982,0.215984,0.332205,0.423867,0.029988,0.33555,0.465493,0.208627,0.358997,0.427864,0.491604,0.395296,0.170596,0.152589,0.2527,0.238607,0.185551,0.2765,0.180206,0.367367,0.167253,0.093662,0.363926,0.49518,0.464425,0.048933,0.243678,0.327802,0.295343,0.412054,0.296449,0.097154,0.498767
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.17284,0.0,0.338912,0.26968,0.0,0.25,0.0,0.0,0.578763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.139651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.425709,1.0,1.0,0.0,0.0,0.0
50%,0.259259,0.018182,0.376569,0.484412,0.0,0.5,0.0,0.0,0.650921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6875,0.780322,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,0.859735,1.0,1.0,0.0,0.0,0.0
75%,0.382716,0.036364,0.60251,0.698753,1.0,0.75,0.0,0.0,0.731994,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.980503,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.555556,1.0,1.0,1.0,0.0,0.0,1.0
max,1.0,0.745455,1.0,1.0,1.0,1.0,1.0,1.0,0.981411,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,1.0


With this approach, statistics such as the means, standard deviations and frequencies have changed somewhat more than they did in the reduction of the test data set (the mean of `y` in particular changes by design, since we balanced the classes).
Despite that, the statistical changes are small and not drastic, so we can be confident that this new reduced training data set is representative of the original.

