In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold

**Paso 1: Planteamiento del Problema y Recolección de Datos**

In [2]:
# Location of the raw CSV; the file uses ';' as its field separator.
url = (
    'https://raw.githubusercontent.com/4GeeksAcademy/'
    'logistic-regression-project-tutorial/main/'
    'bank-marketing-campaign-data.csv'
)
df = pd.read_csv(url, sep=';')
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


**Paso 2: Exploración y Limpieza de Datos**

In [3]:
# Dimensions: df.shape is (rows, columns), so unpack both values at once.
n_rows, n_columns = df.shape
print(f'La tabla contiene {n_rows} filas y {n_columns} columnas')

La tabla contiene 41188 filas y 21 columnas


In [4]:
# Columns, data types and null counts.
# enumerate(..., start=1) supplies the 1-based position, so no counter has to
# be initialised and incremented by hand.
for i, col in enumerate(df.columns, start=1):
    print(f'{i}- {col}: {df[col].dtype}, {df[col].isna().sum()} nulos')

1- age: int64, 0 nulos
2- job: object, 0 nulos
3- marital: object, 0 nulos
4- education: object, 0 nulos
5- default: object, 0 nulos
6- housing: object, 0 nulos
7- loan: object, 0 nulos
8- contact: object, 0 nulos
9- month: object, 0 nulos
10- day_of_week: object, 0 nulos
11- duration: int64, 0 nulos
12- campaign: int64, 0 nulos
13- pdays: int64, 0 nulos
14- previous: int64, 0 nulos
15- poutcome: object, 0 nulos
16- emp.var.rate: float64, 0 nulos
17- cons.price.idx: float64, 0 nulos
18- cons.conf.idx: float64, 0 nulos
19- euribor3m: float64, 0 nulos
20- nr.employed: float64, 0 nulos
21- y: object, 0 nulos


**2.1 Eliminar Duplicados**

In [6]:
# Flag the rows that pandas reports as duplicated (default settings), then
# display only those rows. Nothing is removed from df here.
duplicate_mask = df.duplicated()
df[duplicate_mask]

Unnamed: 0,"age;""job"";""marital"";""education"";""default"";""housing"";""loan"";""contact"";""month"";""day_of_week"";""duration"";""campaign"";""pdays"";""previous"";""poutcome"";""emp.var.rate"";""cons.price.idx"";""cons.conf.idx"";""euribor3m"";""nr.employed"";""y"""
1266,"39;""blue-collar"";""married"";""basic.6y"";""no"";""no..."
12261,"36;""retired"";""married"";""unknown"";""no"";""no"";""no..."
14234,"27;""technician"";""single"";""professional.course""..."
16956,"47;""technician"";""divorced"";""high.school"";""no"";..."
18465,"32;""technician"";""single"";""professional.course""..."
20216,"55;""services"";""married"";""high.school"";""unknown..."
20534,"41;""technician"";""married"";""professional.course..."
25217,"39;""admin."";""married"";""university.degree"";""no""..."
28477,"24;""services"";""single"";""high.school"";""no"";""yes..."
32516,"35;""admin."";""married"";""university.degree"";""no""..."


**2.2 Eliminar Información Irrelevante**

In [None]:
# TODO: drop the columns that are irrelevant for the analysis (not implemented yet)

**Paso 3: Análisis de Variables Univariadas**

**3.1 Análisis de Variables Categóricas**

In [11]:
# Collect the names of the columns whose dtype is 'object' or 'category'.
list_categorical = [
    name
    for name in df.columns
    if df[name].dtype == 'object' or df[name].dtype == 'category'
]

# Display the list together with the number of columns it holds.
list_categorical, len(list_categorical)

(['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'day_of_week',
  'poutcome',
  'y'],
 11)

In [None]:
# Univariate distribution of the categorical variables covered by this cell.
# Only histograms are drawn: a box plot summarises a numeric variable and is
# not meaningful for string-valued columns such as these.
# The remaining entries of list_categorical are not plotted here; extend this
# list to include them.
categorical_to_plot = ["job", "marital", "education", "default", "housing", "loan"]

# One panel per variable, laid out on a 2 x 3 grid.
fig, axis = plt.subplots(2, 3, figsize = (15, 10))

# Pair each panel with exactly one variable so no panel is drawn twice.
for ax, column_name in zip(axis.flatten(), categorical_to_plot):
    sns.histplot(ax = ax, data = df, x = column_name)
    # Category names can be long; rotate them so they do not overlap.
    ax.tick_params(axis = "x", labelrotation = 90)

plt.tight_layout()
plt.show()
