# Explore here

In [53]:
# Your code here

import warnings
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [54]:
# Location of the raw bank marketing campaign data (semicolon-delimited CSV).
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_URL, sep = ";")
df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [55]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()


age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [56]:
# Boolean mask flagging every row that exactly repeats another row of df.
duplicate_mask = df.duplicated()

# Keep only the flagged rows so they can be inspected before being dropped.
duplicate_rows = df.loc[duplicate_mask]
print(duplicate_rows)


       age          job   marital            education  default housing loan  \
1266    39  blue-collar   married             basic.6y       no      no   no   
12261   36      retired   married              unknown       no      no   no   
14234   27   technician    single  professional.course       no      no   no   
16956   47   technician  divorced          high.school       no     yes   no   
18465   32   technician    single  professional.course       no     yes   no   
20216   55     services   married          high.school  unknown      no   no   
20534   41   technician   married  professional.course       no     yes   no   
25217   39       admin.   married    university.degree       no      no   no   
28477   24     services    single          high.school       no     yes   no   
32516   35       admin.   married    university.degree       no     yes   no   
36951   45       admin.   married    university.degree       no      no   no   
38281   71      retired    single    uni

In [57]:
# Remove the exact duplicate rows, then renumber the index from 0 so it has no gaps.
deduplicated = df.drop_duplicates()
new_df = deduplicated.reset_index(drop = True)
new_df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [58]:

# Text columns to encode; the target "y" comes last so "y_n" is the last encoded column.
categorical_columns = ["job", "marital", "education", "default", "housing", "loan",
                       "contact", "month", "day_of_week", "poutcome", "y"]

# Columns that are already numeric in the raw data.
numeric_columns = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
                   "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

# Add a "<column>_n" column holding the integer code of each category
# (element [0] of pd.factorize's result is the array of codes).
for column in categorical_columns:
    new_df[f"{column}_n"] = pd.factorize(new_df[column])[0]

# Model columns: encoded predictors, then numeric predictors, then the encoded target.
num_variables = [f"{column}_n" for column in categorical_columns[:-1]] + numeric_columns + ["y_n"]

# Rescale every model column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split done
# further down, so the test rows influence the scaling (data leakage). The target y_n is
# scaled as well. Consider fitting the scaler on the training predictors only.
scaler = MinMaxScaler().fit(new_df[num_variables])
scal_features = scaler.transform(new_df[num_variables])
new_df_scal = pd.DataFrame(scal_features, index = new_df.index, columns = num_variables)
new_df_scal.head()


Unnamed: 0,job_n,marital_n,education_n,default_n,housing_n,loan_n,contact_n,month_n,day_of_week_n,poutcome_n,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y_n
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.090909,0.0,0.142857,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.090909,0.0,0.142857,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.181818,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.090909,0.0,0.142857,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [59]:
# Separate the predictors from the target column.
X = new_df_scal.drop(columns = "y_n")
y = new_df_scal["y_n"]

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 234)

# Score the predictors with chi-squared on the training rows and keep the best 5.
selection_model = SelectKBest(score_func = chi2, k = 5).fit(X_train, y_train)
ix = selection_model.get_support()
selected_columns = X_train.columns.values[ix]

# transform() returns plain arrays, so re-attach the names of the selected columns.
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = selected_columns)
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = selected_columns)

X_train_sel.head()



Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed
0,0.5,0.142857,0.6875,0.797778,0.877883
1,1.0,0.285714,0.0,0.018363,0.203781
2,0.0,0.0,1.0,0.98141,1.0
3,0.0,0.0,0.333333,0.000453,0.17051
4,0.0,0.0,1.0,0.98073,1.0


In [60]:
# Preview the first five rows of the selected test features.
X_test_sel.head(n = 5)


Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed
0,0.0,0.0,0.9375,0.957379,0.859735
1,0.0,0.0,0.9375,0.957379,0.859735
2,0.0,0.0,0.9375,0.957379,0.859735
3,0.0,0.0,1.0,0.981864,1.0
4,0.0,0.0,0.9375,0.957379,0.859735


In [61]:
# Folder for the processed datasets. Create it if it is missing: to_csv does not create
# parent directories and would fail on a fresh checkout without this step.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents = True, exist_ok = True)

# Append the target to each feature table. list() drops the original row labels so the
# values are assigned by position (the feature tables were rebuilt with a fresh index).
X_train_sel["y_n"] = list(y_train)
X_test_sel["y_n"] = list(y_test)

# Persist both splits without writing the index as a column.
X_train_sel.to_csv(processed_dir / "clean_train.csv", index = False)
X_test_sel.to_csv(processed_dir / "clean_test.csv", index = False)


In [62]:
# Reload the processed splits from disk so modelling starts from the saved files.
train_path = "../data/processed/clean_train.csv"
test_path = "../data/processed/clean_test.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

train_data.head()


Unnamed: 0,poutcome_n,previous,emp.var.rate,euribor3m,nr.employed,y_n
0,0.5,0.142857,0.6875,0.797778,0.877883,0.0
1,1.0,0.285714,0.0,0.018363,0.203781,1.0
2,0.0,0.0,1.0,0.98141,1.0,0.0
3,0.0,0.0,0.333333,0.000453,0.17051,0.0
4,0.0,0.0,1.0,0.98073,1.0,0.0


In [63]:
# Split each reloaded table into predictors (every column except y_n) and the target y_n.
X_train, y_train = train_data.drop(columns = "y_n"), train_data["y_n"]
X_test, y_test = test_data.drop(columns = "y_n"), test_data["y_n"]


In [64]:
# Baseline: logistic regression with default hyperparameters, fitted on the training split.
# fit() returns the estimator itself, so the construction and the fit can be chained.
model = LogisticRegression().fit(X_train, y_train)
model


In [65]:
# Predicted class for every row of the test split, using the baseline model.
y_pred = model.predict(X = X_test)
y_pred


array([0., 0., 0., ..., 0., 0., 0.])

In [66]:
# Share of test rows whose predicted class matches the true class.
accuracy_score(y_true = y_test, y_pred = y_pred)


0.8916949975716367

In [67]:
# Regularisation values to try for every penalised model.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# One sub-grid per penalty, listing only the solvers that support that penalty.
# A single dict crossing every penalty with every solver contains many combinations
# that cannot be fitted (e.g. "l1" with "lbfgs", "elasticnet" without l1_ratio,
# no penalty with "liblinear"); those fits fail and only add noise to the search.
hyperparams = [
    {
        "penalty": ["l2"],
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
        "C": c_values,
    },
    {
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
        "C": c_values,
    },
    {
        # Elastic net is only available with "saga" and needs an explicit l1_ratio.
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "C": c_values,
        "l1_ratio": [0.25, 0.5, 0.75],
    },
    {
        # C is not varied here: without a penalty there is nothing for it to regularise.
        "penalty": [None],
        "solver": ["newton-cg", "lbfgs", "sag", "saga"],
    },
]

# 10-fold cross-validated search over the valid combinations, ranked by accuracy.
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)
grid


In [69]:
# Run the grid search with warnings silenced only while it is fitting.
# catch_warnings() restores the previous warning filters when the block exits, so warnings
# raised by later cells are still shown (replacing warnings.warn itself would hide them
# for the rest of the kernel session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid.fit(X_train, y_train)

print(f"Best hyperparameters: {grid.best_params_}")


Best hyperparameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


In [71]:
# Rebuild the model from the hyperparameters the grid search selected, rather than
# copying them by hand from the printed output, so this cell stays in sync if the data
# or the search grid changes.
model = LogisticRegression(**grid.best_params_)
model.fit(X_train, y_train)


In [72]:
# Predicted class for every row of the test split, using the tuned model.
y_pred = model.predict(X = X_test)
y_pred


array([0., 0., 0., ..., 0., 0., 0.])

In [73]:
# Test-set accuracy of the tuned model, for comparison with the baseline score.
accuracy_score(y_true = y_test, y_pred = y_pred)


0.8958232151529869