In [1]:
# Importarea bibliotecilor necesare
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [4]:

# 1. Încărcarea setului de date
file_path = r'C:\Users\user\Desktop\ASEM\ML\lab2\bank-full.xls'
try:
    data = pd.read_csv(file_path, sep=';')
    print("Date încărcate cu succes.")
except FileNotFoundError:
    raise FileNotFoundError("Fișierul specificat nu a fost găsit. Verifică calea corectă.")


Date încărcate cu succes.


In [5]:

# 2. Preprocesarea datelor

# a) Vizualizarea primelor 5 rânduri din setul de date
print("Primele 5 rânduri ale dataset-ului:")
print(data.head())

# b) Vizualizarea statisticilor descriptive
print("Statistici descriptive:")
print(data.describe())

# c) Tratarea valorilor lipsă
print("Numărul de valori lipsă per coloană:")
print(data.isnull().sum())
# Eliminăm valorile lipsă
data = data.dropna()

# d) Detectarea și eliminarea outlierilor folosind IQR
numeric_cols = data.select_dtypes(include=[np.number]).columns
Q1 = data[numeric_cols].quantile(0.25)
Q3 = data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
data = data[~((data[numeric_cols] < (Q1 - 1.5 * IQR)) | (data[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]


Primele 5 rânduri ale dataset-ului:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
Statistici descriptive:
                age        balance           day   

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Definirea caracteristicilor și a variabilei țintă
X = data.drop('y', axis=1)
y = data['y']

# Conversia variabilelor categorice în variabile dummy
X = pd.get_dummies(X, drop_first=True)

# Împărțirea datelor în seturi de antrenament și testare
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Crearea modelului de regresie logistică
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Prezicerea pe setul de testare
y_pred_log_reg = log_reg.predict(X_test)

# Evaluarea modelului
print("Matricea de confuzie:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("\nRaportul de clasificare:")
print(classification_report(y_test, y_pred_log_reg))


Matricea de confuzie:
[[7922   64]
 [ 393   79]]

Raportul de clasificare:
              precision    recall  f1-score   support

          no       0.95      0.99      0.97      7986
         yes       0.55      0.17      0.26       472

    accuracy                           0.95      8458
   macro avg       0.75      0.58      0.61      8458
weighted avg       0.93      0.95      0.93      8458



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
from sklearn.tree import DecisionTreeClassifier

# Crearea modelului arborelui decisional
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Prezicerea pe setul de testare
y_pred_dt = dt_model.predict(X_test)

# Evaluarea modelului
print("Matricea de confuzie:")
print(confusion_matrix(y_test, y_pred_dt))
print("\nRaportul de clasificare:")
print(classification_report(y_test, y_pred_dt))


Matricea de confuzie:
[[7627  359]
 [ 308  164]]

Raportul de clasificare:
              precision    recall  f1-score   support

          no       0.96      0.96      0.96      7986
         yes       0.31      0.35      0.33       472

    accuracy                           0.92      8458
   macro avg       0.64      0.65      0.64      8458
weighted avg       0.93      0.92      0.92      8458



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Crearea modelului Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Prezicerea pe setul de testare
y_pred_rf = rf_model.predict(X_test)

# Evaluarea modelului
print("Matricea de confuzie:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nRaportul de clasificare:")
print(classification_report(y_test, y_pred_rf))


Matricea de confuzie:
[[7952   34]
 [ 395   77]]

Raportul de clasificare:
              precision    recall  f1-score   support

          no       0.95      1.00      0.97      7986
         yes       0.69      0.16      0.26       472

    accuracy                           0.95      8458
   macro avg       0.82      0.58      0.62      8458
weighted avg       0.94      0.95      0.93      8458



In [9]:
import pandas as pd
from sklearn.metrics import classification_report

# Creăm un DataFrame pentru a stoca rezultatele
results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Modelul de regresie logistică
log_reg_report = classification_report(y_test, y_pred_log_reg, output_dict=True)
log_reg_results = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy': [log_reg_report['accuracy']],
    'Precision': [log_reg_report['yes']['precision']],
    'Recall': [log_reg_report['yes']['recall']],
    'F1-Score': [log_reg_report['yes']['f1-score']]
})

# Modelul arborelui decisional
dt_report = classification_report(y_test, y_pred_dt, output_dict=True)
dt_results = pd.DataFrame({
    'Model': ['Decision Tree'],
    'Accuracy': [dt_report['accuracy']],
    'Precision': [dt_report['yes']['precision']],
    'Recall': [dt_report['yes']['recall']],
    'F1-Score': [dt_report['yes']['f1-score']]
})

# Modelul Random Forest
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
rf_results = pd.DataFrame({
    'Model': ['Random Forest'],
    'Accuracy': [rf_report['accuracy']],
    'Precision': [rf_report['yes']['precision']],
    'Recall': [rf_report['yes']['recall']],
    'F1-Score': [rf_report['yes']['f1-score']]
})

# Concatenarea rezultatelor
results = pd.concat([results, log_reg_results, dt_results, rf_results], ignore_index=True)

# Afișarea rezultatelor
print("Compararea performanțelor modelelor:")
print(results)


Compararea performanțelor modelelor:
                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.945968   0.552448  0.167373  0.256911
1        Decision Tree  0.921140   0.313576  0.347458  0.329648
2        Random Forest  0.949279   0.693694  0.163136  0.264151
