In [1]:
# Import pandas
import pandas as pd

# Load dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None)

# Inspect data
print(cc_apps.head(5))

  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +


In [3]:
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print("\n")

# Print DataFrame information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

print("\n")

# Inspect missing values in the dataset
print(cc_apps.isna().sum().sort_values())

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 no

In [5]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

In [15]:
# Import numpy
import numpy as np

# Replace the '?'s with NaN in the train and test sets
cc_apps_train = cc_apps_train.replace("?", np.NaN)
cc_apps_test = cc_apps_test.replace("?", np.NaN)
numeric_cols_train = cc_apps_train.select_dtypes(include=[np.number])
numeric_cols_test = cc_apps_test.select_dtypes(include=[np.number])

# Impute missing values in numeric columns with the mean
cc_apps_train[numeric_cols_train.columns] = numeric_cols_train.fillna(
    numeric_cols_train.mean()
)
cc_apps_test[numeric_cols_test.columns] = numeric_cols_test.fillna(
    numeric_cols_train.mean()
)  # Fill with train set mean

# Count the number of NaNs in the datasets and print the counts to verify
print("Number of NaNs in cc_apps_train:\n", cc_apps_train.isna().sum())
print("Number of NaNs in cc_apps_test:\n", cc_apps_test.isna().sum())
# Iterate over each column of cc_apps_train
for col in cc_apps_train.columns:
    # Check if the column is of object type
    if cc_apps_train[col].dtypes == "object":
        # Impute with the most frequent value
        cc_apps_train = cc_apps_train.fillna(cc_apps_train[col].value_counts().index[0])
        cc_apps_test = cc_apps_test.fillna(cc_apps_test[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print(cc_apps_train.isnull().sum())
print(cc_apps_test.isnull().sum())

Number of NaNs in cc_apps_train:
 2           0
7           0
10          0
14          0
0_a         0
           ..
13_00720    0
13_02000    0
13_b        0
15_+        0
15_-        0
Length: 459, dtype: int64
Number of NaNs in cc_apps_test:
 2           0
7           0
10          0
14          0
0_a         0
           ..
13_00720    0
13_02000    0
13_b        0
15_+        0
15_-        0
Length: 459, dtype: int64
2           0
7           0
10          0
14          0
0_a         0
           ..
13_00720    0
13_02000    0
13_b        0
15_+        0
15_-        0
Length: 459, dtype: int64
2           0
7           0
10          0
14          0
0_a         0
           ..
13_00720    0
13_02000    0
13_b        0
15_+        0
15_-        0
Length: 459, dtype: int64


In [17]:
# Convert the categorical features in the train and test sets independently
cc_apps_train = pd.get_dummies(cc_apps_train)
cc_apps_test = pd.get_dummies(cc_apps_test)

# Reindex the columns of the test set aligning with the train set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Segregate features and labels into separate variables
X_train, y_train = cc_apps_train.iloc[:, :-1].values, cc_apps_train.iloc[:, [-1]].values
X_test, y_test = cc_apps_test.iloc[:, :-1].values, cc_apps_test.iloc[:, [-1]].values

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)


In [19]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print(
    "Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test)
)

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  1.0
[[103   0]
 [  0 125]]
