In [1]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas

## Import the Pandas DataFrame

In [2]:
# Load the credit-card CSV file into a pandas DataFrame
ccinfo_df = pd.read_csv(Path("../Resources/cc_info_default.csv"))

In [3]:
# Review the last rows of the DataFrame
ccinfo_df.tail()

Unnamed: 0,limit_bal,education,marriage,age,bill_amt,pay_amt,default
4994,20000,secondary,yes,36,110994,7293,0
4995,180000,other,yes,34,35240,22066,0
4996,200000,secondary,yes,45,691806,21443,1
4997,310000,post-grad,yes,44,1548067,72000,0
4998,160000,primary,no,40,4440,3725,0


In [4]:
# Review the column names, non-null counts, and dtypes
ccinfo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   limit_bal  4999 non-null   int64 
 1   education  4999 non-null   object
 2   marriage   4999 non-null   object
 3   age        4999 non-null   int64 
 4   bill_amt   4999 non-null   int64 
 5   pay_amt    4999 non-null   int64 
 6   default    4999 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 273.5+ KB


## Transform "education" column with get_dummies

In [5]:
# Count how many rows fall into each education category
ccinfo_df["education"].value_counts()

education
secondary    2267
primary      1862
post-grad     822
other          48
Name: count, dtype: int64

In [6]:
# One-hot encode the education column and cast the indicator columns to integers
education_dummies = pd.get_dummies(ccinfo_df["education"]).astype(int)
# Display the encoded columns
education_dummies

Unnamed: 0,other,post-grad,primary,secondary
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
4994,0,0,0,1
4995,1,0,0,0
4996,0,0,0,1
4997,0,1,0,0


In [7]:
# Replace the original education column with its dummy columns:
# drop "education" from ccinfo_df and concatenate education_dummies column-wise.
# Both steps happen in a single expression so that, if this cell is re-run after
# "education" has already been removed, the drop raises before ccinfo_df is
# reassigned and the DataFrame is not left with duplicated dummy columns.
ccinfo_df = pd.concat(
    [ccinfo_df.drop(columns=["education"]), education_dummies],
    axis=1,
)
# Display the DataFrame
ccinfo_df

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,20000,yes,24,7704,689,1,0,0,0,1
1,120000,no,26,17077,5000,1,0,0,0,1
2,90000,no,34,101653,11018,0,0,0,0,1
3,50000,yes,37,231334,8388,0,0,0,0,1
4,50000,yes,57,109339,59049,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4994,20000,yes,36,110994,7293,0,0,0,0,1
4995,180000,yes,34,35240,22066,0,1,0,0,0
4996,200000,yes,45,691806,21443,1,0,0,0,1
4997,310000,yes,44,1548067,72000,0,0,1,0,0


## Transform "marriage" column with encoding function

In [8]:
# Encoding the marriage column using a custom function
def encode_marriage(marriage):
    """
    Encode a marriage status as an integer flag.

    Returns 1 when `marriage` equals the string "yes" and 0 for any other value.
    """
    return 1 if marriage == "yes" else 0

# Call the encode_marriage function on the marriage column.
# NOTE: this overwrites the column in place. Re-running this cell after the
# column is already encoded turns every value into 0, because encode_marriage
# returns 0 for anything other than "yes".
ccinfo_df["marriage"] = ccinfo_df["marriage"].apply(encode_marriage)
# Review the DataFrame
ccinfo_df.head()

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,20000,1,24,7704,689,1,0,0,0,1
1,120000,0,26,17077,5000,1,0,0,0,1
2,90000,0,34,101653,11018,0,0,0,0,1
3,50000,1,37,231334,8388,0,0,0,0,1
4,50000,1,57,109339,59049,0,0,0,0,1


## Apply the Standard Scaler to "limit_bal", "bill_amt", "pay_amt"

In [9]:
# Import the module
from sklearn.preprocessing import StandardScaler

In [10]:
# Standardize the numeric columns "limit_bal", "bill_amt", and "pay_amt".
# fit_transform returns a NumPy array with one column per input column.
ccinfo_df_scaled = StandardScaler().fit_transform(
    ccinfo_df[["limit_bal", "bill_amt", "pay_amt"]]
)
# Review the scaled data
ccinfo_df_scaled

array([[-1.1173411 , -0.66070266, -0.5427793 ],
       [-0.3499424 , -0.63637003, -0.46399421],
       [-0.58016201, -0.41680786, -0.35401308],
       ...,
       [ 0.26397655,  1.1152494 , -0.16349243],
       [ 1.10811512,  3.33813208,  0.76045505],
       [-0.04298292, -0.66917611, -0.4872953 ]])

In [11]:
# Create a DataFrame of the scaled data.
# The index is taken from ccinfo_df so the column assignments below line up
# row-for-row even if ccinfo_df does not carry a default 0..n-1 index.
ccinfo_df_scaled = pd.DataFrame(
    ccinfo_df_scaled,
    columns=["limit_bal", "bill_amt", "pay_amt"],
    index=ccinfo_df.index,
)

# Replace the original columns with their scaled counterparts
ccinfo_df["limit_bal"] = ccinfo_df_scaled["limit_bal"]
ccinfo_df["bill_amt"] = ccinfo_df_scaled["bill_amt"]
ccinfo_df["pay_amt"] = ccinfo_df_scaled["pay_amt"]

# Review the DataFrame
ccinfo_df

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1
1,-0.349942,0,26,-0.636370,-0.463994,1,0,0,0,1
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4994,-1.117341,1,36,-0.392558,-0.422089,0,0,0,0,1
4995,0.110497,1,34,-0.589218,-0.152107,0,1,0,0,0
4996,0.263977,1,45,1.115249,-0.163492,1,0,0,0,1
4997,1.108115,1,44,3.338132,0.760455,0,0,1,0,0


## Elbow Method to find k

In [12]:
# Import the KMeans module from SKLearn
from sklearn.cluster import KMeans

In [13]:
# Create the list of k values to try (1 through 10) and an empty list for the inertia values
k = list(range(1, 11))
inertia = []

In [14]:
# Evaluate each value of k with the K-means algorithm:
# fit the model on the ccinfo_df DataFrame and record the inertia
# from the `inertia_` attribute of the fitted KMeans instance.
# The list is reset here so that re-running this cell does not append a second
# set of values and leave `inertia` longer than `k`.
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(ccinfo_df)
    inertia.append(k_model.inertia_)

In [15]:
# Build a DataFrame pairing each k with its inertia
df_elbow = pd.DataFrame(dict(k=k, inertia=inertia))

# Review the DataFrame
df_elbow

Unnamed: 0,k,inertia
0,1,449413.376075
1,2,152909.452108
2,3,84174.926727
3,4,58169.904941
4,5,48059.393805
5,6,37981.701542
6,7,33824.979465
7,8,29926.535573
8,9,28862.061129
9,10,27229.007209


In [16]:
# Plot the elbow curve: inertia as a function of k
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)


## Use the K-means algorithm to cluster the data

In [17]:
# Define the model with 3 clusters
# NOTE(review): n_init is left at the library default, which changed from 10 to
# 'auto' in scikit-learn 1.4 — consider pinning it so the clusters do not depend
# on the installed version.
model = KMeans(n_clusters=3, random_state=1)
# Fit the model
# NOTE(review): only limit_bal, bill_amt, and pay_amt were standardized earlier;
# age is still in raw units here, so it is on a larger numeric scale than the
# other columns — confirm this is intended.
model.fit(ccinfo_df)
# Make predictions: assign each row of ccinfo_df to one of the fitted clusters
k_3 = model.predict(ccinfo_df)
# Create a copy of the preprocessed data so ccinfo_df itself is left unchanged
ccinfo_predictions_df = ccinfo_df.copy()
# Add a "customer_segments" column holding the predicted cluster labels
ccinfo_predictions_df['customer_segments'] = k_3
# Review the DataFrame
ccinfo_predictions_df

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,customer_segments
0,-1.117341,1,24,-0.660703,-0.542779,1,0,0,0,1,0
1,-0.349942,0,26,-0.636370,-0.463994,1,0,0,0,1,0
2,-0.580162,0,34,-0.416808,-0.354013,0,0,0,0,1,2
3,-0.887121,1,37,-0.080152,-0.402077,0,0,0,0,1,2
4,-0.887121,1,57,-0.396855,0.523771,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4994,-1.117341,1,36,-0.392558,-0.422089,0,0,0,0,1,2
4995,0.110497,1,34,-0.589218,-0.152107,0,1,0,0,0,2
4996,0.263977,1,45,1.115249,-0.163492,1,0,0,0,1,2
4997,1.108115,1,44,3.338132,0.760455,0,0,1,0,0,2


In [22]:
# Plot the clusters: scaled payment amount against age, one color per segment.
# A title is set so the figure can be read on its own, as with the elbow plot.
ccinfo_predictions_df.hvplot.scatter(
    x="pay_amt",
    y="age",
    by="customer_segments",
    title="Customer Segments by Scaled Payment Amount and Age"
)