In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt

from sklearn.metrics import precision_score, recall_score, f1_score
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
customer_df=pd.read_csv('./customer data/Customers.csv')
customer_df

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...,...
1995,1996,Female,71,184387,40,Artist,8,7
1996,1997,Female,91,73158,32,Doctor,7,7
1997,1998,Male,87,90961,14,Healthcare,9,2
1998,1999,Male,77,182109,4,Executive,7,2


In [3]:
# Check for null values and find sum
customer_df.isna().sum()

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

In [4]:
# drop null values and check to make sure null is all zeros nulls are dropped
customer_df=customer_df.dropna()
customer_df.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income ($)         0
Spending Score (1-100)    0
Profession                0
Work Experience           0
Family Size               0
dtype: int64

In [5]:
# check for duplicates 
customer_df.duplicated().sum()
## found there were no duplicates

0

In [6]:
# Check dtypes
customer_df.dtypes

CustomerID                 int64
Gender                    object
Age                        int64
Annual Income ($)          int64
Spending Score (1-100)     int64
Profession                object
Work Experience            int64
Family Size                int64
dtype: object

In [7]:
customer_df.describe()
# Average customer age: 48
# Average annual Income: 110616
# Average soending score: 51
# Average family size : 3.75

Unnamed: 0,CustomerID,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size
count,1965.0,1965.0,1965.0,1965.0,1965.0,1965.0
mean,1000.309924,48.894656,110616.009669,51.07888,4.092621,3.757252
std,578.443714,28.414889,45833.860195,27.977176,3.926459,1.968335
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,498.0,25.0,74350.0,28.0,1.0,2.0
50%,1000.0,48.0,109759.0,50.0,3.0,4.0
75%,1502.0,73.0,149095.0,75.0,7.0,5.0
max,2000.0,99.0,189974.0,100.0,17.0,9.0


In [8]:
# Drop the Age column because the data seem inaccurate
# Such as Age for a doctor being 0, after further investigating the data I decided to drop the column because there couldn't be a 2 year old engineer
customer_df.drop(['Age'],axis=1,inplace=True)
customer_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CustomerID,Gender,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,15000,39,Healthcare,1,4
1,2,Male,35000,81,Engineer,3,3
2,3,Female,86000,6,Engineer,1,1
3,4,Female,59000,77,Lawyer,0,2
4,5,Female,38000,40,Entertainment,2,6


In [9]:
# set cleaned data to new data frame
clean_customer_df=pd.DataFrame(customer_df)
clean_customer_df.head()

Unnamed: 0,CustomerID,Gender,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,15000,39,Healthcare,1,4
1,2,Male,35000,81,Engineer,3,3
2,3,Female,86000,6,Engineer,1,1
3,4,Female,59000,77,Lawyer,0,2
4,5,Female,38000,40,Entertainment,2,6


In [10]:
profession_count=clean_customer_df.groupby(["Profession"]).count()["CustomerID"]
profession_count

Profession
Artist           612
Doctor           161
Engineer         179
Entertainment    234
Executive        153
Healthcare       339
Homemaker         60
Lawyer           142
Marketing         85
Name: CustomerID, dtype: int64

In [11]:
work_experience=clean_customer_df.groupby(["Profession"]).mean()["Work Experience"]
work_experience

Profession
Artist           4.215686
Doctor           4.304348
Engineer         3.955307
Entertainment    3.500000
Executive        4.248366
Healthcare       4.002950
Homemaker        6.133333
Lawyer           3.528169
Marketing        4.305882
Name: Work Experience, dtype: float64

In [12]:
# Create new csv of cleaned data
clean_data=customer_df.to_csv("clean_data.csv", index=True)

In [13]:
# Drop customer id column
customer_df.drop(['CustomerID'],axis=1,inplace=True)
customer_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Gender,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,Male,15000,39,Healthcare,1,4
1,Male,35000,81,Engineer,3,3
2,Female,86000,6,Engineer,1,1
3,Female,59000,77,Lawyer,0,2
4,Female,38000,40,Entertainment,2,6


In [14]:
# Unique value types of each column
unique=customer_df.nunique()
print(unique)

Gender                       2
Annual Income ($)         1755
Spending Score (1-100)     101
Profession                   9
Work Experience             18
Family Size                  9
dtype: int64


In [15]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(customer_df.Gender.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['Gender'])
encode_df.head()



Unnamed: 0,Gender_Female,Gender_Male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [16]:
# Merge the two DataFrames together and drop the Gender column
customer_df=customer_df.merge(encode_df,left_index=True,right_index=True).drop("Gender",1)


  


In [17]:
# Fit the encoder and produce encoded DataFrame
encode_df2 = pd.DataFrame(enc.fit_transform(customer_df.Profession.values.reshape(-1,1)))

# Rename encoded columns
encode_df2.columns = enc.get_feature_names(['Profession'])
encode_df2.head()



Unnamed: 0,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Merge the two DataFrames together and drop the Gender column
customer_df=customer_df.merge(encode_df2,left_index=True,right_index=True).drop("Profession",1)

  


In [19]:
customer_df.head()

Unnamed: 0,Annual Income ($),Spending Score (1-100),Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,15000,39,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,35000,81,3,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86000,6,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59000,77,0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,38000,40,2,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Rename Spending Score column
customer_df=customer_df.rename(columns={'Spending Score (1-100)':'Spending_Score'})
customer_df     

Unnamed: 0,Annual Income ($),Spending_Score,Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,15000,39,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,35000,81,3,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86000,6,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59000,77,0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,38000,40,2,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925,105935,46,4,5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1926,74607,69,6,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1927,151985,29,4,7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1928,76892,64,5,2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def change_string(Spending_Score):
    if Spending_Score >= 75:
        return 1
    else:
        return 0
customer_df["Spending_Score"]=customer_df["Spending_Score"].apply(change_string)
customer_df.head()

Unnamed: 0,Annual Income ($),Spending_Score,Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,15000,0,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,35000,1,3,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86000,0,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59000,1,0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,38000,0,2,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Rename Spending_Score column  to Spending_Score >= 75
customer_df=customer_df.rename({"Spending_Score":"Spending_Score >= 75"},axis=1)
customer_df.head()

Unnamed: 0,Annual Income ($),Spending_Score >= 75,Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,15000,0,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,35000,1,3,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86000,0,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59000,1,0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,38000,0,2,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Define the features set.
X = customer_df.copy()
X = X.drop("Spending_Score >= 75", axis=1)
X.head()

Unnamed: 0,Annual Income ($),Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,15000,1,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,35000,3,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86000,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59000,0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,38000,2,6,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Define the target set.
y = customer_df["Spending_Score >= 75"].ravel()
y[:5]


array([0, 1, 0, 1, 0], dtype=int64)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [26]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [28]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [29]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [30]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,298,37
Actual 1,122,17


In [31]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(acc_score)

0.6645569620253164


In [32]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5100840897528937, 'Annual Income ($)'),
 (0.2042497896929242, 'Work Experience'),
 (0.16144192748605043, 'Family Size'),
 (0.014535466771522515, 'Profession_Artist'),
 (0.013914148368601314, 'Gender_Female'),
 (0.013315162418591247, 'Gender_Male'),
 (0.013176844411399581, 'Profession_Healthcare'),
 (0.011790076558037899, 'Profession_Entertainment'),
 (0.011487589985583075, 'Profession_Engineer'),
 (0.0106792392681002, 'Profession_Lawyer'),
 (0.010542425046544089, 'Profession_Doctor'),
 (0.00890048465121927, 'Profession_Marketing'),
 (0.008768705225498358, 'Profession_Executive'),
 (0.007114050363034274, 'Profession_Homemaker')]

In [33]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,298,37
Actual 1,122,17


Accuracy Score : 0.6645569620253164
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.89      0.79       335
           1       0.31      0.12      0.18       139

    accuracy                           0.66       474
   macro avg       0.51      0.51      0.48       474
weighted avg       0.59      0.66      0.61       474



In [34]:
from sklearn.linear_model import LogisticRegression
from collections import Counter

### OVERSAMPLING
## Native Random Oversampling
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 1072, 1: 1072})

In [35]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [36]:
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5

In [37]:
#### Decision Tree Model
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
# Creating the decision tree classifier instance.
from sklearn import tree
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [39]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [40]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,230,105
Actual 1,99,40


In [41]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,230,105
Actual 1,99,40


Accuracy Score : 0.6645569620253164
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.69      0.69       335
           1       0.28      0.29      0.28       139

    accuracy                           0.57       474
   macro avg       0.49      0.49      0.49       474
weighted avg       0.57      0.57      0.57       474

