# Import Library

In [None]:
# importing libraries
import pandas as pd # data science essentials
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # enhanced data visualization
import statsmodels.formula.api as smf # linear regression (statsmodels)
import numpy as np #data science essentials
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LogisticRegression # logistic regression (scikit-learn)
from sklearn.metrics import confusion_matrix, classification_report #import confusion matrix and classfication report

# Read File

In [27]:
# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# read csv file
file = '/Users/madhuri/Desktop/Query Team 2 test.csv'

retail = pd.read_csv(file)

retail.head(5)


Unnamed: 0,customer_id,Occupation,Employment Type,Type of Client,Total quantity,Average Unit Price,Total Revenue per Customer,> 100 units
0,528056,Exec-managerial,Private,,34,2.746286,93.373714,No
1,1296893,Transport-moving,Private,,109,2.083486,227.1,YES
2,4223924,Sales,Private,Wholesaler,40,3.073,122.92,No
3,4523882,Transport-moving,Self-emp-not-inc,,15,4.62,69.3,No
4,5060461,Adm-clerical,Private,,37,2.154324,79.71,No


# Explore the Dataset

In [28]:
# formatting and printing the dimensions of the dataset
print(f"""
Size of Original Dataset
------------------------
Observations: {retail.shape[0]}
Features:     {retail.shape[1]}
""")

retail.info(verbose=True)


Size of Original Dataset
------------------------
Observations: 4058
Features:     8

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4058 entries, 0 to 4057
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customer_id                 4058 non-null   int64  
 1   Occupation                  3854 non-null   object 
 2   Employment Type             4058 non-null   object 
 3   Type of Client              974 non-null    object 
 4   Total quantity              4058 non-null   int64  
 5   Average Unit Price          4058 non-null   float64
 6   Total Revenue per Customer  4058 non-null   float64
 7   > 100  units                4058 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 253.8+ KB


# Split the Dataframe

In [41]:
# Create the dataframe without null values based on Type of Client
with_type = retail[retail["Type of Client"].notnull()]

with_type.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 974 entries, 2 to 4057
Columns: 8 entries, customer_id to > 100  units
dtypes: float64(2), int64(2), object(4)
memory usage: 68.5+ KB


In [36]:
# Create the dataframe with null values based on Type of Client
without_type = retail[retail["Type of Client"].isna()]

without_type.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3084 entries, 0 to 4056
Columns: 8 entries, customer_id to > 100  units
dtypes: float64(2), int64(2), object(4)
memory usage: 216.8+ KB


# Get Dummy Variables

In [37]:
# get dummies variables 
retail_dummies = pd.get_dummies(with_type)
 
retail_dummies =retail_dummies.drop('Type of Client_Wholesaler', axis=1) # 0 - wholesaler | 1 - personal

# read the dataset
retail_dummies.head(n=5)

Unnamed: 0,customer_id,Total quantity,Average Unit Price,Total Revenue per Customer,Occupation_Adm-clerical,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Employment Type_Federal-gov,Employment Type_Local-gov,Employment Type_Nan,Employment Type_Private,Employment Type_Self-emp-inc,Employment Type_Self-emp-not-inc,Employment Type_State-gov,Type of Client_Personal,> 100 units_No,> 100 units_YES
2,4223924,40,3.073,122.92,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
12,14606403,59,1.153051,68.03,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
19,22469235,177,2.463898,436.11,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
22,24804864,30,2.158,64.74,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
31,37876710,160,1.456625,233.06,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


# Split x and y dataset

In [42]:
# list x variables
x_data   = retail_dummies.drop(['Type of Client_Personal'],
                               axis = 1)

# list y target
y_target = retail_dummies.loc[ : , 'Type of Client_Personal']


# creating the testing and training dataset
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_target,
    test_size = 0.25, 
    random_state = 219)

# checking the shapes of the datasets
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}
""")


Training Data
-------------
X-side: (730, 26)
y-side: (730,)


Testing Data
------------
X-side: (244, 26)
y-side: (244,)



# Logistic Modeling, Confustion Matrix, and Classfication Report

In [None]:
# instantiating the model
logreg = LogisticRegression()

# fitting the model
logreg.fit(x_train, y_train)

# predicting the y variable
y_pred = logreg.predict(x_train)

# create confusion matrix and classification report for train data
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [44]:
# instantiating the model
logreg = LogisticRegression()

# fitting the model
logreg.fit(x_train, y_train)

# predicting the y variavle
y_pred = logreg.predict(x_test)

# create confusion matrix and classification report for test data
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[197   0]
 [ 47   0]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       197
           1       0.00      0.00      0.00        47

    accuracy                           0.81       244
   macro avg       0.40      0.50      0.45       244
weighted avg       0.65      0.81      0.72       244



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
