In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# Will the customer buy product after the haircut?
This will be based off the survey data we have of over 100 responses

## The Dataset
The following acoustic properties of each voice are measured and included within the CSV:

* Timestamp
* E-mail Address
* Work Zip Code
* Home Zip Code
* Business Name
* City, State last haircut
* Gender
* Age
* Race
* Income Range
* Time since last haircut
* Time between haircuts
* Buy Products
* How much spent last haircut
* Maximum spend for haircut
* How find current barber
* Leave reviews online
* Importance of Price (1-5)
* Importance of Convenience (1-5)
* Importance of Atmosphere (1-5)
* Importance of Additional Services (1-5)
* Additional Comments

In [71]:
# read in CSV
df = pd.read_csv('../data/survey04172018.csv',index_col=None)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 17 columns):
work_zip               104 non-null int64
home_zip               104 non-null int64
gender_values          104 non-null int64
age                    104 non-null float64
race_values            104 non-null float64
income_values          104 non-null float64
days_last_values       104 non-null float64
time_between_values    104 non-null float64
products_values        104 non-null float64
spend_values           104 non-null float64
max_spend_values       104 non-null float64
how_find_values        104 non-null float64
review_values          104 non-null float64
price                  104 non-null int64
convenient             104 non-null int64
atmosphere             104 non-null int64
amenities              104 non-null int64
dtypes: float64(10), int64(7)
memory usage: 13.9 KB


In [84]:
# Separate majority and minority classes
df_majority = df[df.products_values==0]
df_minority = df[df.products_values==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=104,    # to match majority class
                                 random_state=14) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.products_values.value_counts()
# 1    576
# 0    576
# Name: balance, dtype: int64

1.0    104
0.0     87
Name: products_values, dtype: int64

Split our data into training and testing

In [87]:
# Assign X (data) and y (target)

### BEGIN SOLUTION
y = df_upsampled.products_values
X = df_upsampled.drop('products_values', axis=1)

print(X.shape, y.shape)
### END SOLUTION

(191, 16) (191,)


In [100]:
clf_1 = LogisticRegression().fit(X, y)

# Predict on training set
pred_y_1 = clf_1.predict(X)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y, pred_y_1) )
# 0.513888888889

[ 0.  1.]
0.769633507853


In [90]:
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
### END SOLUTION


Create a Logistic Regression Model

In [91]:
### BEGIN SOLUTION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(class_weight='balanced')
classifier
### END SOLUTION

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Fit (train) or model using the training data

In [92]:
### BEGIN SOLUTION
classifier.fit(X_train, y_train)
### END SOLUTION

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

Validate the model using the test data

In [101]:
### BEGIN SOLUTION
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
### END SOLUTION

Training Data Score: 0.7972027972027972
Testing Data Score: 0.6041666666666666


Make predictions

In [102]:
### BEGIN SOLUTION
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")
### END SOLUTION

First 10 Predictions:   [ 0.  1.  1.  0.  1.  1.  0.  0.  1.  1.]
First 10 Actual labels: [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0]


In [95]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Actual,Prediction
0,1.0,0.0
1,0.0,1.0
2,1.0,1.0
3,0.0,0.0
4,0.0,1.0
5,0.0,1.0
6,1.0,0.0
7,0.0,0.0
8,1.0,1.0
9,1.0,1.0
