In [1]:
# Import the dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Read the CSV into a DataFrame
data = pd.read_csv('./Resources/sales.csv')

In [3]:
# List the columns
data.head()


Unnamed: 0,PageVisitsCat1,PageVisitDurationCat1,PageVisitsCat2,PageVisitDurationCat2,PageVisitsCat3,PageVisitDurationCat3,BounceRates,ExitRates,PageValues,HowCloseToSpecialDay,Month,VisitorType,Weekend,Revenue
0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,Feb,Returning_Visitor,False,False
1,0.0,0.0,0.0,0.0,2.0,64.0,0.0,0.1,0.0,0.0,Feb,Returning_Visitor,False,False
2,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,0.0,0.0,Feb,Returning_Visitor,False,False
3,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,Feb,Returning_Visitor,True,False
4,0.0,0.0,0.0,0.0,19.0,154.216667,0.015789,0.024561,0.0,0.0,Feb,Returning_Visitor,False,False


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12283 entries, 0 to 12282
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   PageVisitsCat1         12283 non-null  float64
 1   PageVisitDurationCat1  12283 non-null  float64
 2   PageVisitsCat2         12283 non-null  float64
 3   PageVisitDurationCat2  12283 non-null  float64
 4   PageVisitsCat3         12283 non-null  float64
 5   PageVisitDurationCat3  12283 non-null  float64
 6   BounceRates            12283 non-null  float64
 7   ExitRates              12283 non-null  float64
 8   PageValues             12283 non-null  float64
 9   HowCloseToSpecialDay   12283 non-null  float64
 10  Month                  12283 non-null  object 
 11  VisitorType            12283 non-null  object 
 12  Weekend                12283 non-null  bool   
 13  Revenue                12283 non-null  bool   
dtypes: bool(2), float64(10), object(2)
memory usage: 1.1+ 

### Description of columns:

**PageVisitsCatX:** Number of pages visited by a visitor. Cat1 is administrative, Cat2 is informational, and Cat3 is product related.

**PageVisitDurationCatX:** How long a visitor stayed on the category of pages.

**BounceRates:** Percentage of visitors who landed and exited a page.

**ExitRates:** Percentage of visitors who left the site from that page.

**PageValues:** A measurement of a page's contribution to a sale.

**HowCloseToSpecialDay:** How close is browsing date to a special day or a holiday, such as Valentine's day? Higher numbers are closer.

**Month:** Month of visit.

**VisitorType:** Is the visitor a new or returning visitor?

**Weekend:** Did the visit occur on a weekend day?

**Revenue:** Did the visit conclude in a sale?

## Perform Data Analysis

In [5]:
# Retrieve the number visits for each month.
Visits_per_month_df=data.groupby(['Month'])['PageVisitsCat1','PageVisitsCat2','PageVisitsCat3'].count()
Visits_per_month_df.head(25)

  Visits_per_month_df=data.groupby(['Month'])['PageVisitsCat1','PageVisitsCat2','PageVisitsCat3'].count()


Unnamed: 0_level_0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aug,433,433,433
Dec,1727,1727,1727
Feb,171,171,171
Jul,431,431,431
June,288,288,288
Mar,1884,1884,1884
May,3357,3357,3357
Nov,2995,2995,2995
Oct,549,549,549
Sep,448,448,448


In [6]:
# Retrieve the number of visits that resulted in a purchase or not.
Visits_sales_df=data.groupby(['Revenue'])['PageVisitsCat1','PageVisitsCat2','PageVisitsCat3'].count()
Visits_sales_df.head(25)

  Visits_sales_df=data.groupby(['Revenue'])['PageVisitsCat1','PageVisitsCat2','PageVisitsCat3'].count()


Unnamed: 0_level_0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3
Revenue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,10375,10375,10375
True,1908,1908,1908


In [7]:
# What percentage of visits resulted in a purchase?
#  15.5%

In [8]:
# Filter the DataFrame for all purchases.
all_purchases_df=data.groupby(['Revenue']).count()
all_purchases_df


Unnamed: 0_level_0,PageVisitsCat1,PageVisitDurationCat1,PageVisitsCat2,PageVisitDurationCat2,PageVisitsCat3,PageVisitDurationCat3,BounceRates,ExitRates,PageValues,HowCloseToSpecialDay,Month,VisitorType,Weekend
Revenue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
False,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375
True,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908


In [9]:
y=data.groupby(['Revenue']).count()
y

Unnamed: 0_level_0,PageVisitsCat1,PageVisitDurationCat1,PageVisitsCat2,PageVisitDurationCat2,PageVisitsCat3,PageVisitDurationCat3,BounceRates,ExitRates,PageValues,HowCloseToSpecialDay,Month,VisitorType,Weekend
Revenue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
False,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375,10375
True,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908,1908


In [10]:
# What is the number of visits for each month that resulted in a purchase?
visits_purchase_df=data.groupby(['Month','Revenue'])['PageVisitsCat1'].count()
visits_purchase_df

Month  Revenue
Aug    False       357
       True         76
Dec    False      1511
       True        216
Feb    False       168
       True          3
Jul    False       365
       True         66
June   False       259
       True         29
Mar    False      1692
       True        192
May    False      2992
       True        365
Nov    False      2235
       True        760
Oct    False       434
       True        115
Sep    False       362
       True         86
Name: PageVisitsCat1, dtype: int64

In [11]:
# Which month had the greatest number of visits where a purchase was made?

#  November==760


In [12]:
# How many visits were from returning visitors?


data[data['VisitorType']=='Returning_Visitor'].count()

#10504

PageVisitsCat1           10504
PageVisitDurationCat1    10504
PageVisitsCat2           10504
PageVisitDurationCat2    10504
PageVisitsCat3           10504
PageVisitDurationCat3    10504
BounceRates              10504
ExitRates                10504
PageValues               10504
HowCloseToSpecialDay     10504
Month                    10504
VisitorType              10504
Weekend                  10504
Revenue                  10504
dtype: int64

In [13]:
# How many visits were from new visitors?
data[data['VisitorType']=='New_Visitor'].count()
#1694

PageVisitsCat1           1694
PageVisitDurationCat1    1694
PageVisitsCat2           1694
PageVisitDurationCat2    1694
PageVisitsCat3           1694
PageVisitDurationCat3    1694
BounceRates              1694
ExitRates                1694
PageValues               1694
HowCloseToSpecialDay     1694
Month                    1694
VisitorType              1694
Weekend                  1694
Revenue                  1694
dtype: int64

In [14]:
#visits_per_group_df=data.groupby(['VisitorType']).count()
#visits_per_group_df

In [15]:
# How many visits took place on a weekday? And, on the weekend?
visits_weekday_df=data.groupby(['Weekend'])['PageVisitsCat1'].count()
visits_weekday_df

#Weekend=9421
#Weekday=2862

Weekend
False    9421
True     2862
Name: PageVisitsCat1, dtype: int64

## Perform Logistic Regression 

In [22]:
# Convert categorical variables to binary variables 
data=pd.get_dummies(data)

In [24]:
# Separate the features and target variables.
y = data["Revenue"]
X = data.drop(columns="Revenue")

In [69]:
# Split the dataset into training and testing sets
# from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(9212, 24)

In [79]:
# Instantiate a logistic regression model

classifier = LogisticRegression(solver='lbfgs', random_state=1,max_iter=900)
classifier

LogisticRegression(max_iter=900, random_state=1)

In [80]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=900, random_state=1)

In [81]:
# Use the testing data to make predictions.
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head()

Unnamed: 0,Prediction,Actual
3228,False,False
4949,False,False
156,False,False
3085,False,False
2259,False,False


In [82]:
# Calculate the accuracy. 
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.8873331162487789

In [83]:
# Print the confusion matrix.

from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test
y_pred = classifier.predict(X_test)
# Print the confusion matrix on the predictions. 
confusion_matrix(y_true, y_pred)

array([[2541,   53],
       [ 293,  184]], dtype=int64)

In [84]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)

2541 53 293 184


In [85]:
# Print a classification report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       False       0.90      0.98      0.94      2594
        True       0.78      0.39      0.52       477

    accuracy                           0.89      3071
   macro avg       0.84      0.68      0.73      3071
weighted avg       0.88      0.89      0.87      3071



## Overall Model Performance
----

- **Question:** Can the logistic regression model predict whether visiting a website will result in a purchase? 

- **Answer:**  


In [None]:
# I say no eventhough the data suggests that it has an accuracy of 89 percent but it has a very low recall for TP and a low 
# precision for TP