# The Naive Bayes Classifier (NB)

In [1]:
# Import required packages for this chapter

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Personal Loan Acceptance.

We will be using the universalbank.csv again for this assignemnt. 

The file universalbank.csv contains data on 5000 customers of Universal Bank. The data include customer demographic information (age, income, etc.), the customerâ€™s relationship with the bank (mortgage, securities account, etc.), and the customer response to the last personal loan campaign (Personal Loan). Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was offered to them in the earlier campaign. In this exercise, we focus on three predictors: age, income, experience, and the outcome Personal Loan.

Partition the data into training (60%) and validation (40%) sets.

In [2]:
# Load the data into band_df dataframe bank_df

bank_df=pd.read_csv("C:/Users/lisaj/OneDrive/Documents/Documents/MIS 536/Module6Datasets/universalbank.csv")

# Only keep the columns we need: ncome, Experience, Age, Personal Loan.Drop the rest.

bank_df=bank_df.drop(columns=['ID', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Securities Account', 'CD Account', 'Online', 'CreditCard'])


# Use critical functions to explore the dataframe using print() to show results

print('\n', bank_df.head())
print('\n', bank_df.shape)
print('\n', bank_df.count())

print('\n', bank_df.describe())



    Age  Experience  Income  Personal Loan
0   25           1      49              0
1   45          19      34              0
2   39          15      11              0
3   35           9     100              0
4   35           8      45              0

 (5000, 4)

 Age              5000
Experience       5000
Income           5000
Personal Loan    5000
dtype: int64

                Age   Experience       Income  Personal Loan
count  5000.000000  5000.000000  5000.000000    5000.000000
mean     45.338400    20.104600    73.774200       0.096000
std      11.463166    11.467954    46.033729       0.294621
min      23.000000    -3.000000     8.000000       0.000000
25%      35.000000    10.000000    39.000000       0.000000
50%      45.000000    20.000000    64.000000       0.000000
75%      55.000000    30.000000    98.000000       0.000000
max      67.000000    43.000000   224.000000       1.000000


In [3]:
# Does the data needs further cleaning?

# Yes, the data needs further cleaning.

# If you think so, write your clearning process here.

bank_df=bank_df.rename(columns={'Personal Loan': 'Personal_Loan'})


print(bank_df)

      Age  Experience  Income  Personal_Loan
0      25           1      49              0
1      45          19      34              0
2      39          15      11              0
3      35           9     100              0
4      35           8      45              0
...   ...         ...     ...            ...
4995   29           3      40              0
4996   30           4      15              0
4997   63          39      24              0
4998   65          40      49              0
4999   28           4      83              0

[5000 rows x 4 columns]


In [4]:
# Split the data into training and validation sets


train_df, valid_df=train_test_split(bank_df, test_size=0.4, random_state=1)
print('Training set:', train_df.shape, 'Validation set:', valid_df.shape)




Training set: (3000, 4) Validation set: (2000, 4)


In [5]:
# Construct the model the perform analysis

outcome='Personal_Loan'
X=list(bank_df.columns)
X.remove(outcome)

train_X=train_df[X]
valid_X=valid_df[X]
train_y=train_df[outcome]
valid_y=valid_df[outcome]

print(train_X.head(), "\n", valid_X.head())

classifier=GaussianNB()

classifier.fit(train_X, train_y)


      Age  Experience  Income
4522   31           5      29
2851   61          36      81
2313   58          32      54
982    58          33      52
1164   41          17      94 
       Age  Experience  Income
2764   31           5      84
4767   35           9      45
3814   34           9      35
3499   49          23     114
2735   36          12      70


GaussianNB()

In [9]:
# Predict the classification for test dataset

predicted=classifier.predict(valid_X)
predProb=classifier.predict_proba(valid_X)

print("Predicted Value:", predicted)
print("\nPredicted Probability:", predProb)

y_predict=classifier.predict(valid_X)
y_predProb=classifier.predict_proba(valid_X)


# Append your prediction, predicted probability to the testing dataset and print the new dataset out using print()
# You should be able to view your prediction, observed outcome, and predictors for each data point side by side.

predicted=pd.concat([valid_X, pd.DataFrame(y_predProb, index=valid_X.index)], axis=1)
predicted=pd.concat([predicted, pd.DataFrame(y_predict, index=predicted.index, columns=["Predicted"])], axis=1)
predicted=pd.concat([predicted, pd.DataFrame(valid_y, index=predicted.index)], axis=1)

print(predicted)


Predicted Value: [0 0 0 ... 0 0 0]

Predicted Probability: [[9.72949978e-01 2.70500221e-02]
 [9.98913372e-01 1.08662825e-03]
 [9.99564824e-01 4.35175823e-04]
 ...
 [9.99966379e-01 3.36205279e-05]
 [9.99706813e-01 2.93187432e-04]
 [9.99424300e-01 5.75699971e-04]]
      Age  Experience  Income         0         1  Predicted  Personal_Loan
2764   31           5      84  0.972950  0.027050          0              0
4767   35           9      45  0.998913  0.001087          0              0
3814   34           9      35  0.999565  0.000435          0              0
3499   49          23     114  0.862479  0.137521          0              0
2735   36          12      70  0.991235  0.008765          0              0
...   ...         ...     ...       ...       ...        ...            ...
4372   34          10      41  0.999244  0.000756          0              0
3401   39          15      28  0.999794  0.000206          0              0
1239   51          26      12  0.999966  0.000034    

In [10]:
# calculate the accuracy of your prediction against the observed outcome.

print("Accuracy: ", metrics.accuracy_score(valid_y, y_predict))


# How well do you think the model does?

# The model's prediction accuracy is 90%, which is pretty high.

Accuracy:  0.901


In [None]:
# Interpret your results:

# The first result [[9.72949978e-01 2.70500221e-02] means that the person should not be given a loan since the predicted value is zero. There's a 97.3% chance that person should 
# be assigned to the class where he/she is not given a loan and a less than 1% chance that this person belongs to class 1 where he/she would be given a loan. The Gaussian model 
# created for this assignment is 90% accurate.

# Lesson learned from this lab:

# I used the Gaussian model to predict whether individuals in the bank dataframe would be given a personal loan based on the following predictors: age, experience, and income. The Gaussian
# model is approoriate for this problem since there are just two outcomes and the predictors are continuous. 


# Automobile Accidents

The file accidents.csv contains information on 42,183 actual automobile accidents in 2001 in the United States that involved one of three levels of injury: NO INJURY, INJURY, or FATALITY. For each accident, additional information is recorded, such as day of week, weather conditions, and road type. A firm might be interested in developing a system for quickly classifying the severity of an accident based on three predictors: weather conditions (WEATHER_R), traffic conditions (TRAF_CON_R), and road type (INT_HWY).

Our goal here is to predict whether an accident just reported will involve fatality (MAX_SEV_IR = 2), a non fetal injury (MAX_SEV_IR = 1) or not injury (MAX_SEV_IR = 0).

Partition the data into training (80%) and validation (20%) sets.

In [17]:
# Load the data into band_df dataframe accidents_df

accidents_df=pd.read_csv("C:/Users/lisaj/OneDrive/Documents/Documents/MIS 536/Module6Datasets/accidents.csv")

# Only keep the columns we need.Drop the rest.
accidents_df=accidents_df.drop(columns=['HOUR_I_R', 'ALCHL_I', 'ALIGN_I', 'STRATUM_R', 'WRK_ZONE', 'WKDY_I_R', 'LGTCON_I_R', 'MANCOL_I_R', 'PED_ACC_R', 'RELJCT_I_R', 'REL_RWY_R', 'PROFIL_I_R', 'SPD_LIM', 'SUR_COND', 'TRAF_WAY', 'VEH_INVL', 'INJURY_CRASH', 'NO_INJ_I', 'PRPTYDMG_CRASH', 'FATALITIES'])

# Use critical functions to explore the dataframe using print() to show results

print('\n', accidents_df.head())
print('\n', accidents_df.shape)
print('\n', accidents_df.count())

print('\n', accidents_df.describe())




    INT_HWY  TRAF_CON_R  WEATHER_R  MAX_SEV_IR
0        0           0          1           1
1        1           0          2           0
2        0           1          2           0
3        0           1          1           0
4        0           0          1           0

 (42183, 4)

 INT_HWY       42183
TRAF_CON_R    42183
WEATHER_R     42183
MAX_SEV_IR    42183
dtype: int64

             INT_HWY    TRAF_CON_R     WEATHER_R    MAX_SEV_IR
count  42183.000000  42183.000000  42183.000000  42183.000000
mean       0.150321      0.516322      1.142783      0.519830
std        0.418952      0.749417      0.349855      0.521256
min        0.000000      0.000000      1.000000      0.000000
25%        0.000000      0.000000      1.000000      0.000000
50%        0.000000      0.000000      1.000000      1.000000
75%        0.000000      1.000000      1.000000      1.000000
max        9.000000      2.000000      2.000000      2.000000


In [20]:
# Split dataset into training set and test set: 80% training and 20% validation

train_df, valid_df=train_test_split(accidents_df, test_size=0.2, random_state=109)

print('Training set:', train_df.shape, 'Validation set:', valid_df.shape)

outcome='MAX_SEV_IR'
X=list(accidents_df.columns)
X.remove(outcome)

train_X=train_df[X]
valid_X=valid_df[X]
train_y=train_df[outcome]
valid_y=valid_df[outcome]

print(train_X.head(), "\n", valid_X.head())




Training set: (33746, 4) Validation set: (8437, 4)
       INT_HWY  TRAF_CON_R  WEATHER_R
41800        0           0          1
34599        0           0          1
37150        0           2          1
11254        0           0          1
18669        0           1          1 
        INT_HWY  TRAF_CON_R  WEATHER_R
36877        0           0          1
34874        0           2          1
41190        0           0          1
38131        0           2          1
30951        1           0          1


In [21]:
# Construct the model the perform analysis

mnomial=MultinomialNB()
mnomial.fit(train_X, train_y)



MultinomialNB()

In [23]:
# Predict the classification for test dataset

y_predict=mnomial.predict(valid_X)
y_predProb=mnomial.predict_proba(valid_X)


# Append your prediction, predicted probability to the testing dataset and print the new dataset out using print()
# You should be able to view your prediction, observed outcome, and predictors for each data point side by side.


predicted=pd.concat([valid_X, pd.DataFrame(y_predProb, index=valid_X.index)], axis=1)
predicted=pd.concat([predicted, pd.DataFrame(y_predict, index=predicted.index, columns=["Predicted"])], axis=1)
predicted=pd.concat([predicted, pd.DataFrame(valid_y, index=predicted.index)], axis=1)

print(predicted)


       INT_HWY  TRAF_CON_R  WEATHER_R         0         1         2  \
36877        0           0          1  0.500996  0.487803  0.011201   
34874        0           2          1  0.459190  0.533915  0.006895   
41190        0           0          1  0.500996  0.487803  0.011201   
38131        0           2          1  0.459190  0.533915  0.006895   
30951        1           0          1  0.511824  0.471307  0.016869   
...        ...         ...        ...       ...       ...       ...   
22355        0           2          1  0.459190  0.533915  0.006895   
22728        0           0          1  0.500996  0.487803  0.011201   
9850         0           0          1  0.500996  0.487803  0.011201   
10795        0           0          1  0.500996  0.487803  0.011201   
21219        0           1          1  0.480231  0.510970  0.008799   

       Predicted  MAX_SEV_IR  
36877          0           1  
34874          1           1  
41190          0           1  
38131          1       

In [24]:
# compute model accuracy of your prediction against observed outcomes.

print("Accuracy: ", metrics.accuracy_score(valid_y, y_predict))


# How well do you think the model does?

# The model doesn't do a great job of prediction. It can only predict 51% of the classifications correctly. 


Accuracy:  0.5121488680810715


In [None]:
# Extra Credit: Can you improve the accuracy of the model to above 0.08 by finding a different set of the predictors?
# Show you model below:




In [None]:
# Interpret your results:

# Using the first result as an example, it is incorrect. The greatest probability, 50%, indicates that it will belong in class 0, "no injurty." However, this accident was 
# actually categorized as '1', a "nonfatal injury." Many other predictions in my model are incorrect since the model only predicts 51% of the classifications correctly. 

# Lesson learned from this lab:

# The multinomial classifier is the model that was used in this exercise. It is appropriate to use since there are more than two outcomes possible in this model. The three possible
# outcomes are 0=no injurty, 1-=nonfatal injury, 2=fatality. The classifiers used in this model are traffic conditions, weather conditions, and road type. 

