In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
# Load the golf dataset from CSV; skipinitialspace skips the spaces
# that follow each delimiter.
data_file = "golf_play2.csv"
golf_df = pd.read_csv(
    filepath_or_buffer=data_file,
    skipinitialspace=True,
)
print(golf_df)

    Row No   Outlook  Temperature  Humidity   Wind Play
0        1     Sunny           85        85  False   No
1        2     Sunny           80        90   True   No
2        3  Overcast           83        78  False  Yes
3        4      Rain           70        96  False  Yes
4        5      Rain           68        80  False  Yes
5        6      Rain           65        70   True   No
6        7  Overcast           64        65   True  Yes
7        8     Sunny           72        95  False   No
8        9     Sunny           69        70  False  Yes
9       10      Rain           75        80  False  Yes
10      11     Sunny           72        75   True  Yes
11      12  Overcast           74        90   True  Yes
12      13  Overcast           86        70  False  Yes
13      14      Rain           70        95  False   No


In [3]:
# Remove the Row No column as it is not an important feature.
# This cell rebinds golf_df from itself, so on a second run the column is
# already gone; errors="ignore" makes that re-run a no-op instead of a KeyError.
golf_df = golf_df.drop(columns="Row No", errors="ignore")
print("*** Original Dataset ***")
print(golf_df)

*** Original Dataset ***
     Outlook  Temperature  Humidity   Wind Play
0      Sunny           85        85  False   No
1      Sunny           80        90   True   No
2   Overcast           83        78  False  Yes
3       Rain           70        96  False  Yes
4       Rain           68        80  False  Yes
5       Rain           65        70   True   No
6   Overcast           64        65   True  Yes
7      Sunny           72        95  False   No
8      Sunny           69        70  False  Yes
9       Rain           75        80  False  Yes
10     Sunny           72        75   True  Yes
11  Overcast           74        90   True  Yes
12  Overcast           86        70  False  Yes
13      Rain           70        95  False   No


In [4]:
# Map string values of the Outlook column to numbers
code = {"Sunny": 1, "Overcast": 2, "Rain": 3}
# Encode only while the column still holds labels: once it is numeric the
# lookup below would raise KeyError (e.g. code[1]), which made this cell
# fail when it was run a second time.
if not pd.api.types.is_numeric_dtype(golf_df["Outlook"]):
    # A label missing from `code` still raises KeyError, as before.
    golf_df["Outlook"] = [code[item] for item in golf_df["Outlook"]]
print("*** Transformed Dataset ***")
print(golf_df)

*** Transformed Dataset ***
    Outlook  Temperature  Humidity   Wind Play
0         1           85        85  False   No
1         1           80        90   True   No
2         2           83        78  False  Yes
3         3           70        96  False  Yes
4         3           68        80  False  Yes
5         3           65        70   True   No
6         2           64        65   True  Yes
7         1           72        95  False   No
8         1           69        70  False  Yes
9         3           75        80  False  Yes
10        1           72        75   True  Yes
11        2           74        90   True  Yes
12        2           86        70  False  Yes
13        3           70        95  False   No


In [5]:
# Hold out 30% of the rows as test data; the fixed random_state keeps the
# split the same on every run
train, test = train_test_split(golf_df, test_size=0.3, random_state=0)

# The first four columns are the features, the fifth (Play) is the label
train_features, train_label = train.iloc[:, 0:4], train.iloc[:, 4]
test_features, test_label = test.iloc[:, 0:4], test.iloc[:, 4]

# Create the Gaussian Naive Bayes model and train it on the training rows
clf = GaussianNB()
clf.fit(train_features, train_label)

# Put the expected label and the model's prediction side by side
test_data = pd.concat([test_features, test_label], axis=1).assign(
    prediction=clf.predict(test_features)
)

print("*** Test Results ***")
print(test_data)

*** Test Results ***
    Outlook  Temperature  Humidity   Wind Play prediction
8         1           69        70  False  Yes        Yes
6         2           64        65   True  Yes        Yes
4         3           68        80  False  Yes        Yes
11        2           74        90   True  Yes         No
2         2           83        78  False  Yes        Yes


In [6]:
# Score the fitted classifier on the held-out rows and report the accuracy
nb_accuracy = clf.score(test_features, test_label)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.8
