In [62]:
# We import the required libraries
import pandas as pd

In [63]:
# Load the citrus measurements from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='citrus.csv')

In [64]:
# Preview the first five rows
df.head(5)

Unnamed: 0,name,diameter,weight,red,green,blue
0,orange,2.96,86.76,172,85,2
1,orange,3.91,88.05,166,78,3
2,orange,4.42,95.17,156,81,2
3,orange,4.47,95.6,163,81,4
4,orange,4.48,95.76,161,72,9


In [65]:
# Preview the last five rows
df.tail(5)

Unnamed: 0,name,diameter,weight,red,green,blue
9995,grapefruit,15.35,253.89,149,77,20
9996,grapefruit,15.41,254.67,148,68,7
9997,grapefruit,15.59,256.5,168,82,20
9998,grapefruit,15.92,260.14,142,72,11
9999,grapefruit,16.45,261.51,152,74,2


In [66]:
# Dimensions of the data as (rows, columns); a bare last expression is
# displayed by the notebook, so no print() wrapper is needed
df.shape

(10000, 6)


In [68]:
# Encode the text labels in 'name' as integer codes so the classifier can use them.
# pd.factorize numbers the labels in order of first appearance and also returns the
# original labels; keep them in name_labels so that a code can be translated back
# to its fruit name with name_labels[code].
name_codes, name_labels = pd.factorize(df['name'])
df['name'] = name_codes
df

Unnamed: 0,name,diameter,weight,red,green,blue
0,0,2.96,86.76,172,85,2
1,0,3.91,88.05,166,78,3
2,0,4.42,95.17,156,81,2
3,0,4.47,95.60,163,81,4
4,0,4.48,95.76,161,72,9
...,...,...,...,...,...,...
9995,1,15.35,253.89,149,77,20
9996,1,15.41,254.67,148,68,7
9997,1,15.59,256.50,168,82,20
9998,1,15.92,260.14,142,72,11


In [69]:
# The encoded fruit label is what the model will learn to predict,
# so it becomes the target (y)
target = df['name']
target

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: name, Length: 10000, dtype: int64

In [70]:
# Every column except the label 'name' forms the set of independent variables (X).
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the following cells refer to it.
input = df.drop(columns='name')

In [71]:
# Preview the first five rows of the independent variables
input.head(5)

Unnamed: 0,diameter,weight,red,green,blue
0,2.96,86.76,172,85,2
1,3.91,88.05,166,78,3
2,4.42,95.17,156,81,2
3,4.47,95.6,163,81,4
4,4.48,95.76,161,72,9


##### We can clearly see that the independent variables (X) and the target variable (Y) are now separated

In [72]:
# List the independent-variable columns that contain at least one NaN
# (an empty Index means nothing is missing)
has_missing = input.isna().any()
input.columns[has_missing]

Index([], dtype='object')

##### This shows that we do not have missing values amongst our independent variables

In [73]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split -- and every score and prediction derived from it -- is the same on each
# run; re-run the cells below to refresh their recorded outputs.
x_train, x_test, y_train, y_test = train_test_split(
    input, target, test_size=0.3, random_state=42
)

In [74]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier with its default settings
model = GaussianNB()

In [75]:
# Fit the classifier on the training portion of the data
model.fit(X=x_train, y=y_train)

GaussianNB()

In [76]:
# Mean accuracy of the trained model on the held-out test set
model.score(X=x_test, y=y_test)

0.9266666666666666

In [None]:
## A score of 0.92666 means the model correctly classified about 92.7% of the test samples

In [77]:
# True labels of the first 10 test samples
y_test.head(10)

8909    1
5634    1
6824    1
1067    0
8809    1
6307    1
1844    0
9848    1
1111    0
6061    1
Name: name, dtype: int64

In [79]:
# Independent variables of the same first 10 test samples
x_test.head(10)

Unnamed: 0,diameter,weight,red,green,blue
8909,12.46,212.11,153,77,2
5634,10.05,175.11,158,57,12
6824,11.03,190.71,151,74,11
1067,7.5,138.09,158,80,2
8809,12.38,210.89,135,74,18
6307,10.7,184.99,155,55,13
1844,8.07,146.65,170,88,7
9848,13.79,233.56,143,73,15
1111,7.53,138.62,145,78,23
6061,10.51,181.86,162,83,2


In [78]:
# Predicted labels for the first 10 test samples
model.predict(x_test.head(10))

array([1, 1, 1, 0, 1, 1, 0, 1, 0, 0], dtype=int64)

#### Comparing the predictions above with the first 10 values of `y_test`, we see that the model predicted 9 of the 10 fruits correctly; only the last sample, a grapefruit (label 1), was misclassified as an orange (label 0). To see how confident the model was in each prediction, we use the probability function, which gives the probability of each sample being an orange or a grapefruit

In [81]:
# Class probabilities for the first 10 test samples
# (column 0 = label 0 / orange, column 1 = label 1 / grapefruit)
model.predict_proba(x_test.head(10))

array([[3.00648241e-04, 9.99699352e-01],
       [1.21918473e-01, 8.78081527e-01],
       [1.75445021e-02, 9.82455498e-01],
       [9.99985233e-01, 1.47674107e-05],
       [1.07920855e-05, 9.99989208e-01],
       [6.71868566e-03, 9.93281314e-01],
       [9.99964053e-01, 3.59465924e-05],
       [1.28002898e-07, 9.99999872e-01],
       [9.98607573e-01, 1.39242697e-03],
       [6.35373595e-01, 3.64626405e-01]])