# Loading Data 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the original fruit measurements; the relative path is resolved against
# the kernel's working directory.
df = pd.read_csv("fruit.csv")
# Preview the first five rows to check the parsed columns.
df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [2]:
# Remove the fruit_subtype column. The result is assigned back instead of
# mutating with inplace=True, and errors="ignore" makes a second execution of
# this cell a no-op rather than a KeyError, so the cell is safe to re-run.
df = df.drop(columns=["fruit_subtype"], errors="ignore")

In [3]:
# Show the first five rows again to confirm which columns remain.
df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,mass,width,height,color_score
0,1,apple,192,8.4,7.3,0.55
1,1,apple,180,8.0,6.8,0.59
2,1,apple,176,7.4,7.2,0.6
3,2,mandarin,86,6.2,4.7,0.8
4,2,mandarin,84,6.0,4.6,0.79


Here we will use this data to predict the fruit label on the basis of mass, width and height.

In [4]:
# Select the features and the target by column name rather than by position.
# Positional slicing silently picks different columns if the CSV layout
# changes or if the fruit_subtype column has not been dropped yet.
x = df[["mass", "width", "height"]].values
y = df["fruit_label"].values
print(x)
y

[[192.    8.4   7.3]
 [180.    8.    6.8]
 [176.    7.4   7.2]
 [ 86.    6.2   4.7]
 [ 84.    6.    4.6]
 [ 80.    5.8   4.3]
 [ 80.    5.9   4.3]
 [ 76.    5.8   4. ]
 [178.    7.1   7.8]
 [172.    7.4   7. ]
 [166.    6.9   7.3]
 [172.    7.1   7.6]
 [154.    7.    7.1]
 [164.    7.3   7.7]
 [152.    7.6   7.3]
 [156.    7.7   7.1]
 [156.    7.6   7.5]
 [168.    7.5   7.6]
 [162.    7.5   7.1]
 [162.    7.4   7.2]
 [160.    7.5   7.5]
 [156.    7.4   7.4]
 [140.    7.3   7.1]
 [170.    7.6   7.9]
 [342.    9.    9.4]
 [356.    9.2   9.2]
 [362.    9.6   9.2]
 [204.    7.5   9.2]
 [140.    6.7   7.1]
 [160.    7.    7.4]
 [158.    7.1   7.5]
 [210.    7.8   8. ]
 [164.    7.2   7. ]
 [190.    7.5   8.1]
 [142.    7.6   7.8]
 [150.    7.1   7.9]
 [160.    7.1   7.6]
 [154.    7.3   7.3]
 [158.    7.2   7.8]
 [144.    6.8   7.4]
 [154.    7.1   7.5]
 [180.    7.6   8.2]
 [154.    7.2   7.2]
 [194.    7.2  10.3]
 [200.    7.3  10.5]
 [186.    7.2   9.2]
 [216.    7.3  10.2]
 [196.    7.3

array([1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int64)

# Dividing data into train and test

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [6]:
# Report the dimensions of each piece of the split, one line per array.
for caption, part in (
    ("X train shape", x_train),
    ("Y train shape", y_train),
    ("X test shape", x_test),
    ("Y test shape", y_test),
):
    print(caption, part.shape)

X train shape (47, 3)
Y train shape (47,)
X test shape (12, 3)
Y test shape (12,)


# Using decision tree classifier algorithm

In [7]:
from sklearn import tree

# Fit a decision tree on the training split. random_state is pinned because
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can differ from one run to the next.
# min_samples_split=2 is the library default and is kept explicit here.
clf_2 = tree.DecisionTreeClassifier(min_samples_split=2, random_state=0)
clf_2.fit(x_train, y_train)

# Predicted labels for the held-out test rows.
pred_2 = clf_2.predict(x_test)

In [8]:
# Predict on the rows the tree was fitted on, so that training accuracy can be
# compared with test accuracy.
pred_train = clf_2.predict(X=x_train)
pred_train

array([3, 3, 3, 3, 4, 2, 1, 3, 4, 3, 3, 4, 1, 4, 3, 1, 2, 3, 1, 4, 1, 4,
       1, 1, 3, 1, 4, 4, 4, 3, 1, 1, 4, 3, 2, 1, 3, 1, 1, 1, 3, 4, 2, 1,
       4, 4, 4], dtype=int64)

# Checking accuracy

In [9]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Fraction of correctly predicted labels on the held-out rows and on the
# training rows.
print("Accuracy of test data", accuracy_score(y_true=y_test, y_pred=pred_2))
print("Accuracy on training data", accuracy_score(y_true=y_train, y_pred=pred_train))

Accuracy of test data 0.75
Accuracy on training data 1.0


A training accuracy of 1.0 (i.e. 100%) combined with a noticeably lower test accuracy (0.75) indicates that the model is overfitting the training data.

# Using an extended dataset to reduce overfitting

In [10]:
# Load the extended fruit dataset; this rebinds df, replacing the frame read
# from fruit.csv earlier in the notebook.
df = pd.read_csv("fruit_new.csv")
# Preview the first five rows to check the parsed columns.
df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [11]:
# (rows, columns) of the extended dataset loaded from fruit_new.csv.
df.shape

(79, 7)

In [12]:
# Remove the fruit_subtype column from the extended dataset. Assigning the
# result (instead of inplace=True) together with errors="ignore" means that
# re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=["fruit_subtype"], errors="ignore")

In [13]:
# Select the features and the target by column name rather than by position.
# Positional slicing silently picks different columns if the CSV layout
# changes or if the fruit_subtype column has not been dropped yet.
x = df[["mass", "width", "height"]].values
y = df["fruit_label"].values
print(x)
y

[[192.    8.4   7.3]
 [180.    8.    6.8]
 [176.    7.4   7.2]
 [ 86.    6.2   4.7]
 [ 84.    6.    4.6]
 [ 80.    5.8   4.3]
 [ 80.    5.9   4.3]
 [ 76.    5.8   4. ]
 [178.    7.1   7.8]
 [172.    7.4   7. ]
 [166.    6.9   7.3]
 [172.    7.1   7.6]
 [154.    7.    7.1]
 [164.    7.3   7.7]
 [152.    7.6   7.3]
 [156.    7.7   7.1]
 [156.    7.6   7.5]
 [168.    7.5   7.6]
 [162.    7.5   7.1]
 [162.    7.4   7.2]
 [160.    7.5   7.5]
 [156.    7.4   7.4]
 [140.    7.3   7.1]
 [170.    7.6   7.9]
 [342.    9.    9.4]
 [356.    9.2   9.2]
 [362.    9.6   9.2]
 [204.    7.5   9.2]
 [140.    6.7   7.1]
 [160.    7.    7.4]
 [158.    7.1   7.5]
 [210.    7.8   8. ]
 [164.    7.2   7. ]
 [190.    7.5   8.1]
 [142.    7.6   7.8]
 [150.    7.1   7.9]
 [160.    7.1   7.6]
 [154.    7.3   7.3]
 [158.    7.2   7.8]
 [144.    6.8   7.4]
 [154.    7.1   7.5]
 [180.    7.6   8.2]
 [154.    7.2   7.2]
 [194.    7.2  10.3]
 [200.    7.3  10.5]
 [186.    7.2   9.2]
 [216.    7.3  10.2]
 [196.    7.3

array([1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split

# Same 80/20 hold-out split as before, applied to the extended data; the
# fixed random_state keeps the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [15]:
# Report the dimensions of each piece of the split, one line per array.
for caption, part in (
    ("X train shape", x_train),
    ("Y train shape", y_train),
    ("X test shape", x_test),
    ("Y test shape", y_test),
):
    print(caption, part.shape)

X train shape (63, 3)
Y train shape (63,)
X test shape (16, 3)
Y test shape (16,)


In [16]:
from sklearn import tree

# Refit the decision tree on the extended training split. random_state is
# pinned because scikit-learn randomly permutes the candidate features at each
# split, so an unseeded tree can differ from one run to the next.
# min_samples_split=2 is the library default and is kept explicit here.
clf_2 = tree.DecisionTreeClassifier(min_samples_split=2, random_state=0)
clf_2.fit(x_train, y_train)

# Predicted labels for the held-out test rows.
pred_2 = clf_2.predict(x_test)

In [17]:
# Predict on the rows the tree was fitted on, so that training accuracy can be
# compared with test accuracy.
pred_train = clf_2.predict(X=x_train)
pred_train

array([2, 1, 2, 3, 3, 3, 3, 1, 4, 1, 2, 2, 2, 4, 1, 2, 3, 4, 3, 1, 4, 1,
       1, 3, 4, 2, 1, 3, 3, 1, 4, 1, 3, 3, 3, 1, 1, 1, 1, 2, 3, 1, 4, 1,
       1, 1, 1, 3, 3, 4, 3, 3, 4, 1, 3, 3, 1, 1, 1, 3, 3, 4, 4],
      dtype=int64)

In [18]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Fraction of correctly predicted labels on the held-out rows and on the
# training rows of the extended dataset.
print("Accuracy of test data", accuracy_score(y_true=y_test, y_pred=pred_2))
print("Accuracy on training data", accuracy_score(y_true=y_train, y_pred=pred_train))

Accuracy of test data 0.8125
Accuracy on training data 1.0
