In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

In [2]:
# openpyxl is the engine pandas needs to read .xlsx workbooks.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install openpyxl

data_path = "../input/date-fruit-datasets/Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx"
data = pd.read_excel(data_path)

# Shuffle the rows once with a fixed seed so a Restart & Run All reproduces the
# same order, then rebuild a clean 0..n-1 index. Keeping the shuffled labels
# would make label-based (.loc) and position-based indexing disagree later on.
# `data` itself is left untouched: sample() returns a new frame.
SHUFFLE_SEED = 0
train = data.sample(frac=1.0, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [3]:
# Summary statistics are computed first; info() then prints the schema to
# stdout and returns None, so the displayed tuple is (statistics, None).
summary_stats = train.describe()
schema_report = train.info()
summary_stats, schema_report

In [4]:
# Map the string labels in "Class" to integer codes stored in a new "target"
# column. The fitted encoder stays available as `le` for inverse lookups.
le = LabelEncoder().fit(train["Class"])
train["target"] = le.transform(train["Class"])

In [5]:
# Number of missing values per column.
train.isna().sum()

In [6]:
# Heatmap of absolute pairwise correlations.
# Only numeric columns are correlated: the raw string "Class" column cannot be,
# and leaving it in makes DataFrame.corr() raise on pandas >= 2.0 (older
# versions silently dropped it). The encoded "target" column is still included.
f, ax = plt.subplots(1, 1, figsize=(25, 20))
corr = train.select_dtypes(include="number").corr().abs()
plt.title('Annoteat cell with numeric value', fontsize=5)
sns.heatmap(corr, ax=ax, annot=True, fmt='.1f')

# Preprocessing

In [7]:
from sklearn.model_selection import StratifiedKFold

# Tag every row with the index (0-4) of the stratified fold in which it serves
# as validation data; -1 marks "not yet assigned".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
train["fold"] = -1

# skf.split() yields *positional* row indices. `train` was shuffled, so its
# index labels need not match row positions; writing through .loc would put
# the fold ids on the wrong rows and break stratification. Write by position.
fold_col = train.columns.get_loc("fold")
for i, (_, val_idx) in enumerate(skf.split(train, train.target)):
    train.iloc[val_idx, fold_col] = i

In [8]:
# Fold 4 is set aside as the hold-out split; every other fold goes to training.
holdout_fold = 4
df_train = train.loc[train["fold"] != holdout_fold]
df_test = train.loc[train["fold"] == holdout_fold]

In [9]:
# Feature matrix / label vector used by every model below.
# Everything except the raw label, its integer encoding and the fold
# bookkeeping column is treated as a feature (original column order is kept).
non_feature_columns = ("Class", "fold", "target")
sel_features = [name for name in train.columns if name not in non_feature_columns]
X = train.loc[:, sel_features].values
y = train["target"]

# Model

In [10]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [11]:
# Candidate models compared by cross-validation. The order matters: it must
# match the list of algorithm names used when the scores are tabulated.
random_state = 2
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(
        DecisionTreeClassifier(random_state=random_state),
        random_state=random_state,
        learning_rate=0.1,
    ),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state=random_state),
    LinearDiscriminantAnalysis(),
    xgb.XGBClassifier(),
]


In [12]:
# 5-fold stratified accuracy for every candidate, one score array per model,
# in the same order as `classifiers`.
cv = StratifiedKFold(n_splits=5)
cv_results = [
    cross_val_score(model, X=X, y=y, scoring="accuracy", cv=cv, n_jobs=4)
    for model in classifiers
]

In [13]:
# Collapse each model's per-fold scores into a mean and a standard deviation.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

In [14]:
# Tabulate and plot the cross-validation results.
# The algorithm names are listed in the same order as the `classifiers` list.
cv_res = pd.DataFrame({
    "CrossValMeans": cv_means,
    "CrossValerrors": cv_std,
    "Algorithm": [
        "SVC", "DecisionTree", "AdaBoost", "RandomForest", "ExtraTrees",
        "GradientBoosting", "MultipleLayerPerceptron", "KNeighboors",
        "LogisticRegression", "LinearDiscriminantAnalysis", "XGBoost",
    ],
})

f, ax = plt.subplots(1, 1, figsize=(15, 7))
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The former placeholder axis labels were dropped because
# barplot overwrites them with the column names anyway.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res, palette="Set3",
                orient="h", xerr=cv_std, ax=ax)
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")
print(cv_res)

In [15]:
# Random forest tuning.
# The estimator is seeded with the notebook-wide `random_state` so that the
# selected hyper-parameters and best score are reproducible across re-runs.
RFC = RandomForestClassifier(random_state=random_state)

## Search grid for optimal parameters
rf_param_grid = {"max_depth": [None],
                 "max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=cv, scoring="accuracy", n_jobs=4, verbose=1)
gsRFC.fit(X, y)
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_

In [16]:
# Gradient boosting tuning.
# Seeded with the notebook-wide `random_state` for reproducible results.
GBC = GradientBoostingClassifier(random_state=random_state)

# The `loss` entry was removed from the grid on purpose: its only value,
# "deviance", was renamed to "log_loss" and is rejected by scikit-learn >= 1.3.
# It is the estimator's default loss under either name, so leaving it out
# searches the same space on every scikit-learn version.
gb_param_grid = {'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]
                 }

gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=cv, scoring="accuracy", n_jobs=4, verbose=1)
gsGBC.fit(X, y)
GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

In [17]:
# LinearDiscriminantAnalysis tuning
LDA = LinearDiscriminantAnalysis()
ld_param_grid = {"solver": ["svd"],
                 "tol": [0.0001, 0.0002, 0.0003]}

gsLDC = GridSearchCV(LDA, param_grid=ld_param_grid, cv=cv, scoring="accuracy", n_jobs=4, verbose=1)
gsLDC.fit(X, y)
# Take the best estimator from the LDA search itself. This previously read
# gsGBC, so "LDC_best" was silently the gradient-boosting model.
# NOTE: LinearDiscriminantAnalysis has no `feature_importances_` attribute (it
# exposes `coef_`), so any importance plot fed with LDC_best must read the
# coefficients instead.
LDC_best = gsLDC.best_estimator_

# Best score
gsLDC.best_score_

In [18]:
# XGBoost
clf = xgb.XGBClassifier()

# GridSearchCV requires a parameter grid; calling it without one raises a
# TypeError. An empty grid evaluates exactly one candidate, the estimator with
# its default hyper-parameters, which is what the call without a grid was
# aiming for. Add entries here (e.g. "max_depth", "n_estimators") to actually
# search.
xgb_param_grid = {}

gsXGBC = GridSearchCV(clf, param_grid=xgb_param_grid, cv=cv, scoring="accuracy", n_jobs=4, verbose=1)

gsXGBC.fit(X, y)
XGBC_best = gsXGBC.best_estimator_

# Best score
gsXGBC.best_score_

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training-set size.

    A new matplotlib figure is opened and `learning_curve` is run on
    `estimator` with the given `X`, `y`, `cv`, `n_jobs` and `train_sizes`.
    For each of the two score sets, the mean over the folds is drawn as a
    line with markers, surrounded by a translucent band of one standard
    deviation on either side.

    Parameters:
        estimator: model passed through to `learning_curve`.
        title: figure title.
        X, y: features and labels passed through to `learning_curve`.
        ylim: optional (low, high) pair for the y axis; left automatic if None.
        cv: cross-validation strategy passed through to `learning_curve`.
        n_jobs: parallelism passed through to `learning_curve`.
        train_sizes: training-set sizes passed through to `learning_curve`.

    Returns:
        The `matplotlib.pyplot` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (mean, std, colour, legend label) per curve; statistics are taken
    # across the folds (axis 1) for every training-set size.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    # Bands first, lines second, so the drawing order matches band/band/line/line.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# One learning-curve figure per tuned model. Only searches that are actually
# fitted in this notebook are listed; the former commented-out calls pointed at
# searches (ExtraTrees, SVC, AdaBoost) that are never created here.
learning_curve_targets = [
    (gsRFC, "RF learning curves"),
    (gsGBC, "GradientBoosting learning curves"),
    (gsLDC, "LinearDiscriminantAnalysis learning curves"),
    (gsXGBC, "XGBoost learning curves"),
]
for search, curve_title in learning_curve_targets:
    g = plot_learning_curve(search.best_estimator_, curve_title, X, y, cv=cv)

In [None]:
# 2x2 grid of horizontal bar charts: the (up to) 40 highest-ranked features of
# each tuned model.
nrows = ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex="all", figsize=(15, 15))

names_classifiers = [("LinearDiscriminant", LDC_best), ("RandomForest", RFC_best), ("GradientBoosting", GBC_best), ("XGBoost", XGBC_best)]

# Label bars with the columns the models were trained on (`sel_features`), not
# with `train.columns`, which also holds the label / fold bookkeeping columns.
feature_names = np.asarray(sel_features)

# axes.flat walks the grid row by row, the same order as a nested row/col loop.
for ax, (name, classifier) in zip(axes.flat, names_classifiers):
    if hasattr(classifier, "feature_importances_"):
        importances = np.asarray(classifier.feature_importances_)
    else:
        # Linear models such as LDA expose per-class coefficients instead of
        # importances; use the mean absolute coefficient per feature as a
        # stand-in. NOTE(review): coefficient size depends on feature scale,
        # so this ranking is only indicative on unscaled inputs.
        importances = np.abs(np.atleast_2d(classifier.coef_)).mean(axis=0)

    indices = np.argsort(importances)[::-1][:40]
    g = sns.barplot(y=feature_names[indices], x=importances[indices], orient='h', ax=ax)
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")