# Classify Raisins with hyperparameter tunning project


## Data set exploration


In [6]:
# 1. Setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

raisins = pd.read_excel(r"C:\Programowanie\codecademy_projects\data_sets_codecademy\Raisin_Dataset\Raisin_Dataset.xlsx")

raisins.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [7]:
# 2. Create predictor and target variables, X and y
X = raisins.drop('Class', axis=1)
y = raisins['Class']

In [8]:
# 3. Examine the dataset
print("Number of features:", X.shape[1])
print("Total number of samples:", len(y))
print("Samples belonging to class '1':", y.sum())

Number of features: 7
Total number of samples: 900
Samples belonging to class '1': KecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimenKecimen

In [9]:
# 4. Split the data set into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 19)

In [10]:
# 5. Create a Decision Tree model
tree = DecisionTreeClassifier()

In [11]:
# 6. Dictionary of parameters for GridSearchCV
parameters = {'min_samples_split': [2,3,4], 'max_depth': [3,5,7]}

In [12]:
# 7. Create a GridSearchCV model
grid = GridSearchCV(tree, parameters)

#Fit the GridSearchCV model to the training data
grid.fit(X_train, y_train)

In [13]:
# 8. Print the model and hyperparameters obtained by GridSearchCV
print(grid.best_estimator_)

# Print best score
print(grid.best_score_)
# Print the accuracy of the final model on the test data
print(grid.score(X_test, y_test))

DecisionTreeClassifier(max_depth=5, min_samples_split=3)
0.8711111111111112
0.8133333333333334


In [14]:
# 9. Print a table summarizing the results of GridSearchCV
df = pd.concat([pd.DataFrame(grid.cv_results_['params']), pd.DataFrame(grid.cv_results_['mean_test_score'], columns=['Score'])], axis=1)
print(df)

   max_depth  min_samples_split     Score
0          3                  2  0.860741
1          3                  3  0.859259
2          3                  4  0.860741
3          5                  2  0.865185
4          5                  3  0.871111
5          5                  4  0.862222
6          7                  2  0.847407
7          7                  3  0.850370
8          7                  4  0.840000


In [15]:
# 10. The logistic regression model
lr = LogisticRegression(solver = 'liblinear', max_iter = 1000)

In [16]:
# 11. Define distributions to choose hyperparameters from
from scipy.stats import uniform
distributions = {'penalty': ['l1', 'l2'], 'C': uniform(loc=0, scale=100)}

In [17]:
# 12. Create a RandomizedSearchCV model
clf = RandomizedSearchCV(lr, distributions, n_iter=8)

# Fit the random search model
clf.fit(X_train, y_train)

In [18]:
# 13. Print best esimatore and best score
print(clf.best_estimator_)
print (clf.best_score_)

#Print a table summarizing the results of RandomSearchCV
df = pd.concat([pd.DataFrame(clf.cv_results_['params']), pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Accuracy'])] ,axis=1)
print(df.sort_values('Accuracy', ascending = False))

LogisticRegression(C=46.36736628866094, max_iter=1000, penalty='l1',
                   solver='liblinear')
0.8755555555555556
           C penalty  Accuracy
1  46.367366      l1  0.875556
2  31.154816      l1  0.875556
7  37.257709      l1  0.875556
6  59.717281      l2  0.875556
0   4.765967      l2  0.874074
4  58.958249      l2  0.874074
5  96.636052      l1  0.874074
3   6.500340      l2  0.872593
