In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
data_path = 'Fish.csv'
data = pd.read_csv(data_path)

Exploring data

In [3]:
print(data.head())

  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


In [4]:
print(data.isnull().sum())

Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64


Data Preprocessing

In [5]:
Y = data['Species']
data = data.drop(['Species'], axis=1)


names= data.columns

from sklearn.preprocessing import Normalizer

norms= Normalizer().fit(data)
data_norms=norms.transform(data)
data_norms= np.asarray(data_norms)

nomred_data = pd.DataFrame(data_norms)

nomred_data

Unnamed: 0,0,1,2,3,4,5
0,0.981470,0.094091,0.103014,0.121670,0.046721,0.016304
1,0.985942,0.081595,0.089415,0.106074,0.042429,0.014638
2,0.989711,0.069571,0.077139,0.090529,0.036031,0.013670
3,0.989406,0.071684,0.079043,0.091309,0.034697,0.012144
4,0.992301,0.061153,0.066923,0.078461,0.028717,0.011848
...,...,...,...,...,...,...
154,0.491631,0.463423,0.491631,0.539988,0.084238,0.056159
155,0.521604,0.455430,0.482678,0.525497,0.094589,0.049397
156,0.474363,0.470475,0.505469,0.536574,0.088535,0.048828
157,0.619759,0.415270,0.449875,0.478189,0.090378,0.065034


In [6]:
nomred_data.columns =names

print(nomred_data)

       Weight   Length1   Length2   Length3    Height     Width
0    0.981470  0.094091  0.103014  0.121670  0.046721  0.016304
1    0.985942  0.081595  0.089415  0.106074  0.042429  0.014638
2    0.989711  0.069571  0.077139  0.090529  0.036031  0.013670
3    0.989406  0.071684  0.079043  0.091309  0.034697  0.012144
4    0.992301  0.061153  0.066923  0.078461  0.028717  0.011848
..        ...       ...       ...       ...       ...       ...
154  0.491631  0.463423  0.491631  0.539988  0.084238  0.056159
155  0.521604  0.455430  0.482678  0.525497  0.094589  0.049397
156  0.474363  0.470475  0.505469  0.536574  0.088535  0.048828
157  0.619759  0.415270  0.449875  0.478189  0.090378  0.065034
158  0.603857  0.418755  0.455169  0.491582  0.088976  0.057024

[159 rows x 6 columns]


In [7]:
Y= Y.replace({"Bream": 0, "Roach": 1,
              "Whitefish":2, "Parkki":3, 
              "Perch":4, "Pike":5, "Smelt":6})

print(Y.unique())

[0 1 2 3 4 5 6]


In [8]:
data_prepared = pd.concat([nomred_data,Y], axis=1)

In [9]:
print(data_prepared)

       Weight   Length1   Length2   Length3    Height     Width  Species
0    0.981470  0.094091  0.103014  0.121670  0.046721  0.016304        0
1    0.985942  0.081595  0.089415  0.106074  0.042429  0.014638        0
2    0.989711  0.069571  0.077139  0.090529  0.036031  0.013670        0
3    0.989406  0.071684  0.079043  0.091309  0.034697  0.012144        0
4    0.992301  0.061153  0.066923  0.078461  0.028717  0.011848        0
..        ...       ...       ...       ...       ...       ...      ...
154  0.491631  0.463423  0.491631  0.539988  0.084238  0.056159        6
155  0.521604  0.455430  0.482678  0.525497  0.094589  0.049397        6
156  0.474363  0.470475  0.505469  0.536574  0.088535  0.048828        6
157  0.619759  0.415270  0.449875  0.478189  0.090378  0.065034        6
158  0.603857  0.418755  0.455169  0.491582  0.088976  0.057024        6

[159 rows x 7 columns]


Droping outliers

In [10]:
print(data_prepared['Width'].max())

0.09107570235510182


Get the index of the outlier and drop

In [11]:
data_4_cat= data_prepared.loc[data_prepared.Species==4]
print(data_4_cat['Width'].idxmax())
indx_to_drop = data_4_cat['Width'].idxmax()
data_prepared =data_prepared.drop([indx_to_drop], axis=0)

data_1_cat= data_prepared.loc[data_prepared.Species==1]
print(data_1_cat['Width'].idxmax())
indx_to_drop = data_1_cat['Width'].idxmax()
data_prepared =data_prepared.drop([indx_to_drop], axis=0)

data_1_cat= data_prepared.loc[data_prepared.Species==1]
print(data_1_cat['Width'].idxmax())
indx_to_drop = data_1_cat['Width'].idxmax()
data_prepared =data_prepared.drop([indx_to_drop], axis=0)


72
40
35


Preparing the model and the data for training

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [13]:
SVC = SVC()

In [14]:
parameters = {'C': [10,100,1000], 'gamma': [4,5,6,7],
              'kernel': ['rbf', 'poly', 'sigmoid']}
grid_search = GridSearchCV(SVC, parameters, n_jobs=20, cv=6, refit=True,verbose=10)


In [15]:
Y = data_prepared['Species']
X= data_prepared.drop(['Species'], axis=1)

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=40, test_size=0.3)

In [17]:
grid_search.fit(X_train, Y_train)

Fitting 6 folds for each of 36 candidates, totalling 216 fits




In [18]:
print("Best parameters for SVC Clasiifier",grid_search.best_estimator_)

Best parameters for SVC Clasiifier SVC(C=1000, gamma=5, kernel='poly')


Evaluate the model

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
preds = grid_search.predict(X_test)

In [21]:
score= accuracy_score(Y_test, preds)
print("_Accuracy = %",score*100)

_Accuracy = % 97.87234042553192


Exporting Model pkl file 

In [24]:
import pickle

# Assuming you have a trained model named 'grid_search'
# Save the model to a pickle file
with open('FISH_CLASSIFICATION_MODEL_SVC.pkl', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

Testing if the exported model file works fine

In [25]:
import pickle
import numpy as np


# Load the model from the pickle file
with open('FISH_CLASSIFICATION_MODEL_SVC.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

input_data = np.array(['0.995553','0.049922','0.053385','0.056703','0.015253','0.009186'])
input_data = input_data.reshape(1, -1)


predictions = loaded_model.predict(input_data)

# Print or use the predictions as needed
print(predictions)


[4]


