# Abalone_Case_Study

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reading the abalone data from the CSV file in the working directory
# and previewing its first five rows

abalone_csv = "abalone.csv"
df = pd.read_csv(abalone_csv)
df.head(5)

In [None]:
# Checking the shape of the dataset: (number of rows, number of columns)

df.shape

In [None]:
# Counting the missing (null) values in every column

missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Checking the data type of each column

df.dtypes

In [None]:
# Checking the distribution: summary statistics (count, mean, std, min,
# quartiles, max) of the numeric columns

df.describe()

The `describe` output shows that the minimum value of the **Height** column is **0**. A height of zero is not physically possible, so the Height column must contain some erroneous records.
 - How is that possible?
 - Let's check.

In [None]:
# Finding the rows whose Height is recorded as 0

zero_height_mask = df['Height'].eq(0)
df.loc[zero_height_mask]

In [None]:
# Dropping the invalid rows (Height == 0).
# The row labels are looked up from the data instead of being hard-coded, so
# the cell keeps working if the CSV changes and can be re-run safely: on a
# second run no row matches and nothing is dropped (no KeyError).

df = df.drop(index=df.index[df['Height'] == 0])

In [None]:
# Re-checking the shape of the dataset to confirm that the rows were dropped

df.shape

In [None]:
# Counting how many rows belong to each sex category: M, F and I (infant)

df['Sex'].value_counts()

In [None]:
# Encoding the alphabetical 'Sex' categories as integer codes.
# The codes follow the sorted order of the labels, which the fitted encoder
# keeps in le.classes_.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['Sex'])
df['Sex'] = le.transform(df['Sex'])

In [None]:
# Previewing the last five rows to inspect the encoded 'Sex' column
df.tail()

In [None]:
# Casting the 'Sex' and 'Rings' columns to float in a single astype call

df = df.astype({'Sex': 'float', 'Rings': 'float'})

In [None]:
# Re-checking the data types after the conversion to float

df.dtypes

# Correlation between input variables

In [None]:
# Listing the column names of the dataset

df.columns

In [None]:
# Computing the pairwise correlation matrix of the columns

df.corr()

In [None]:
# Checking the dependencies between the columns using a heatmap.
# The heatmap is drawn from the correlation matrix (not from the raw rows),
# so every cell shows the correlation coefficient between two columns.

heatmap_columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
                   'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

plt.figure(figsize=(22,20))
# annot=True writes the coefficient into each cell; the colour scale is fixed
# to the full correlation range [-1, 1]
sns.heatmap(df[heatmap_columns].corr(), annot=True, vmin=-1, vmax=1)
plt.title('Correlation between the columns', fontsize=20)
plt.show()

In [None]:
# Rings vs Length and Sex
# The columns are given by name through data=/x=/y=/hue= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.boxenplot(data=df, x='Rings', y='Length', hue='Sex', palette='Set1')
plt.title('Rings VS Length and Sex', fontsize = 20)
plt.show()

In [None]:
# Rings vs Diameter and Sex
# The columns are given by name through data=/x=/y=/hue= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.boxenplot(data=df, x='Rings', y='Diameter', hue='Sex', palette='Set1')
plt.title('Rings VS Diameter and Sex', fontsize = 20)
plt.show()

In [None]:
# Rings vs Height and Sex
# The columns are given by name through data=/x=/y=/hue= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.boxenplot(data=df, x='Rings', y='Height', hue='Sex', palette='Set1')
plt.title('Rings VS Height and Sex', fontsize = 20)
plt.show()

In [None]:
# Rings vs Whole weight
# The columns are given by name through data=/x=/y= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.swarmplot(data=df, x='Rings', y='Whole weight')
plt.title('Rings VS Whole weight', fontsize=20)
plt.show()

In [None]:
# Rings vs Shucked weight
# The columns are given by name through data=/x=/y= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.swarmplot(data=df, x='Rings', y='Shucked weight', palette='dark')
plt.title('Rings VS Shucked weight', fontsize=20)
plt.show()

In [None]:
# Rings vs Viscera weight
# The columns are given by name through data=/x=/y= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.stripplot(data=df, x='Rings', y='Viscera weight')
plt.title('Rings VS Viscera weight', fontsize=20)
plt.show()

In [None]:
# Rings vs Shell weight (scatter plot with a fitted regression line)
# The columns are given by name through data=/x=/y= keywords:
# seaborn >= 0.12 no longer accepts x and y as positional arguments.

plt.figure(figsize=(18,9))
sns.regplot(data=df, x='Rings', y='Shell weight')
plt.title('Rings VS Shell weight', fontsize=20)
plt.show()

In [None]:
# Separating the feature columns (x) from the target column 'Rings' (y)

x = df.drop(columns=['Rings'])
y = df['Rings']

In [None]:
# Splitting the dataset into a training set (80%) and a test set (20%)

from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Checking the shape after split
print("Shape of x_Train :", x_train.shape)
print("Shape of x_Test :", x_test.shape)
print("Shape of y_Train :", y_train.shape)
# This line reports y_test, so it is labelled y_Test (it used to say x_Test)
print("Shape of y_Test :", y_test.shape)

In [None]:
# Modelling
# Random forest regression
# 'Rings' is a numeric target and the model is scored with RMSE and R2, so a
# regressor is fitted here rather than a classifier.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fixed seed so the reported scores are reproducible between runs
model = RandomForestRegressor(random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Evaluation

# RMSE is the square root of the mean squared error, in the units of 'Rings'
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE :", rmse)

# R2 score
r2 = r2_score(y_test, y_pred)
print("R2 Score :", r2)

In [None]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

# Train and evaluate one classifier per hyperparameter value K (3, then 5).
# Each pass prints the score on the test set; after the loop `knn` holds the
# last classifier that was fitted (K=5).
for k in (3, 5):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    print(knn.score(x_test, y_test))

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Initializing classifier with one-v-rest approach.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated in
# recent scikit-learn releases; wrapping the estimator in OneVsRestClassifier
# is the documented replacement.
# max_iter is raised above the default of 100 because warnings are silenced at
# the top of this notebook, so a solver that stops before converging would go
# unnoticed.
logr = OneVsRestClassifier(LogisticRegression(random_state=10, max_iter=1000))
# Training classifier
logr.fit(x_train, y_train)
# Evaluate the classifier
print(logr.score(x_test, y_test))

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Build a tree limited to a depth of 6 with a fixed seed, then train it
dtc = DecisionTreeClassifier(max_depth=6, random_state=8)
dtc.fit(x_train, y_train)

# Score the trained tree on the test set and print the result
dtc_score = dtc.score(x_test, y_test)
print(dtc_score)

# Thank you