In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the train and test datasets
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train.head(10)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8
5,5,M,1.5,1.175,0.4125,28.845616,13.409313,6.789705,7.93786,10
6,6,M,1.575,1.1375,0.35,30.02212,11.93514,7.342521,8.646598,11
7,7,I,1.3125,1.025,0.35,18.299602,8.249704,3.898056,5.6699,11
8,8,F,1.6,1.2875,0.4375,38.82464,16.967176,7.413394,10.77281,12
9,9,M,1.025,0.7625,0.2625,10.305043,4.493396,2.126212,2.976698,11


In [3]:
# Drop the id column since we will not need it

df_train = df_train.drop("id", axis=1)
df_test = df_test.drop("id", axis=1)

In [4]:
# Get info of the data

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74051 entries, 0 to 74050
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             74051 non-null  object 
 1   Length          74051 non-null  float64
 2   Diameter        74051 non-null  float64
 3   Height          74051 non-null  float64
 4   Weight          74051 non-null  float64
 5   Shucked Weight  74051 non-null  float64
 6   Viscera Weight  74051 non-null  float64
 7   Shell Weight    74051 non-null  float64
 8   Age             74051 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 5.1+ MB


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49369 entries, 0 to 49368
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             49369 non-null  object 
 1   Length          49369 non-null  float64
 2   Diameter        49369 non-null  float64
 3   Height          49369 non-null  float64
 4   Weight          49369 non-null  float64
 5   Shucked Weight  49369 non-null  float64
 6   Viscera Weight  49369 non-null  float64
 7   Shell Weight    49369 non-null  float64
dtypes: float64(7), object(1)
memory usage: 3.0+ MB


In [6]:
# Check for null columns in the data
df_train.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

In [7]:
df_test.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
dtype: int64

1. Sex is an object and might need to be preprocessed to be used in the prediction
2. There are no missing data in the both train and test datasets

In [8]:
df_train["Sex"].unique()

array(['I', 'M', 'F'], dtype=object)

In [9]:
df_test["Sex"].unique()

array(['I', 'F', 'M'], dtype=object)

The unique values for sex in both train and test are the same 3 which bodes well

In [10]:
# Get data description to get a sense of if test and train are similar

df_train.describe(percentiles=[0.01, 0.99])

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0
mean,1.31746,1.024496,0.348089,23.385217,10.10427,5.058386,6.72387,9.967806
std,0.287757,0.237396,0.092034,12.648153,5.618025,2.792729,3.584372,3.175189
min,0.1875,0.1375,0.0,0.056699,0.028349,0.042524,0.042524,1.0
1%,0.5,0.3625,0.1125,1.020582,0.354369,0.226796,0.29767,4.0
50%,1.375,1.075,0.3625,23.799405,9.90815,4.989512,6.931453,10.0
99%,1.7875,1.4125,0.525,53.438808,24.309696,11.793392,15.677274,20.0
max,2.012815,1.6125,2.825,80.101512,42.184056,21.54562,28.491248,29.0


In [11]:
df_test.describe(percentiles=[0.01, 0.99])

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
count,49369.0,49369.0,49369.0,49369.0,49369.0,49369.0,49369.0
mean,1.319799,1.02636,0.348693,23.464739,10.130272,5.072647,6.745908
std,0.286948,0.236591,0.092259,12.6416,5.611734,2.791544,3.575621
min,0.1875,0.1375,0.0,0.056699,0.028349,0.014175,0.042524
1%,0.5,0.3625,0.1125,1.048931,0.368544,0.226796,0.29767
50%,1.3875,1.075,0.3625,23.81358,9.979024,4.989512,6.945627
99%,1.7875,1.4125,0.525,53.438808,24.385106,11.850091,15.733973
max,2.0375,1.625,2.825,80.101512,42.184056,21.54562,28.491248


It is possible height, weight, shucked weight, viscera weight and shell weight have outliers because the 99% percentile value is 50-75% lower than the max value.

Data visualization of these columns will help understand the data better

Positives
1. The mean and standard deviations of both train and test are similar
2. The min and max values are similar, identical in most features which bodes well for the modelling
3. The 1% percentile values are similar so if there are truly outliers it is at the upper bound of the data

In [12]:
# Check how many unique values each feature and the target have

df_train.nunique()

Sex                  3
Length             144
Diameter           122
Height              65
Weight            3096
Shucked Weight    1766
Viscera Weight     967
Shell Weight      1048
Age                 28
dtype: int64

In [13]:
df_test.nunique()

Sex                  3
Length             140
Diameter           122
Height              57
Weight            2948
Shucked Weight    1692
Viscera Weight     938
Shell Weight       987
dtype: int64

In [14]:
# Find the percentage of unique out the total data. The lower the more categorical the column might be

unique_percent_train = (df_train.nunique() / len(df_train)) * 100
unique_percent_train

Sex               0.004051
Length            0.194461
Diameter          0.164751
Height            0.087777
Weight            4.180902
Shucked Weight    2.384843
Viscera Weight    1.305857
Shell Weight      1.415241
Age               0.037812
dtype: float64

In [15]:
# Are there identical rows
df_train.loc[df_train.duplicated()]

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age


In [16]:
df_test.loc[df_test.duplicated()]

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight


In [17]:
# Combine train and test, drop the target column and get if there is any duplicated row
df = pd.concat([df_train, df_test])
df = df.drop("Age", axis=1)
df.loc[df.duplicated()]

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
49368,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928
