In [1]:
# importing pandas library for dataframes
import pandas as pd

In [2]:
# Column labels for the crabs dataset, as documented at
# https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/crabs.html
col_names = [
    "sp", "sex", "index",          # two categorical columns and an index column
    "FL", "RW", "CL", "CW", "BD",  # five numeric columns
]

In [3]:
# Load the crabs dataset from a CSV file with pandas' read_csv, labelling the
# columns with col_names.
# NOTE(review): because names= is passed and header= is not, pandas reads the
# first line of the file as data -- this assumes the CSV has no header row; confirm.
data = pd.read_csv(
    "australian-crabs.csv",
    names=col_names,
)

In [4]:
# Preview the first five rows (head() returns 5 rows by default)
data.head()

Unnamed: 0,sp,sex,index,FL,RW,CL,CW,BD
0,Blue,Male,1,8.1,6.7,16.1,19.0,7.0
1,Blue,Male,2,8.8,7.7,18.1,20.8,7.4
2,Blue,Male,3,9.2,7.8,19.0,22.4,7.7
3,Blue,Male,4,9.6,7.9,20.1,23.1,8.2
4,Blue,Male,5,9.8,8.0,20.3,23.0,8.2


In [5]:
# Show how often each level of the categorical variables sp and sex occurs
for column in ("sp", "sex"):
    print(data[column].value_counts(), "\n")

sp
Blue      100
Orange    100
Name: count, dtype: int64 

sex
Male      100
Female    100
Name: count, dtype: int64 



In [6]:
# One-hot encode the dataset with pandas' get_dummies.
# Only the categorical columns (sp, sex) are expanded into indicator columns.
data_dummies = pd.get_dummies(data)

# List the column names after encoding (iterating a DataFrame yields its column
# labels); the continuous variables keep their original names.
list(data_dummies)

['index',
 'FL',
 'RW',
 'CL',
 'CW',
 'BD',
 'sp_Blue',
 'sp_Orange',
 'sex_Female',
 'sex_Male']

In [7]:
# Dimensions of the one-hot-encoded dataset as (number of rows, number of columns)
len(data_dummies), len(data_dummies.columns)

(200, 10)

In [8]:
# Imports used by this cell, gathered in one place
from sklearn.decomposition import NMF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate the inputs from the target: the columns FL through BD are the
# features, and the sp_Orange indicator (species) is the target variable.
features = data_dummies.loc[:, 'FL':'BD']
X = features.values
y = data_dummies['sp_Orange'].values

# Report the shapes of the feature matrix and the target vector
print(X.shape, y.shape)

# Split the dataset into training and test parts (fixed random_state so the
# split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build a Non-Negative Matrix Factorization model and fit it on the training
# data only.
# NOTE(review): n_components=15 is larger than the number of columns selected
# by 'FL':'BD' -- confirm that an over-complete factorization is intended.
nmf = NMF(n_components=15, random_state=0)
nmf.fit(X_train)

# Project both splits onto the components learned from the training data
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)

# Train a classifier on the transformed training data
classifier = LogisticRegression()
classifier.fit(X_train_nmf, y_train)

# Predict on both splits, then compute the accuracy of each
y_train_pred = classifier.predict(X_train_nmf)
y_test_pred = classifier.predict(X_test_nmf)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Display the results
for label, accuracy in (("Train Accuracy: ", train_accuracy),
                        ("Test Accuracy: ", test_accuracy)):
    print(label, accuracy)

(200, 5) (200,)
Train Accuracy:  1.0
Test Accuracy:  0.98


In [9]:
# Imports needed for model-based feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Model-based feature selection: SelectFromModel keeps every feature whose
# importance (taken from the wrapped random forest) is greater than or equal to
# the threshold; with threshold="median" the cut-off is the median importance.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
select = SelectFromModel(forest, threshold="median")

# Fit the selector on the training data, then reduce both splits to the
# selected features
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
X_test_l1 = select.transform(X_test)

# Print the shapes before and after selection
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))

# Performance of a logistic regression trained on the selected features
# (LogisticRegression is imported in an earlier cell)
logreg = LogisticRegression()
logreg.fit(X_train_l1, y_train)
score = logreg.score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))

X_train.shape: (150, 5)
X_train_l1.shape: (150, 3)
Test score: 1.000


In [12]:
# Iterative feature selection with recursive feature elimination (RFE).
# Everything this cell needs from scikit-learn is imported here so the cell does
# not depend on imports made in other cells.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Number of features RFE should keep. It must be smaller than the number of
# columns in X_train (5); asking for all 5 eliminates nothing, so the "selected"
# data would just be the original data. 3 matches the number of features kept by
# the model-based selection (threshold="median") earlier in this notebook.
N_FEATURES_TO_SELECT = 3

select = RFE(
    RandomForestClassifier(n_estimators=100, random_state=42),
    n_features_to_select=N_FEATURES_TO_SELECT,
)
select.fit(X_train, y_train)

# Reduce both splits to the features RFE kept
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)

# Print the accuracy of logistic regression trained on the RFE-selected features
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print("Test score: {:.3f}".format(score))

Test score: 1.000


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bf1e1982-939d-44bc-9921-4cc94f3c4e83' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>