# Preparing the model

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Read the penguin measurements from the CSV that sits next to this notebook.
penguin_df = pd.read_csv(filepath_or_buffer='penguins.csv')

In [33]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ between runs).
penguin_df.sample(n=5)

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
160,161,Gentoo,Biscoe,43.3,13.4,209.0,4400.0,female,2007
320,321,Chinstrap,Dream,50.9,17.9,196.0,3675.0,female,2009
189,190,Gentoo,Biscoe,44.4,17.3,219.0,5250.0,male,2008
294,295,Chinstrap,Dream,46.4,18.6,190.0,3450.0,female,2007
28,29,Adelie,Biscoe,37.9,18.6,172.0,3150.0,female,2007


In [4]:
# (rows, columns) of the frame as read from the CSV.
penguin_df.shape

(344, 9)

In [5]:
# Per-column dtype and non-null count; columns with fewer non-null values than rows contain missing data.
penguin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


In [37]:
# True when at least one cell anywhere in the frame is missing.
penguin_df.isna().values.any()

True

In [38]:
# Number of missing values in each column.
penguin_df.isna().sum()

rowid                 0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [6]:
# Drop every row that has at least one missing value.
# Rebinding the returned frame (instead of inplace=True) is the form pandas recommends;
# the name stays `penguin_df` so the cells below keep working unchanged.
penguin_df = penguin_df.dropna()

In [7]:
# Prediction target: the species column of the cleaned frame.
output = penguin_df.loc[:, 'species']

In [8]:
# Predictor columns: two categorical (island, sex) and four numeric measurements.
features = penguin_df[
    [
        'island',
        'bill_length_mm',
        'bill_depth_mm',
        'flipper_length_mm',
        'body_mass_g',
        'sex',
    ]
]

In [9]:
# One-hot encode the string-valued columns so every feature is numeric/boolean for the classifier.
features = pd.get_dummies(data=features)

In [10]:
# Encode the species labels as integer codes for the classifier.
# `output` receives the codes and `uniques` the distinct labels, so uniques[code] recovers the species name.
# Factorize from the dataframe column rather than from `output` itself: re-running this cell
# would otherwise factorize the integer codes and replace the species names in `uniques` with 0, 1, 2.
output, uniques = pd.factorize(penguin_df['species'])

In [12]:
# Species labels in code order: integer i in `output` stands for uniques[i].
uniques

Index(['Adelie', 'Gentoo', 'Chinstrap'], dtype='object')

In [13]:
# Displays the same label index again (this cell repeats the previous one).
uniques

Index(['Adelie', 'Gentoo', 'Chinstrap'], dtype='object')

In [13]:
# Hold out 30% of the rows for evaluation.
# random_state fixes the shuffle so the split, and therefore the accuracy computed below,
# is the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    features, output, test_size=0.3, random_state=15
)

In [14]:
# Random forest with default hyperparameters; random_state pins the estimator's internal randomness.
rfc = RandomForestClassifier(random_state=15)

In [15]:
# Train the forest on the training split; fit returns the estimator, which the notebook displays.
rfc.fit(X=x_train, y=y_train)

RandomForestClassifier(random_state=15)

In [16]:
# Predicted species codes for the held-out rows.
y_pred = rfc.predict(X=x_test)

In [17]:
# accuracy_score's signature is (y_true, y_pred): ground-truth labels first, predictions second.
# Accuracy is symmetric, so the value is unchanged, but the documented order keeps this
# consistent with other sklearn metrics where the order does matter.
score = accuracy_score(y_test, y_pred)

In [18]:
# Report the fraction of held-out rows that were classified correctly.
accuracy_message = 'Our accuracy score for this model is {}'.format(score)
print(accuracy_message)

Our accuracy score for this model is 0.99


# Saving the model

In [19]:
# Serialize the trained classifier to disk.
# The context manager closes the file even if pickle.dump raises, which the
# separate open / dump / close cells did not guarantee.
with open('random_forest_penguin.pickle', 'wb') as rf_pickle:
    pickle.dump(rfc, rf_pickle)

In [22]:
# Serialize the species label index so integer predictions can be mapped back to names later.
# The context manager closes the file even if pickle.dump raises.
with open('output_penguin.pickle', 'wb') as output_pickle:
    pickle.dump(uniques, output_pickle)

In [29]:
# Round-trip check: load both artifacts back from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you produced yourself
# (as here) or otherwise trust.
# The context managers close both files even if unpickling raises.
with open('random_forest_penguin.pickle', 'rb') as rf_pickle:
    rfc = pickle.load(rf_pickle)

with open('output_penguin.pickle', 'rb') as map_pickle:
    unique_penguin_mapping = pickle.load(map_pickle)

In [30]:
# Sanity check: show the estimator that was restored from the pickle file.
rfc

RandomForestClassifier(random_state=15)

In [31]:
# Sanity check: show the species label index that was restored from the pickle file.
unique_penguin_mapping

Index(['Adelie', 'Gentoo', 'Chinstrap'], dtype='object')