In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
# Load the stars dataset and keep only complete rows: any row that has a
# missing value is discarded. No columns are removed here.
stars_csv = "data/stars.csv"
data = pd.read_csv(stars_csv).dropna()
data

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.002400,0.1700,16.12,0,Red,M
1,3042,0.000500,0.1542,16.60,0,Red,M
2,2600,0.000300,0.1020,18.70,0,Red,M
3,2800,0.000200,0.1600,16.65,0,Red,M
4,1939,0.000138,0.1030,20.06,0,Red,M
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,Blue,O
236,30839,834042.000000,1194.0000,-10.63,5,Blue,O
237,8829,537493.000000,1423.0000,-10.73,5,White,A
238,9235,404940.000000,1112.0000,-11.23,5,White,A


In [18]:
def factorize_objs(df: pd.DataFrame):
    """Integer-encode every text column of ``df`` in place.

    A column is encoded when its dtype is NumPy ``object`` or a pandas
    ``StringDtype``. Each such column is replaced by the integer codes
    produced by ``pd.factorize`` (codes follow order of first appearance),
    and the corresponding unique labels are printed.

    Labels are compared exactly as stored, so values that differ only in
    case, punctuation or surrounding whitespace receive different codes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    dict
        Maps each encoded column name to the ``Index`` of its unique labels;
        position ``i`` in that index is the label for code ``i``.
    """
    mappings = {}
    # df.dtypes is a separate Series, so assigning columns inside the loop
    # does not disturb the iteration.
    for colname, dtype in df.dtypes.items():
        # Use pandas' dtype predicates rather than comparing a dtype instance
        # to the NumPy DType class: that comparison is always False for
        # pandas string columns and needs a recent NumPy to exist at all.
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue
        codes, labels = pd.factorize(df[colname])
        df[colname] = codes
        print(labels)
        mappings[colname] = labels
    return mappings

# Encode the text columns of `data` in place (factorize_objs assigns the
# integer codes back into the frame), then display the encoded result.
factorize_objs(data)
data

Index(['Red', 'Blue White', 'White', 'Yellowish White', 'Blue white',
       'Pale yellow orange', 'Blue', 'Blue-white', 'Whitish', 'yellow-white',
       'Orange', 'White-Yellow', 'white', 'Blue ', 'yellowish', 'Yellowish',
       'Orange-Red', 'Blue white ', 'Blue-White'],
      dtype='object')
Index(['M', 'B', 'A', 'F', 'O', 'K', 'G'], dtype='object')


Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.002400,0.1700,16.12,0,0,0
1,3042,0.000500,0.1542,16.60,0,0,0
2,2600,0.000300,0.1020,18.70,0,0,0
3,2800,0.000200,0.1600,16.65,0,0,0
4,1939,0.000138,0.1030,20.06,0,0,0
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,6,4
236,30839,834042.000000,1194.0000,-10.63,5,6,4
237,8829,537493.000000,1423.0000,-10.73,5,2,2
238,9235,404940.000000,1112.0000,-11.23,5,2,2


In [19]:
# The label column is the prediction target; every other column is a model input.
target = "Star type"
features = [column for column in data.keys().to_list() if column != target]

# 80% of the rows go to training, the remainder is held out for evaluation.
# The fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[target],
    train_size=0.8,
    random_state=42,
)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees, each capped at depth 6; seeded for repeatable fits.
model = RandomForestClassifier(
    n_estimators=50,
    max_depth=6,
    random_state=42,
)

# Train on the training split; the fitted estimator is this cell's output.
model.fit(X_train, y_train)

In [21]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print the per-class metrics table.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         8
           4       1.00      1.00      1.00         8
           5       1.00      1.00      1.00        11

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48



In [22]:
from pathlib import Path

import dill

# Global dill pickling option, set before the dump below.
# NOTE(review): confirm against the dill docs that `recurse` is needed for this model.
dill.settings["recurse"] = True

test_csv_path = Path("clean_data/stars_test.csv")
model_path = Path("models/stars_rf.modelfile")

# Create the output folders up front so the cell does not fail on a fresh
# checkout where they do not exist yet.
for out_path in (test_csv_path, model_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# Save the held-out split (features joined with their labels on the row index).
test = X_test.join(y_test)
test.to_csv(test_csv_path)

# Serialize the fitted model with dill.
with open(model_path, 'wb') as f:
    dill.dump(model, f)
