In [28]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from sklearn.metrics import classification_report


In [7]:
# Load the balanced star catalogue that the models below are fitted on,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Star39552_balanced.csv')
df.head(n=5)

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass
0,10.0,31.66,6.19,1.213,K7V,22.502556,1
1,8.26,3.21,1.0,1.13,K0III,15.792525,0
2,8.27,12.75,1.06,0.596,F9V,18.797552,1
3,6.54,5.23,0.76,1.189,K1III,15.132508,0
4,8.52,0.96,0.72,0.173,B8V,13.431356,1


In [8]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39552 entries, 0 to 39551
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Vmag         39552 non-null  float64
 1   Plx          39552 non-null  float64
 2   e_Plx        39552 non-null  float64
 3   B-V          39552 non-null  float64
 4   SpType       39552 non-null  object 
 5   Amag         39552 non-null  float64
 6   TargetClass  39552 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 2.1+ MB


In [10]:
# Remove the spectral-type column (the only object-dtype column) so that only
# numeric columns plus the label remain.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, keeps
# this cell idempotent: running it a second time is a no-op rather than a
# KeyError for the already-missing column.
df = df.drop(columns='SpType', errors='ignore')

In [13]:
# Split the frame into the label column and the remaining predictor columns.
target = df['TargetClass']
features = df.drop(columns='TargetClass')

In [14]:
# Fit the min-max scaler on the training features only and rescale them into
# the [0, 1] range; the fitted `scaller` is what later transforms the test set.
scaller = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaller.fit(features).transform(features)
X_scalled = pd.DataFrame(scaled_values, columns=features.columns)
X_scalled

Unnamed: 0,Vmag,Plx,e_Plx,B-V,Amag
0,0.788419,0.074359,0.143497,0.420052,0.741990
1,0.659243,0.038804,0.014424,0.398438,0.524125
2,0.659985,0.050727,0.015916,0.259375,0.621694
3,0.531552,0.041329,0.008456,0.413802,0.502695
4,0.678545,0.035992,0.007461,0.149219,0.447461
...,...,...,...,...,...
39547,0.478842,0.035005,0.002487,0.227604,0.238067
39548,0.569414,0.057438,0.012435,0.214583,0.606863
39549,0.729770,0.039654,0.025864,0.163281,0.568516
39550,0.714922,0.037455,0.025864,0.486198,0.519559


In [15]:
# Load the separate, smaller balanced catalogue used as the hold-out test set
# and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer="Star3642_balanced.csv")
df_test.head(n=5)

Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType,Amag,TargetClass
0,5.99,13.73,0.58,1.318,K5III,16.678352,0
1,8.7,2.31,1.29,-0.045,B1II,15.51806,0
2,5.77,5.5,1.03,0.855,G5III,14.471813,0
3,6.72,5.26,0.74,-0.015,B7V,15.324928,1
4,8.76,13.44,1.16,0.584,G0V,19.401997,1


In [16]:
# Separate the hold-out frame into predictors and the label column.
X_test = df_test.drop(columns='TargetClass')
Y_test = df_test['TargetClass']

In [18]:
# Remove the spectral-type column so the test predictors have the same
# columns as the training features.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, keeps
# this cell idempotent: re-running it does not raise a KeyError for the
# already-missing column.
X_test = X_test.drop(columns="SpType", errors="ignore")

In [25]:
# Rescale the test predictors with the already-fitted scaler (transform only,
# no refit) and wrap the result back into a labelled frame.
test_scaled_values = scaller.transform(X_test)
X_test_scalled = pd.DataFrame(test_scaled_values, columns=X_test.columns)
X_test_scalled

Unnamed: 0,Vmag,Plx,e_Plx,B-V,Amag
0,0.490720,0.051951,0.003979,0.447396,0.552886
1,0.691908,0.037679,0.021636,0.092448,0.515213
2,0.474388,0.041666,0.015170,0.326823,0.481243
3,0.544915,0.041366,0.007958,0.100260,0.508943
4,0.696362,0.051589,0.018403,0.256250,0.641319
...,...,...,...,...,...
3637,0.587231,0.038867,0.013181,0.569271,0.493720
3638,0.661470,0.042766,0.014424,0.210417,0.573528
3639,0.499629,0.037817,0.009202,0.537500,0.434400
3640,0.635486,0.040966,0.061676,0.158854,0.544129


In [26]:
# Candidate models compared in this notebook, keyed by the label used when
# reading the score lists. Estimators that accept a seed get a fixed
# `random_state` so that a Restart & Run All reproduces the same scores;
# every other hyper-parameter is left at its library default.
RANDOM_STATE = 42
classifiers = {
    "LogisticRegression" : LogisticRegression(),
    "SVC" : SVC(),
    "DecisionTree" : DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest" : RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost" : XGBClassifier(random_state=RANDOM_STATE)
}

In [27]:
# Fit each candidate on the scaled training data and record its `score` on
# the training and hold-out sets. Positions in both lists follow the
# insertion order of `classifiers`.
train_score, test_score = [], []
for key, model in classifiers.items():
    model.fit(X_scalled, target)
    train_score += [model.score(X_scalled, target)]
    test_score += [model.score(X_test_scalled, Y_test)]
# Training scores, a separator line, then hold-out scores.
print(train_score, "*" * 9, test_score, sep="\n")

[0.8788683252427184, 0.879904935275081, 1.0, 0.9999241504854369, 0.909106998381877]
*********
[0.8956617243272927, 0.8967600219659527, 0.9772103239978034, 0.9854475562877539, 0.9181768259198243]


In [29]:
# Train a fresh default XGBoost classifier on the scaled training data and
# predict labels for the scaled hold-out set.
xgb = XGBClassifier()
model = xgb.fit(X=X_scalled, y=target)
prediction = xgb.predict(X=X_test_scalled)

In [30]:
# Per-class precision / recall / F1 of the XGBoost predictions on the hold-out set.
print(classification_report(y_true=Y_test, y_pred=prediction))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1821
           1       0.92      0.92      0.92      1821

    accuracy                           0.92      3642
   macro avg       0.92      0.92      0.92      3642
weighted avg       0.92      0.92      0.92      3642

