# Random Forest

## Loading Packages and Reading Data

In [31]:
# utility packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# modeling packages
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [4]:
# read the dataset
df = pd.read_csv('./star_classification 2.csv')

# encode values for class column
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO':2}}, inplace=True)

# remove all columns containing ID at the end
cleaned = df.drop(df.filter(regex='ID$').columns, axis=1)
# drop the date column
cleaned = cleaned.drop('MJD', axis=1)

cleaned = cleaned.reset_index()

# make the X and y varialbes
X = cleaned.drop('class', axis=1)
y = cleaned['class']

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

## Modeling

In [None]:
# Trial 1
# Trying number of tress 10, 20, 30, ...

In [10]:
n = []
accuracy = []
for i in range(10, 1001, 10):
    clf=RandomForestClassifier(n_estimators=i)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    print(f"#{i} Accuracy:",metrics.accuracy_score(y_test, y_pred))
    n.append(i)
    accuracy.append(metrics.accuracy_score(y_test, y_pred))

#10 Accuracy: 0.9771333333333333
#20 Accuracy: 0.9789
#30 Accuracy: 0.9789333333333333
#40 Accuracy: 0.9793666666666667
#50 Accuracy: 0.9789
#60 Accuracy: 0.9791666666666666
#70 Accuracy: 0.9791666666666666
#80 Accuracy: 0.9791
#90 Accuracy: 0.9792
#100 Accuracy: 0.9793333333333333
#110 Accuracy: 0.9791
#120 Accuracy: 0.9791333333333333
#130 Accuracy: 0.9789333333333333
#140 Accuracy: 0.9793666666666667
#150 Accuracy: 0.9788333333333333
#160 Accuracy: 0.9796666666666667
#170 Accuracy: 0.9790333333333333
#180 Accuracy: 0.9790333333333333
#190 Accuracy: 0.9791
#200 Accuracy: 0.9794666666666667
#210 Accuracy: 0.9794
#220 Accuracy: 0.9793333333333333
#230 Accuracy: 0.9792
#240 Accuracy: 0.9795
#250 Accuracy: 0.9792666666666666
#260 Accuracy: 0.9793
#270 Accuracy: 0.9792333333333333
#280 Accuracy: 0.9794
#290 Accuracy: 0.9795333333333334
#300 Accuracy: 0.9792
#310 Accuracy: 0.9792666666666666
#320 Accuracy: 0.9794666666666667
#330 Accuracy: 0.9791
#340 Accuracy: 0.9792333333333333
#350 Accu

In [11]:
max(accuracy)
# accuracy = 0.9797

0.9797

In [None]:
n.iloc(max(accuracy))
# n = 260 

In [None]:
# Trial 2
# Using grid search to find the optimal paramerts

In [21]:
# Random forest with Grid Search for paramerts tuning
rfc = RandomForestClassifier()
parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]
}

In [22]:
cv = GridSearchCV(rfc, parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 8, 16, 32, None],
                         'n_estimators': [5, 10, 50, 100, 250]})

In [25]:
 # The highest accuracy achieved
print(cv.best_score_)

0.977997852463188


In [26]:
 # The paramerts that yeild the best score
print(cv.best_params_)

{'max_depth': None, 'n_estimators': 250}


Questions:
- What is the best n?
- how does random forest prevent overfitting? via bootstrap samples and bagging
- what is rf bad at? can be black box, not so good at regression
- what is rf good at?

Observations:
* Quick to compute (average computing time for different RF with different no of trees and depths is less than a min)
* Diffrent value after each run though its close
* Highest accuraccy (0.9799) when 250-260 tree and no max depth (20m)
* 3h 1-1000 tress w max 0.9797
* RF doesnt assume any distribution


In [None]:
# Trial 3
# Trying different set of columns

In [36]:
# read the dataset
df = pd.read_csv('./star_classification 2.csv')

# encode values for class column
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO':2}}, inplace=True)

# remove unneeded columns
cleaned = df.drop(['spec_obj_ID','run_ID', 'field_ID', 'plate', 'MJD'], axis=1)

cleaned = cleaned.reset_index()

# make the X and y varialbes
X = cleaned.drop('class', axis=1)
y = cleaned['class']

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

In [39]:
# Random forest with Grid Search for paramerts tuning
rfc = RandomForestClassifier()
parameters = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,None]
}

In [40]:
cv = GridSearchCV(rfc, parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 8, 16, 32, None],
                         'n_estimators': [5, 10, 50, 100, 250]})

In [41]:
 # The highest accuracy achieved
print(cv.best_score_)

0.9767428571428571


In [42]:
 # The paramerts that yeild the best score
print(cv.best_params_)

{'max_depth': None, 'n_estimators': 100}
