<a href="https://colab.research.google.com/github/kaankoken/random_forest_vs_svm/blob/master/svm_vs_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn import metrics, datasets, preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

In [0]:
def randomOneHoldout(X_train, Y_train):
    x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
    return x_train, x_test, y_train, y_test

In [0]:
def stratifiedOneHoldout(X_train, Y_train):
    x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
    return x_train, x_test, y_train, y_test

In [0]:
def visulize_class(df):
	# look at the last column on data frame (the classification value column)
	df.iloc[:, -1].value_counts().plot(kind='bar')
	plt.title("Output Distribution: Breast Cancer DS")
	plt.xlabel("Classification")
	plt.ylabel("Frequency")
	plt.show()	

In [0]:
def identify_noise(df):

	noise = df[df.isnull().any(axis=1)].count()
	total_noise = noise.sum()
	print("{0} null values were found.".format(str(total_noise)))
	if(total_noise > 0):	
		print(noise)
	print("\n\nShowing all data types:\n\n")
	print(df.dtypes)

In [0]:
def heat_map(df):
	fig, ax = plt.subplots()
	corr = df.corr()
	sns.heatmap(corr, annot=True, cmap='hot')
	plt.show()	

In [0]:
def filter_features(data, bad_indices):
	# eliminate above column indices from the data and return new set
	filtered_data = np.delete(data, bad_indices, axis=1)

	return filtered_data

In [0]:
def vis_all_feat(data, class_):
    for col_ind in range(data.shape[1]):
		    print("Viewing Feature #{0}".format(str(col_ind)))
		    vis_single_feat(data, class_, col_ind)

In [0]:
def vis_single_feat(data, class_, ind):
	# create graph of classification and feature values	
	plt.figure(100) # display two plots on separate figures
	df = pd.DataFrame(data)
	feat_vals = df.iloc[:, ind]
	plt.scatter(feat_vals, class_)
	plt.title("Plot of Feature {0}".format(str(ind)))
	plt.xlabel("Feature Value")
	plt.ylabel("Classification")
	
	# create bar graph of mean feature values for each classification
	plt.figure(200)
	plt.title("Mean Values of Feature {0}".format(str(ind)))
	plt.xlabel("Classification")
	plt.ylabel("Mean Feature Value")
	mean_df = pd.concat([df.iloc[:, ind], pd.Series(class_)], axis=1)
	mean_df.columns = ["values", "classif"]	
	mean_df.groupby("classif", as_index=False)["values"].mean().loc[:,"values"].plot(kind='bar')
	
	plt.show()


In [0]:
def plot_pairplot(data, class_):
	data_df = pd.DataFrame(data)
	# add classification so the plot can be colored by it
	data_df.loc[:, "classif"] = pd.Series(class_)
	sns.pairplot(data_df, hue='classif')
	plt.show()


In [0]:
def fourError(X, Y, model):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
    
    Train_x, TrainDev_x, Train_y, TrainDev_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=0, stratify=Y_train)
    Dev_x, Test_x, Dev_y, Test_y = train_test_split(X_test, Y_test, test_size=0.5, random_state=0, stratify=Y_test)

    model.fit(Train_x, Train_y)

    y_true, trainDev_pred = TrainDev_y, model.predict(TrainDev_x)

    print("Train-Train Dev,   e1:", metrics.mean_squared_error(TrainDev_y, trainDev_pred),"\n")
    print("Accuracy: ", 1 - metrics.mean_squared_error(TrainDev_y, trainDev_pred))
    print( '\nClassification report\n' )
    print(classification_report(y_true, trainDev_pred))

    y_true, dev_pred = Dev_y, model.predict(Dev_x)
    print("Train-Dev,   e2", metrics.mean_squared_error(Dev_y, dev_pred),"\n")
    print("Accuracy: ", 1 - metrics.mean_squared_error(Dev_y, dev_pred))
    print( '\nClassification report\n' )
    print(classification_report(y_true, dev_pred))

    y_true, test_pred = Test_y, model.predict(Test_x)
    print("Train-Test,   e3: ", metrics.mean_squared_error(Test_y, test_pred),"\n")
    print("Accuracy: ", 1 - metrics.mean_squared_error(Test_y, test_pred))
    print( '\nClassification report\n' )
    print(classification_report(y_true, test_pred))

    y_true, devTest_pred = Y_test, model.predict(X_test)
    print("Train-(Dev+Test),   e4: ", metrics.mean_squared_error(Y_test, devTest_pred),"\n")
    print("Accuracy: ", 1 - metrics.mean_squared_error(Y_test, devTest_pred))
    print( '\nClassification report\n' )
    print(classification_report(y_true, devTest_pred))

In [0]:
def svm(X_train, Y_train, kernel, weight, gamma):

    svm = SVC(C=1, kernel=kernel, degree=3, gamma=gamma, coef0=0.0, shrinking=True, 
          probability=False, tol=0.001, cache_size=200, class_weight=weight,
          max_iter=-1, decision_function_shape="ovr", random_state = 0)

    #5-Fold
    cv_result_svm_5 = cross_val_score(svm, X_train, Y_train, cv=5, scoring='accuracy')
  
    #10-Fold
    cv_result_svm_10 = cross_val_score(svm, X_train, Y_train, cv=10, scoring='accuracy')
  
    #Random One Holdout
    x_train, x_test, y_train, y_test_random = randomOneHoldout(X_train, Y_train)
    svm.fit(x_train, y_train)
    y_pred_svm_random = svm.predict(x_test)
  
    #Stratified One Holdout
    x_train, x_test, y_train, y_test_stratified = stratifiedOneHoldout(X_train, Y_train)
    svm.fit(x_train, y_train)
    y_pred_svm_stratified = svm.predict(x_test)

    print("5 Fold")
    print("SVM Accuracy: ", cv_result_svm_5.mean())
  
    print("10 Fold")
    print("SVM Accuracy: ", cv_result_svm_10.mean())

    print("Random One Hold Out")
    print("SVM Accuracy: ", 1 - metrics.mean_squared_error(y_test_random, y_pred_svm_random))
  
    print("Stratified One Hold Out Fold")
    print("SVM Accuracy: ", 1 - metrics.mean_squared_error(y_test_stratified, y_pred_svm_stratified))

In [0]:
def random_forest(X_train, Y_train, forest_size, max_depth, criterion, min_samples_split, class_weight):
    rf = RandomForestClassifier(n_estimators=forest_size, oob_score=True, n_jobs=-1, max_depth=max_depth, criterion=criterion, min_samples_split = min_samples_split, class_weight=class_weight, random_state=0)
	
	  #5-Fold
    cv_result_rf_5 = cross_val_score(rf, X_train, Y_train, cv=5, scoring='accuracy')
  
    #10-Fold
    cv_result_rf_10 = cross_val_score(rf, X_train, Y_train, cv=10, scoring='accuracy')
  
    #Random One Holdout
    x_train, x_test, y_train, y_test_random = randomOneHoldout(X_train, Y_train)
    rf.fit(x_train, y_train)
    y_pred_rf_random = rf.predict(x_test)
  
    #Stratified One Holdout
    x_train, x_test, y_train, y_test_stratified = stratifiedOneHoldout(X_train, Y_train)
    rf.fit(x_train, y_train)
    y_pred_rf_stratified = rf.predict(x_test)

    print("Stratified One Hold Out Fold")
    print("Random Forest  Accuracy: ", 1 - metrics.mean_squared_error(y_test_stratified, y_pred_rf_stratified))

    print("5 Fold")
    print("Random Forest Accuracy: ", cv_result_rf_5.mean())
  
    print("10 Fold")
    print("Random Forest  Accuracy: ", cv_result_rf_10.mean())

    print("Random One Hold Out")
    print("Random Forest  Accuracy: ", 1 - metrics.mean_squared_error(y_test_random, y_pred_rf_random))

In [0]:
def tuningDepth(x_train, x_test, y_train, y_test_stratified):

    max_depth_range = list(range(1, 10))
    max_depth_range.append(str("None"))
    
    for depth in max_depth_range:
        if(depth == "None"):
            clf = RandomForestClassifier(random_state = 0)
            clf.fit(x_train, y_train)
        else:  
            clf = RandomForestClassifier(max_depth = depth, random_state = 0)
            clf.fit(x_train, y_train)

        accuracy = clf.score(x_test, y_test_stratified)*100
        print("Depth: ", depth, " Accuracy: ", accuracy)

In [0]:
def tuningSplit(x_train, x_test, y_train, y_test_stratified):
    criterion = ["gini", "entropy"]

    for i in criterion:
        clf = RandomForestClassifier(criterion = i, max_depth = 7, random_state = 0)
        clf.fit(x_train, y_train)

        accuracy = clf.score(x_test, y_test_stratified)*100
        print("Criterion: ", i, "   Accuracy: ", accuracy)

    for i in range(2, 10):
        clf = RandomForestClassifier(max_depth = 7, min_samples_split = i, random_state = 0)
        clf.fit(x_train, y_train)

        accuracy = clf.score(x_test, y_test_stratified)*100
        print("min_samples_split: ", i, "   Accuracy: ", accuracy)

In [0]:
def tuningClassWeight(x_train, x_test, y_train, y_test_stratified):
    # No class weight
    clf = RandomForestClassifier(max_depth = 7, random_state = 0)
    clf.fit(x_train, y_train)

    accuracy = clf.score(x_test, y_test_stratified)*100
    print("Class weight: None           Accuracy: ", accuracy)

    # Balanced class weight
    clf = RandomForestClassifier(max_depth = 7, random_state = 0, class_weight ='balanced')
    clf.fit(x_train, y_train)

    accuracy = clf.score(x_test, y_test_stratified)*100
    print("Class weight: Balanced       Accuracy: ", accuracy)

In [0]:
def AdaBoost(model, n_estimators, learning_rate, X_train, Y_train, X_test, Y_test):
    clf = BaggingClassifier(base_estimator = model, n_estimators= n_estimators, learning_rate=learning_rate, random_state=0)
    clf.fit(X_train, Y_train)
    clf.predict(X_test)
    return clf.score(X_train, Y_train)

In [0]:
def bagging(model, n_estimators, X_train, Y_train, X_test, Y_test):
    clf = AdaBoostClassifier(base_estimator = model, n_estimators= n_estimators, random_state=0)
    clf.fit(X_train, Y_train)
    clf.predict(X_test)
    return clf.score(X_train, Y_train)

In [0]:
def displayAccuracy(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    kernel = ["linear", "rbf", "poly", "sigmoid"]
    weight = [None, "balanced"]
    gamma = ["auto", "scale"]
    
    for i in kernel:
        for j in weight:
            for k in gamma:
                if i != "linear":
                    print("Kernel: {} - Weight: {} - Gamma: {}".format(str(i), j, k))
                    svm(X_train, Y_train, i, j, k)
                else:
                    print("Kernel: {} - Weight: {} - Gamma: {}".format(str(i), j, "auto"))
                    svm(X_train, Y_train, i, j, k)
        print()

    combined approach
    for depth in max_depth_range:
        for c in criterion:
            for i in range(2, 10):
                for w in weight:
                    print("Depth: {}, Criterion: {}, Min Split {}, Weight: {}".format(depth, c, i, w))
                    random_forest(X_train, Y_train, 100, depth, c, i, w)
        print("\n")

    forest_size = [25, 50, 75, 100, 125, 150, 175, 200]
    
    for s in forest_size:
        random_forest(X_train, Y_train, s, 7, "entropy", 3, "balanced")

In [0]:
def compareBaseModels(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

    svm(X_train, Y_train, "rbf", None, "scale")
    print("")
    random_forest(X_train, Y_train, 100, None, "gini", 2, None)

    x_train, x_test, y_train, y_test_stratified = stratifiedOneHoldout(X_train, Y_train)
    #Individual tuning
    print("\n")
    tuningDepth(x_train, x_test, y_train, y_test_stratified);
    tuningSplit(x_train, x_test, y_train, y_test_stratified)
    tuningClassWeight( x_train, x_test, y_train, y_test_stratified)

In [0]:
if __name__ == '__main__':

    breast_cancer = datasets.load_breast_cancer()
    X = breast_cancer.data
    Y = breast_cancer.target
  
    #Shape of the data
    print(X.shape, end="\n")

    feauture = pd.DataFrame(Y)
    df = pd.DataFrame(X)
    rf = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, max_depth=7, criterion="entropy", min_samples_split = 3, class_weight="balanced", random_state=0)
    svm = SVC(C=1, kernel="linear", degree=3, gamma="scale", coef0=0.0, shrinking=True, 
          probability=True, tol=0.001, cache_size=200, class_weight=None,
          max_iter=-1, decision_function_shape="ovr", random_state = 0)
    fourError(X, Y, svm)
    visulize_class(feauture)
    compareBaseModels(X, Y)
    displayAccuracy(X, Y)

    identify_noise(df)
    heat_map(df)
    
    #filter strongly correlated features - can see which ones in correlation map
    X = filter_features(X, [2, 3, 20, 22, 23, 12, 13])

    vis_all_feat(X, Y)
    X = filter_features(X, [1, 2, 6, 7, 9, 10, 14, 15])
    print("Cleaned data")
    fourError(X, Y, svm)
    #remaining features
    plot_pairplot(X, Y)

 

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
#best model 

learning_rate = [0.0001, 0.001, 0.01, 0.1, 1 ,2 ,3 ,4, 5]

result_ada_2 = []
for i in learning_rate: #i -> Learning Rate
    x = []
    for j in range(50, 150, 25): #j -> N estimators
        x.append(AdaBoost(svm, j, i, X_train, Y_train, X_test, Y_test))
    result_ada_2.append(x)

In [27]:
x = pd.DataFrame(result_ada_2)
x

Unnamed: 0,0,1,2,3
0,0.952261,0.952261,0.952261,0.952261
1,0.952261,0.952261,0.952261,0.952261
2,0.949749,0.949749,0.949749,0.949749
3,0.939698,0.944724,0.939698,0.927136
4,0.736181,0.69598,0.665829,0.650754
5,0.904523,0.909548,0.904523,0.904523
6,0.894472,0.854271,0.816583,0.796482
7,0.773869,0.726131,0.69598,0.655779
8,0.861809,0.796482,0.736181,0.678392


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
#best model 


result_bagging = []
for j in range(50, 150, 25): #j -> N estimators
    result_bagging.append(bagging(svm, j, X_train, Y_train, X_test, Y_test))


In [35]:
y = pd.DataFrame(result_bagging)
y

Unnamed: 0,0
0,0.736181
1,0.69598
2,0.665829
3,0.650754
