In [1]:
from sklearn.utils import shuffle
from __future__ import division
import numpy as np
import scipy as sp
import pandas as pd
import csv
import random as rn

import matplotlib.pyplot as plt

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import KernelPCA, PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm, cross_validation, grid_search
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif, RFECV, SelectKBest
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler



In [3]:
dataLoc = r'/home/jytay/Documents/cogs118a/project/skins/Skin_NonSkin.txt'
skinSet = np.loadtxt(dataLoc) #load into numpy array
skinDF = pd.read_csv(dataLoc, sep="\t", header=None)
skinDF.columns = ["b","g","r", "y"]

In [4]:
skinDF = shuffle(skinDF)
skinArray = skinDF.as_matrix()
skinDF.head()

Unnamed: 0,b,g,r,y
149347,176,176,130,2
156730,7,5,5,2
150994,183,184,142,2
205457,57,59,23,2
116497,50,51,17,2


In [5]:
skinArray.shape

(245057, 4)

In [6]:
X = skinArray[:5000,:-1]
Y = skinArray[:5000,-1]

X.shape
Y.shape

(5000,)

In [7]:
# first proportion : 20 80
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y, train_size=.2)

# 2nd proportion : 40 60
X_train2, X_test2, Y_train2, Y_test2 = cross_validation.train_test_split(X,Y, train_size = .4)

# 3rd proportion: 50 50
X_train3, X_test3, Y_train3, Y_test3 = cross_validation.train_test_split(X,Y, train_size = .5)

In [8]:
# Logit L1 for 20 80
logit_clf = LogisticRegressionCV(solver='liblinear', Cs=[.001,.01,.1,1,10,100], cv=5, penalty='l1')
logit_clf.fit(X_train, Y_train)
log11_score = logit_clf.score(X_test, Y_test)
print(log11_score)

0.9215


In [9]:
# Logit L1  for 40 60
logit_clf = LogisticRegressionCV(solver='liblinear', Cs=[.001,.01,.1,1,10,100], cv=5, penalty='l1')
logit_clf.fit(X_train2, Y_train2)
log12_score = logit_clf.score(X_test2, Y_test2)
print(log12_score)

0.924


In [10]:
# Logit L1 for 50 50
logit_clf = LogisticRegressionCV(solver='liblinear', Cs=[.001,.01,.1,1,10,100], cv=5, penalty='l1')
logit_clf.fit(X_train3, Y_train3)
log13_score = logit_clf.score(X_test3, Y_test3)
print(log13_score)

0.9168


In [11]:
# Logit l2 for 20 80
logit_clf = LogisticRegressionCV(solver='newton-cg', Cs=[.001,.01,.1,1,10,100], cv=5,
                                 multi_class='multinomial', penalty='l2')
logit_clf.fit(X_train, Y_train)
log21_score = logit_clf.score(X_test, Y_test)
print(log21_score)

0.79175


In [12]:
# Logit l2 for 40 60
logit_clf = LogisticRegressionCV(solver='newton-cg', Cs=[.001,.01,.1,1,10,100], cv=5,
                                 multi_class='multinomial', penalty='l2')
logit_clf.fit(X_train2, Y_train2)
log22_score = logit_clf.score(X_test2, Y_test2)
print(log22_score)

0.799666666667


In [13]:
# Logit l2 for 50 50
logit_clf = LogisticRegressionCV(solver='newton-cg', Cs=[.001,.01,.1,1,10,100], cv=5,
                                 multi_class='multinomial', penalty='l2')
logit_clf.fit(X_train3, Y_train3)
log23_score = logit_clf.score(X_test3, Y_test3)
print(log23_score)

0.788


In [14]:
# average values for L1 and L2 proportions
L1_logit_avg = (log11_score + log12_score + log12_score)/ 3
L2_logit_avg = (log21_score + log22_score + log23_score) / 3

print(L1_logit_avg)
print(L2_logit_avg)

0.923166666667
0.793138888889


In [15]:
# code to speed up svm
# proportion 1
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

#proportion 2
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train2)
X_train2 = scaling.transform(X_train2)
X_test2 = scaling.transform(X_test2)

#proportion 3
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train3)
X_train3 = scaling.transform(X_train3)
X_test3 = scaling.transform(X_test3)



In [18]:
# SVM 
svm_params = {'C':[0.01,1,10,100,500,700,1200,1500,1800], 'degree':[1,2,3,4]}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=5, n_jobs=4)
svm_clf.fit(X_train,Y_train)
print(svm_clf.best_score_)
svm_clf.best_params_















0.956


{'C': 500, 'degree': 3}

In [20]:
# SVM 2 
svm_params = {'C':[.01,1,10,100,500,700,1200,1500,1800], 'degree':[1,2,3,4]}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=5, n_jobs=4)
svm_clf.fit(X_train2,Y_train2)
print(svm_clf.best_score_)
svm_clf.best_params_















0.9525


{'C': 1200, 'degree': 3}

In [21]:
# SVM 3
svm_params = {'C':[.01,1,10,100,500,700,1200,1500,1800], 'degree':[1,2,3,4]}
clf = svm.SVC(kernel='poly')
svm_clf = grid_search.GridSearchCV(clf, svm_params, cv=5, n_jobs=4)
svm_clf.fit(X_train3,Y_train3)
print(svm_clf.best_score_)
svm_clf.best_params_















0.9648


{'C': 500, 'degree': 3}

In [28]:
# Random Forests 20 80
rf_params = {"max_features":[1,2,3], 
             "n_estimators":[300,400,500,800]}
clf = RandomForestClassifier()
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=5, n_jobs=4)
rf_clf.fit(X_train,Y_train)
print(rf_clf.best_score_)
print(rf_clf.best_params_)

rf1_score = rf_clf.best_score_
rf1_bestParams = rf_clf.best_params_

0.99
{'max_features': 1, 'n_estimators': 400}


In [29]:
# Random Forests 40 60
rf_params = {"max_features":[1,2,3], 
             "n_estimators":[500,800,1024]}
clf = RandomForestClassifier()
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=5, n_jobs=4)
rf_clf.fit(X_train2,Y_train2)
print(rf_clf.best_score_)
print(rf_clf.best_params_)

rf2_score = rf_clf.best_score_
rf2_bestParams = rf_clf.best_params_

0.994
{'max_features': 1, 'n_estimators': 1024}


In [31]:
# Random Forests 50 50
rf_params = {"max_features":[1,2,3], 
             "n_estimators":[600,800,1024,1500]}
clf = RandomForestClassifier()
rf_clf = grid_search.GridSearchCV(clf, rf_params, cv=5, n_jobs=4)
rf_clf.fit(X_train2,Y_train2)
print(rf_clf.best_score_)
print(rf_clf.best_params_)
rf3_score = rf_clf.best_score_
rf3_bestParams = rf_clf.best_params_

0.994
{'max_features': 1, 'n_estimators': 600}


In [32]:
# avg score for each proportion of splitting

RF_avgScore = (rf1_score + rf2_score + rf3_score) / 3
print(RF_avgScore)

0.9926666666666666


In [21]:
# KNN 20 80
knn_params = {"n_neighbors":[5,7,10,15,25,50,100,500]}
clf = KNeighborsClassifier()
knn_clf = grid_search.GridSearchCV(clf, knn_params, cv=5, n_jobs=4)
knn_clf.fit(X_train,Y_train);
print(knn_clf.best_score_)
knn_clf.best_params_

KNN_1score = knn_clf.best_score_

0.992


In [22]:
# KNN 40 60
knn_clf.fit(X_train2,Y_train2);
print(knn_clf.best_score_)
knn_clf.best_params_

KNN_2score = knn_clf.best_score_

0.99675


In [23]:
# KNN 50 50
knn_clf.fit(X_train3,Y_train3);
print(knn_clf.best_score_)
knn_clf.best_params_

KNN_3score = knn_clf.best_score_

0.998


In [24]:
# avg KNN score
KNN_avg = (KNN_1score + KNN_2score + KNN_3score)/3

In [28]:
# Gradient Boosting
gb_params = {"n_estimators":[512,1024],
            "learning_rate":[.01,.1]}
clf = GradientBoostingClassifier()
gb_clf = grid_search.GridSearchCV(clf, gb_params, cv=5, n_jobs=4)
gb_clf.fit(X_train,Y_train);
print(gb_clf.best_score_)
gb_clf.best_params_
GB_1score = gb_clf.best_score_

0.994


In [31]:
gb_clf.fit(X_train2,Y_train2);
print(gb_clf.best_score_)
gb_clf.best_params_
GB_2score = gb_clf.best_score_

0.9945


In [32]:
gb_clf.fit(X_train3,Y_train3);
print(gb_clf.best_score_)
gb_clf.best_params_
GB_3score = gb_clf.best_score_

0.9956


In [33]:
# avg B score
GB_avg = (GB_1score + GB_2score + GB_3score) / 3
print(GB_avg)

0.9947


In [None]:
#for classifer in [logit_clf, svm_clf, rf_clf, knn_clf, gb_clf]:
 #   print classifer.score(X_test, y_test)