In [2]:
from readData import readTaobaoData
import numpy as np

dataLines, _ = readTaobaoData()

# 将dataLines的数据，label分开
X_all = np.array([line[2] for line in dataLines])
Y1_all = [line[0] for line in dataLines]
Y2_all = [line[1] for line in dataLines]

# 划分数据，前500条为训练集，后72条为测试集
X_train = X_all[0:500]
Y1_train = Y1_all[0:500]
Y2_train = Y2_all[0:500]

X_test = X_all[500:]
Y1_test = Y1_all[500:]
Y2_test = Y2_all[500:]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.763 seconds.
Prefix dict has been built succesfully.


In [70]:
sum(Y1_test==np.ones(len(Y1_test)))

22

In [74]:
sum(Y2_all==np.ones(len(Y1_all)))

173

In [44]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.grid_search import GridSearchCV

def printScores(Y_true, Y_pred):
    head_format = '{:10}{:10}{:10}{:10}'
    num_format = '%-10.2f%-10.2f%-10.2f%-10.2f'
    print(head_format.format('accuracy', 'precision', 'recall', 'f1 score'))
    print(num_format % (accuracy_score(Y_true, Y_pred)*100, 
                        precision_score(Y_true, Y_pred)*100,
                        recall_score(Y_true, Y_pred)*100, 
                        f1_score(Y_true, Y_pred)*100))
    
def evaluate(X_train, Y_train, X_test, Y_test, clfClass, para):
    clf = clfClass(**para)
    clf.fit(X_train, Y_train)
    Y_clf = clf.predict(X_test)
    printScores(Y_test, Y_clf)
    
def gridSearchEvaluate(X_train, Y_train, X_test, Y_test,
                       clfClass, gridPara, searchPara):
    grid = GridSearchCV(clfClass(), searchPara, **gridPara)
    grid.fit(np.concatenate((X_test, X_train)), np.concatenate((Y_test, Y_train)))
    print('The best params is ', grid.best_params_, 
          '\nWith scores %.2f%%' % (grid.best_score_*100))
    evaluate(X_train, Y_train, X_test, Y_test, clfClass, grid.best_params_)

## 方法评价

In [46]:
gridSearchAccuracy = {'cv': 5, 'scoring': 'accuracy', 'n_jobs': -1}
gridSearchPrecision = {'cv': 5, 'scoring': 'precision', 'n_jobs': -1}

def gridSearchEvaluateWrapper(clfClass, gridPara, searchPara1, searchPara2):
    gridSearchEvaluate(X_train, Y1_train, X_test, Y1_test, 
                       clfClass, gridPara, searchPara1)
    print()
    gridSearchEvaluate(X_train, Y2_train, X_test, Y2_test, 
                       clfClass, gridPara, searchPara2)
    
def scoringWrapper(clfClass, searchPara1, searchPara2):
    print("Using scoring: accuracy")
    gridSearchEvaluateWrapper(clfClass, gridSearchAccuracy, searchPara1, searchPara2)
    print()
    print("Using scoring: precision")
    gridSearchEvaluateWrapper(clfClass, gridSearchPrecision, searchPara1, searchPara2)

## knn

In [47]:
from sklearn.neighbors import KNeighborsClassifier

k_list=[i+1  for i in range(40)]
knn_para = dict(n_neighbors=k_list, weights=['uniform','distance'])
scoringWrapper(KNeighborsClassifier, knn_para, knn_para)

Using scoring: accuracy
The best params is  {'n_neighbors': 6, 'weights': 'distance'} 
With scores 93.79%
accuracy  precision recall    f1 score  
93.75     87.50     63.64     73.68     

The best params is  {'n_neighbors': 16, 'weights': 'distance'} 
With scores 83.79%
accuracy  precision recall    f1 score  
88.12     90.00     51.43     65.45     

Using scoring: precision
The best params is  {'n_neighbors': 16, 'weights': 'distance'} 
With scores 87.13%
accuracy  precision recall    f1 score  
95.00     93.75     68.18     78.95     

The best params is  {'n_neighbors': 24, 'weights': 'uniform'} 
With scores 86.94%
accuracy  precision recall    f1 score  
86.88     88.89     45.71     60.38     


## SVM

In [48]:
from sklearn.svm import SVC

svm_para = [{'kernel': ['rbf'], 'gamma': [1e-3,1e-4], 'C': [1,10,100]},
            {'kernel': ['linear'], 'C': [1,10,100]}]
scoringWrapper(SVC, svm_para, svm_para)

Using scoring: accuracy
The best params is  {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'} 
With scores 94.24%
accuracy  precision recall    f1 score  
95.62     94.12     72.73     82.05     

The best params is  {'C': 1, 'kernel': 'linear'} 
With scores 84.55%
accuracy  precision recall    f1 score  
87.50     80.00     57.14     66.67     

Using scoring: precision
The best params is  {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'} 
With scores 85.99%
accuracy  precision recall    f1 score  
95.62     94.12     72.73     82.05     

The best params is  {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'} 
With scores 86.26%
accuracy  precision recall    f1 score  
86.88     88.89     45.71     60.38     


## Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB

gnb1 = GaussianNB()
gnb1.fit(X_train,Y1_train)
Y1_bayes = gnb1.predict(X_test)
printScores(Y1_test, Y1_bayes)

gnb2 = GaussianNB()
gnb2.fit(X_train,Y2_train)
Y2_bayes = gnb2.predict(X_test)
printScores(Y2_test, Y2_bayes)

accuracy  precision recall    f1 score  
92.50     72.73     72.73     72.73     
accuracy  precision recall    f1 score  
83.75     69.57     45.71     55.17     


## Logistic Regression

In [50]:
from sklearn.linear_model import LogisticRegression

solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']
logistic_para = [{'solver': solvers, 'penalty': ['l2'], "max_iter": [int(1e5), int(1e4)]},
                {'penalty': ['l1'], "max_iter": [int(1e5), int(1e4)]}]
scoringWrapper(LogisticRegression, logistic_para, logistic_para)

Using scoring: accuracy
The best params is  {'max_iter': 100000, 'penalty': 'l1'} 
With scores 93.48%
accuracy  precision recall    f1 score  
93.75     80.00     72.73     76.19     

The best params is  {'max_iter': 100000, 'penalty': 'l2', 'solver': 'liblinear'} 
With scores 83.94%
accuracy  precision recall    f1 score  
85.62     71.43     57.14     63.49     

Using scoring: precision
The best params is  {'max_iter': 100000, 'penalty': 'l1'} 
With scores 82.86%
accuracy  precision recall    f1 score  
93.75     80.00     72.73     76.19     

The best params is  {'max_iter': 100000, 'penalty': 'l2', 'solver': 'liblinear'} 
With scores 78.19%
accuracy  precision recall    f1 score  
85.62     71.43     57.14     63.49     


## Decision Tree

In [54]:
from sklearn.tree import DecisionTreeClassifier

dTree_para = [{'max_features' :[i for i in range(10,30)], 
               'max_depth': [i for i in range(1,20)]}]
scoringWrapper(DecisionTreeClassifier, dTree_para, dTree_para)

Using scoring: accuracy
The best params is  {'max_depth': 2, 'max_features': 19} 
With scores 93.79%
accuracy  precision recall    f1 score  
91.25     68.18     68.18     68.18     

The best params is  {'max_depth': 1, 'max_features': 10} 
With scores 84.09%
accuracy  precision recall    f1 score  
85.62     80.00     45.71     58.18     

Using scoring: precision
The best params is  {'max_depth': 2, 'max_features': 26} 
With scores 84.86%
accuracy  precision recall    f1 score  
93.12     86.67     59.09     70.27     

The best params is  {'max_depth': 2, 'max_features': 15} 
With scores 87.32%
accuracy  precision recall    f1 score  
83.12     83.33     28.57     42.55     


## AdaBoost

In [63]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Due to bad design, we need two seperate wrapper and run scoringWrapper two times.
def adaBoostWrapper1(n_estimators=50):
    return AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, max_features=26), 
                              algorithm="SAMME", n_estimators=n_estimators)

def adaBoostWrapper2(n_estimators=50):
    return AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, max_features=15), 
                              algorithm="SAMME", n_estimators=n_estimators)

ada_para = [{'n_estimators': [i for i in range(1,100,2)]}]
scoringWrapper(adaBoostWrapper1, ada_para, ada_para)
print()
scoringWrapper(adaBoostWrapper2, ada_para, ada_para)

Using scoring: accuracy
The best params is  {'n_estimators': 21} 
With scores 93.48%
accuracy  precision recall    f1 score  
91.88     73.68     63.64     68.29     

The best params is  {'n_estimators': 53} 
With scores 84.55%
accuracy  precision recall    f1 score  
85.00     70.37     54.29     61.29     

Using scoring: precision
The best params is  {'n_estimators': 35} 
With scores 84.60%
accuracy  precision recall    f1 score  
95.00     93.75     68.18     78.95     

The best params is  {'n_estimators': 1} 
With scores 86.14%
accuracy  precision recall    f1 score  
83.75     71.43     42.86     53.57     

Using scoring: accuracy
The best params is  {'n_estimators': 37} 
With scores 93.79%
accuracy  precision recall    f1 score  
92.50     77.78     63.64     70.00     

The best params is  {'n_estimators': 81} 
With scores 85.30%
accuracy  precision recall    f1 score  
84.38     64.71     62.86     63.77     

Using scoring: precision
The best params is  {'n_estimators': 9}

## Gradient Boosting

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

gdb1 = GradientBoostingClassifier()
gdb1.fit(X_train, Y1_train)
Y1_gdb = gdb1.predict(X_test)
printScores(Y1_test, Y1_gdb)

gdb2 = GradientBoostingClassifier()
gdb2.fit(X_train, Y2_train)
Y2_gdb = gdb2.predict(X_test)
printScores(Y2_test, Y2_gdb)

accuracy  precision recall    f1 score  
91.88     71.43     68.18     69.77     
accuracy  precision recall    f1 score  
86.25     70.97     62.86     66.67     


## Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier

rf_para = [{'max_features': [i for i in range(1,32,4)], 
            'max_depth': [i for i in range(1,20,5)], 
            'n_estimators': [10,155]}]
scoringWrapper(RandomForestClassifier, rf_para, rf_para)

Using scoring: accuracy
The best params is  {'max_depth': 16, 'max_features': 1, 'n_estimators': 155} 
With scores 94.09%
accuracy  precision recall    f1 score  
91.88     71.43     68.18     69.77     

The best params is  {'max_depth': 11, 'max_features': 17, 'n_estimators': 10} 
With scores 85.76%
accuracy  precision recall    f1 score  
85.00     72.00     51.43     60.00     

Using scoring: precision
The best params is  {'max_depth': 1, 'max_features': 1, 'n_estimators': 155} 
With scores 87.64%
accuracy  precision recall    f1 score  
93.12     92.31     54.55     68.57     

The best params is  {'max_depth': 1, 'max_features': 1, 'n_estimators': 155} 
With scores 87.92%
accuracy  precision recall    f1 score  
85.00     82.35     40.00     53.85     
