In [1]:
import pandas as pd
import numpy as np
import timeit
import sys

from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_curve, roc_curve

from scipy.sparse import csr_matrix

Importing the dense matrix files

In [None]:
# Import the training file
df = pd.read_csv('densemats/train_mat.csv', header=None) 
#n,d = df.shape
X_train_raw = df.values[:, 1 :]
y_train = df.values[:, 0]
n, d = X_train_raw.shape

# Evaluate the tf-idf transformation on the counts
transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_raw)

# Import the testing file
df = pd.read_csv('densemats/test_mat.csv', header=None) # Import the datafile
X_test_raw = df.values[:, 1 :]
y_test = df.values[:, 0]

X_test_tfidf = transformer.fit_transform(X_test_raw)

Importing the sparse matrix files

In [2]:
# Import the training file
df_train = pd.read_csv('spmats/train_Xsp.csv', header=None)
df_test = pd.read_csv('spmats/test_Xsp.csv', header=None)
d = max(df_train.values[:, 1].max(), df_test.values[:, 1].max()) + 1
n_train, n_test = df_train.values[:, 0].max() + 1, df_test.values[:, 0].max() + 1

X_train_raw = csr_matrix((df_train.values[:, 2], (df_train.values[:, 0], df_train.values[:, 1])), shape=(n_train, d), dtype="float64")
y_train = pd.read_csv('spmats/train_y.csv', header=None).values[:, 0] 
n = X_train_raw.shape[0]

# Evaluate the tf-idf transformation on the counts
transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_raw)

# Import the testing file
X_test_raw = csr_matrix((df_test.values[:, 2], (df_test.values[:, 0], df_test.values[:, 1])), shape=(n_test, d), dtype="float64")
y_test = pd.read_csv('spmats/test_y.csv', header=None).values[:, 0] 

X_test_tfidf = transformer.fit_transform(X_test_raw)

In [3]:
# Construct classifier objects for cross validation
cv = KFold(n, n_folds=5)
C_range = np.logspace(-1, 7, 9) # Range of Parameter C, larger C means less regularization

classifier_svc = SVC(kernel='linear') # The linear svm classifier on the raw counts
classifier_logreg = linear_model.LogisticRegression() # The logistic classifier

In [4]:
# Tune the classifiers and evaluate their performances
case = 0

precision = dict()
recall = dict()
fpr = dict()
tpr = dict()

for classifier in [classifier_svc, classifier_logreg]:
    for X_train, X_test in [(X_train_raw, X_test_raw), (X_train_tfidf, X_test_tfidf)]:
        start_time = timeit.default_timer()

        print("Tuning case {0}...".format(case))

        grid = GridSearchCV(classifier, param_grid=dict(C=C_range), cv=cv, verbose=1, n_jobs=4)
        grid.fit(X_train, y_train)

        print(" - The best parameters are {0} with a score of {1}".format(grid.best_params_, grid.best_score_))
        print(" - The score on the test set is {0}".format(grid.score(X_test, y_test)))
        
        y_score = grid.best_estimator_.decision_function(X_test)
        precision[case], recall[case], _ = precision_recall_curve(y_test, y_score)
        fpr[case], tpr[case], _ = roc_curve(y_test, y_score)
        
        print(" - Elapsed time is {0}".format(timeit.default_timer() - start_time))
        sys.stdout.flush()
        
        case = case + 1

Tuning case 0...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
 - The best parameters are {'C': 1.0} with a score of 0.9837691614066727
 - The score on the test set is 0.9881956155143339
 - Elapsed time is 4.391517632999239
Tuning case 1...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    3.7s finished


 - The best parameters are {'C': 10.0} with a score of 0.9871505861136158
 - The score on the test set is 0.9904440697020798
 - Elapsed time is 14.584976989001007
Tuning case 2...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   13.0s finished


 - The best parameters are {'C': 1000.0} with a score of 0.986248872858431
 - The score on the test set is 0.9865092748735245
 - Elapsed time is 3.1231523290007317
Tuning case 3...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    3.0s finished


 - The best parameters are {'C': 1000000.0} with a score of 0.9873760144274121
 - The score on the test set is 0.9910061832490163
 - Elapsed time is 1.9424515230002726


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    1.8s finished


In [5]:
# Visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib qt

axis_font = {'size':'20'}
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

labels = ['SVC-raw', 'SVC-tfidf', 'LogReg-raw', 'LogReg-tfidf']
line_specs = ['b-', 'b--', 'r-', 'r--']

# The PR curve
plt.figure()
for case in range(4):
    plt.plot(recall[case], precision[case], line_specs[case], label=labels[case], linewidth=2)

plt.xlim([0.0, 1.0])
plt.ylim([0.9, 1.05])
plt.grid()
plt.xlabel('Recall',  **axis_font)
plt.ylabel('Precision',  **axis_font)
plt.legend(loc="lower left", prop={'size':16})

# The ROC curve
plt.figure()
for case in range(4):
    plt.plot(fpr[case], tpr[case], line_specs[case], label=labels[case], linewidth=2)

#plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.9, 1.05])
plt.grid()
plt.xlabel('False Positive Rate',  **axis_font)
plt.ylabel('True Positive Rate',  **axis_font)
plt.legend(loc="lower right", prop={'size':16})

<matplotlib.legend.Legend at 0x7fd6140bd400>