added test probability investigations

commit 3021d9726b4f87c9393823b79ca3dec682dfc36a (parent: 44eff77)
Authored by Ian Ozsvald

Showing 1 changed file with 35 additions and 5 deletions.

learn1_experiments.py (+35, −5)
@@ -16,9 +16,9 @@
 from sklearn import tree
 from sklearn import svm
 from sklearn import cross_validation
-from sklearn.metrics import roc_curve, auc, precision_recall_curve
+#from sklearn.metrics import roc_curve, auc, precision_recall_curve
 from matplotlib import pyplot as plt
-from matplotlib import cm
+#from matplotlib import cm
 from nltk.corpus import stopwords
 import unicodecsv
 
@@ -91,7 +91,7 @@ def show_errors(cross_entropy_errors_by_fold, method="cross entropy", lower_is_b
     # examples (we can plot this further below using --termmatrix)
     stopWords = stopwords.words('english')
     MIN_DF = 2
-    NGRAM_RANGE = (1, 2)
+    NGRAM_RANGE = (1, 3)
     vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=NGRAM_RANGE)
     #vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 2))
     #vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 3))
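
Editor's note: the NGRAM_RANGE bump from (1, 2) to (1, 3) asks CountVectorizer for trigrams on top of unigrams and bigrams, which grows the vocabulary quickly. A minimal sketch of that effect, not part of this commit, on an invented toy corpus (get_feature_names() matches the scikit-learn era of this script; newer releases rename it get_feature_names_out()):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the apple store opened today", "eat an apple every day"]  # invented toy corpus
for ngram_range in [(1, 1), (1, 2), (1, 3)]:
    vec = CountVectorizer(binary=True, ngram_range=ngram_range)
    vec.fit(docs)
    # the feature count grows as bigrams, then trigrams, are added
    print(ngram_range, len(vec.get_feature_names()))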
@@ -101,17 +101,25 @@ def show_errors(cross_entropy_errors_by_fold, method="cross entropy", lower_is_b
     print(vectorizer)
 
     #clf = linear_model.LogisticRegression(penalty='l2', C=1.2)
-    _ = linear_model.LogisticRegression()
+    clf = linear_model.LogisticRegression()
     _ = svm.LinearSVC()
-    clf = BernoulliNB() # useful for binary inputs (MultinomialNB is useful for counts)
+    _ = BernoulliNB() # useful for binary inputs (MultinomialNB is useful for counts)
     _ = tree.DecisionTreeClassifier(compute_importances=True, max_depth=5)
 
     kf = cross_validation.KFold(n=len(target), n_folds=5, shuffle=True)
 
+    f = plt.figure(1)
+    f.clf()
+
     # try the idea of calculating a cross entropy score per fold
     cross_entropy_errors_test_by_fold = np.zeros(len(kf))
     cross_entropy_errors_train_by_fold = np.zeros(len(kf))
+
     precisions_by_fold = np.zeros(len(kf))
+    # build arrays of all the class 0 and 1 probabilities (matching the class 0
+    # and 1 gold tags)
+    probabilities_class_0_Y_test_all_folds = np.array([])
+    probabilities_class_1_Y_test_all_folds = np.array([])
     for i, (train_rows, test_rows) in enumerate(kf):
         tweets_train_rows = train_set[train_rows] # select training rows
         tweets_test_rows = train_set[test_rows] # select testing rows
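
Editor's note: this hunk flips the active classifier from BernoulliNB to LogisticRegression (the unused candidates are parked in `_` bindings) and pre-allocates arrays to collect per-fold test probabilities. As the inline comment says, BernoulliNB suits binary presence/absence features while MultinomialNB suits counts. A hedged sketch of that contrast on invented data, not taken from this repo:

import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

X_counts = np.array([[3, 0, 1], [0, 2, 0], [1, 1, 4], [0, 0, 2]])  # invented term counts
X_binary = (X_counts > 0).astype(int)  # presence/absence, as CountVectorizer(binary=True) yields
y = np.array([1, 0, 1, 0])  # invented gold labels

# BernoulliNB also models the *absence* of features; MultinomialNB uses magnitudes
print(BernoulliNB().fit(X_binary, y).predict_proba(X_binary))
print(MultinomialNB().fit(X_counts, y).predict_proba(X_counts))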
@@ -122,6 +130,16 @@ def show_errors(cross_entropy_errors_by_fold, method="cross entropy", lower_is_b
 
         clf.fit(X_train, Y_train)
         probas_test_ = clf.predict_proba(X_test)
+
+        # select and concatenate the class 0 and 1 probabilities to their
+        # respective arrays for later investigation
+        probabilities_class_1_Y_test = probas_test_[np.where(Y_test == 1)] # get all probabilities for class 1
+        nbr_features_X_test = [np.sum(row) for row in X_test[np.where(Y_test == 1)]]
+        class_1_labels = plt.scatter(nbr_features_X_test, probabilities_class_1_Y_test[:, 1], c='c', edgecolor='none', label="Class 1")
+        probabilities_class_0_Y_test = probas_test_[np.where(Y_test == 0)] # get all probabilities for class 0
+        nbr_features_X_test = [np.sum(row) for row in X_test[np.where(Y_test == 0)]]
+        class_0_labels = plt.scatter(nbr_features_X_test, probabilities_class_0_Y_test[:, 1], c='k', edgecolor='none', label="Class 0")
+
         probas_train_ = clf.predict_proba(X_train)
         # compute cross entropy for all trained and tested items in this fold
         if True:
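
Editor's note: the new in-loop block splits the predict_proba output by gold label and scatters P(class 1) against each test example's active-feature count (the row sum of the binary term matrix). A self-contained sketch of the same plotting idea, with invented arrays standing in for the tweet matrices:

import numpy as np
from matplotlib import pyplot as plt

rng = np.random.RandomState(0)
nbr_features = rng.randint(1, 30, size=200)    # invented per-example feature counts
proba_class_1 = rng.uniform(0.0, 1.0, size=200)  # invented P(class 1) values
y_true = (proba_class_1 + rng.normal(0, 0.3, size=200) > 0.5).astype(int)  # invented gold labels

mask = y_true == 1
# one scatter series per gold class, as in the commit
plt.scatter(nbr_features[mask], proba_class_1[mask], c='c', edgecolor='none', label="Class 1")
plt.scatter(nbr_features[~mask], proba_class_1[~mask], c='k', edgecolor='none', label="Class 0")
plt.xlabel('Number of features for example')
plt.ylabel('Probability of class 1 for example')
plt.legend(scatterpoints=2, loc=7)
plt.show()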
@@ -131,6 +149,18 @@ def show_errors(cross_entropy_errors_by_fold, method="cross entropy", lower_is_b
             cross_entropy_errors_train_by_fold[i] = np.average(cross_entropy_errors_train)
         precisions_by_fold[i] = precision_score(Y_test, clf.predict(X_test))
 
+        print(len(test_rows))
+        probabilities_class_0_Y_test_all_folds = np.concatenate((probabilities_class_0_Y_test_all_folds, probabilities_class_0_Y_test[:, 1]))
+        probabilities_class_1_Y_test_all_folds = np.concatenate((probabilities_class_1_Y_test_all_folds, probabilities_class_1_Y_test[:, 1]))
+
+    plt.legend((class_1_labels, class_0_labels), (class_1_labels.get_label(), class_0_labels.get_label()), scatterpoints=2, loc=7)
+    plt.xlim(xmin=-1)
+    plt.ylim(-0.05, 1.05)
+    plt.xlabel('Number of features for example')
+    plt.ylabel('Probability of class 1 for example')
+    plt.title("{} class probabilities with {} features".format(str(clf.__class__).split('.')[-1][:-2], len(vectorizer.get_feature_names())))
+    plt.show()
+
     if isinstance(clf, tree.DecisionTreeClassifier):
         # print the most important features
         feature_importances = zip(clf.feature_importances_, vectorizer.get_feature_names())
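
Editor's note: for reference, the per-fold cross-entropy scores averaged above follow the usual definition, ce = -log P(true class), computed from predict_proba output. A hedged sketch of one way to compute it per example (the helper name and data are mine, not the repo's):

import numpy as np

def cross_entropy_errors(probas, y_true, eps=1e-15):
    # -log P(true class) for each example; clip guards against log(0)
    p_true = probas[np.arange(len(y_true)), y_true]
    return -np.log(np.clip(p_true, eps, 1.0))

probas = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])  # invented predict_proba output
y_true = np.array([0, 1, 0])  # invented gold labels
print(np.average(cross_entropy_errors(probas, y_true)))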
