# sentiment1.py
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold
def make_xy(data, vectorizer=None):
    """
    Turn a dataframe into a feature matrix and a label array.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a `text` column (documents) and a `performance`
        column (class labels).
    vectorizer : optional
        An object with `fit(texts)` and `transform(texts)` methods.
        Defaults to a fresh CountVectorizer(min_df=0), i.e. every
        token is kept no matter how rare.

    Returns
    -------
    (x, perfarray) : x is a scipy CSC sparse document-term matrix,
        perfarray is a numpy array of the `performance` labels.
    """
    if vectorizer is None:  # `is None`, not `== None` (PEP 8)
        vectorizer = CountVectorizer(min_df=0)
    # Series.iteritems() was removed in pandas 2.0; tolist() gives the
    # same list of documents on every pandas version.
    text = data.text.tolist()
    vectorizer.fit(text)
    x = vectorizer.transform(text)
    # Column-sparse layout (fast column slicing for per-feature work).
    x = sparse.csc_matrix(x)
    perfarray = np.array(data.performance)
    return x, perfarray
def log_likelihood(clf, x, y):
    """
    Total log-likelihood of the true labels `y` under classifier `clf`.

    Sums column 1 of predict_log_proba for positive samples (y == 1)
    and column 0 for negative samples (y == 0).
    """
    logp = clf.predict_log_proba(x)
    positive_term = logp[y == 1, 1].sum()
    negative_term = logp[y == 0, 0].sum()
    return positive_term + negative_term
def cv_score(clf, x, y, score_func):
    """
    Uses 5-fold cross validation to estimate a score of a classifier

    Inputs
    ------
    clf : Classifier object
    x : Input feature vector
    y : Input class labels
    score_func : Function like log_likelihood, that takes (clf, x, y) as input,
                 and returns a score

    Returns
    -------
    The average score obtained by splitting (x, y) into 5 contiguous
    train/test folds, fitting on each training set, and evaluating
    score_func on the held-out test set.

    Examples
    --------
    cv_score(clf, x, y, log_likelihood)
    """
    nfold = 5
    indices = np.arange(y.size)
    result = 0.0
    # np.array_split reproduces the legacy sklearn.cross_validation
    # KFold(n, nfold) behaviour this code was written against:
    # contiguous, unshuffled folds, the first (n % nfold) folds one
    # element larger. That API was removed from scikit-learn, so we
    # build the splits directly instead.
    for test in np.array_split(indices, nfold):
        train = np.setdiff1d(indices, test)   # sorted complement of the fold
        clf.fit(x[train], y[train])           # fit on the training portion
        result += score_func(clf, x[test], y[test])  # score on held-out data
    return result / nfold                     # average over folds