In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import oll

# Load the annotated corpus from a gzip-compressed CSV in the working directory.
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
# Rebind `stopwords` from the imported NLTK corpus reader to the Portuguese
# stop-word collection it returns; later cells pass this to TfidfVectorizer.
# NOTE(review): because the name is reused, re-running only this line without
# re-running the import above will presumably fail — confirm before reordering.
stopwords = stopwords.words("portuguese")

In [2]:
# Disabled filter, kept for reference: keep only rows whose `_golden` flag is False.
#corpus = corpus[corpus['_golden'] == False]

# Keep only the rows whose label confidence is exactly 1, then renumber them.
# reset_index() without drop=True keeps the previous index as an extra column.
corpus = (
    corpus
    .loc[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
    .reset_index()
)
corpus.shape

(551, 28)

In [4]:
# fix labels to binary
def classFit(x):
    """Encode one annotated row as a binary label.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide the key 'qual_a_melhor_classificao_para_esse_texto'.

    Returns
    -------
    int
        1 when the label equals "diario", -1 for any other value.
    """
    label = x['qual_a_melhor_classificao_para_esse_texto']
    return 1 if label == "diario" else -1
    
# Encode every row with classFit and keep the labels both as a DataFrame
# column and as a plain array for the cross-validation cells.
label_column = 'qual_a_melhor_classificao_para_esse_texto'
corpus['class'] = corpus.apply(classFit, axis=1)
target = corpus['class'].values

# Sanity check: first two raw labels followed by their encoded values.
print(corpus[label_column].values[:2])
print(corpus['class'].head(2))

['diario' 'diario']
0    1
1    1
Name: class, dtype: int64


In [None]:
import nltk.stem

# Single module-level stemmer instance, reused for every token of every document.
portuguese_stemmer = nltk.stem.RSLPStemmer()


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant whose analyzer stems each token.

    The parent analyzer is applied first; every token it yields is then
    passed through ``portuguese_stemmer.stem``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        # Name this subclass in super() so the MRO lookup starts right after
        # StemmedTfidfVectorizer. Naming TfidfVectorizer would skip any
        # build_analyzer that TfidfVectorizer itself defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [17]:
## manual 2-fold cross-validation (n_splits=2) over contiguous, unshuffled folds
kf = KFold(n_splits=2, random_state=None, shuffle=False)

# Sweep the TF-IDF vocabulary size and report the fold-averaged metrics for each size.
for i in range(500, 5000, 100):
    # NOTE(review): the vectorizer is fitted on the whole corpus before the
    # folds are split, so vocabulary and IDF weights also see the test
    # documents — confirm this is acceptable for the reported scores.
    tfidf = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords)
    data = tfidf.fit_transform(corpus.content)

    # Per-fold scores for the current vocabulary size.
    accuracy, precision, recall = [], [], []

    for train_index, test_index in kf.split(data):
        X_train, y_train = data[train_index], target[train_index]
        X_test, y_test = data[test_index], target[test_index]

        # A fresh oll "CW" model per fold, so nothing learned carries over.
        model = oll.oll("CW", C=1)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    # str() is used deliberately so the printed numbers keep their original formatting.
    summary = [
        str(i),
        ': acc(', str(np.mean(accuracy)),
        '), prec(', str(np.mean(precision)),
        '), rec(', str(np.mean(recall)), ')',
    ]
    print(''.join(summary))

500: acc(0.773162055336), prec(0.787614678899), rec(0.89360743365)
600: acc(0.77314229249), prec(0.782922654913), rec(0.901671949779)
700: acc(0.774953886693), prec(0.776339147029), rec(0.918810988874)
800: acc(0.782233201581), prec(0.780231023102), rec(0.927279507738)
900: acc(0.79312911726), prec(0.788461538462), rec(0.933261855926)
1000: acc(0.78222002635), prec(0.778484043711), rec(0.930169681149)
1100: acc(0.78220685112), prec(0.778299011711), rec(0.929967679781)
1200: acc(0.782213438735), prec(0.775792464115), rec(0.935748026602)
1300: acc(0.778563899868), prec(0.774526919264), rec(0.929967679781)
1400: acc(0.77674571805), prec(0.770164348925), rec(0.935344023867)
1500: acc(0.771297760211), prec(0.76364628821), rec(0.93803219591)
1600: acc(0.778557312253), prec(0.767070484581), rec(0.946702716141)
1700: acc(0.782173913043), prec(0.770436303766), rec(0.946500714774)
1800: acc(0.785816864295), prec(0.770419916744), rec(0.955171235005)
1900: acc(0.782180500659), prec(0.768126338762)

In [18]:
# Evaluate the chosen vocabulary size (900 features) and keep the fitted
# vectorizer so its vocabulary can be inspected afterwards.
# `kf` is the KFold splitter created in the earlier cross-validation cell.
# NOTE(review): the vectorizer is fitted on the whole corpus before the folds
# are split, so IDF weights also see the test documents — confirm acceptable.
vectorizer = TfidfVectorizer(max_features=900, strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

# Per-fold scores.
accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(data):

    # A fresh oll "CW" model per fold, so nothing learned carries over.
    model = oll.oll("CW", C=1)

    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, predicted))
    precision.append(precision_score(y_test, predicted))
    recall.append(recall_score(y_test, predicted))

# The number of columns of the TF-IDF matrix is the vocabulary size. Reading
# it from `data` avoids `get_feature_names()`, which was removed in
# scikit-learn 1.2, and prints the same value on every version.
print(str(data.shape[1]) + ': acc(' + str(np.mean(accuracy))
      + '), prec(' + str(np.mean(precision))
      + '), rec(' + str(np.mean(recall)) + ')'
      )

900: acc(0.79312911726), prec(0.788461538462), rec(0.933261855926)


In [19]:
# Export the vocabulary of the fitted vectorizer, ordered by feature index.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement when present and fall back on older versions. list() keeps
# `feature_names` a plain list of strings on either path.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(feature_names).to_csv('feature_names_cw.csv')