In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, make_scorer, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Expert feedback on the videos, stored as a tab-separated file.
comments = pd.read_csv('video_comments.tsv', delimiter='\t')

# Rebinds `stopwords` from the imported nltk corpus reader to the Portuguese
# stop-word collection; later cells hand this name to TfidfVectorizer.
stopwords = stopwords.words("portuguese")

In [3]:
# Show ten randomly drawn rows; no random_state is given, so the rows
# displayed differ from run to run.
comments.sample(n=10)

Unnamed: 0,professor,ritmo,comment
10,Expert 1,Não,O aluno pode diminuir o tamanho do passo para ...
37,Expert 1,Sim,Soltar os braços para estar mais a vontade.
63,Expert 2,Não,Começa a musica na marcação errada.
201,Expert 5,Não,O participante iniciou a primiera marcacao for...
138,Expert 3,,Não consigo avaliar o passo básico do xote com...
274,Expert 6,Não,"Olhando para baixo prejudicando a postura, tr..."
136,Expert 3,,Não consigo avaliar o passo básico do xote com...
141,Expert 3,,Não consigo avaliar o passo básico do xote com...
156,Expert 4,Não,O aluno deve trabalhar a pausa no meio do movi...
92,Expert 2,Não,Acelera entre as marcações. Atropela


### Filter text: keep only comments with more than one word

In [4]:
import re

def wc(x):
    """Count the words in the ``comment`` field of one row.

    A word is a maximal run of ``\\w`` characters.

    Parameters
    ----------
    x : mapping-like
        A row (for example a pandas Series or a dict) indexed by column name.

    Returns
    -------
    int
        Number of words in ``x['comment']``; 0 when the field is missing or
        does not hold a string (for example NaN for an empty cell).
    """
    try:
        text = x['comment']
    except (LookupError, TypeError):
        # No such field, or `x` cannot be indexed at all: nothing to count.
        return 0

    # Only real text can be tokenized; anything else counts as zero words.
    # An explicit check replaces the former bare `except`, which would also
    # have swallowed KeyboardInterrupt and SystemExit.
    if not isinstance(text, str):
        return 0

    return len(re.findall(r'\w+', text))
    
# Store the per-row word count computed by wc() in a new `wc` column.
comments['wc'] = comments.apply(wc, axis=1)

# Drop every row whose comment has one word or none.
comments = comments.loc[comments['wc'].gt(1)]

# Classification labels: the `professor` value of each remaining row.
target = comments.professor.values

### Evaluate features with Multinomial Naive Bayes

In [5]:
# Score a Multinomial Naive Bayes classifier on TF-IDF features while the
# vocabulary cap (max_features) grows from 100 to 1900 in steps of 100.
model = MultinomialNB(alpha=0.001)
average = 'macro'

# One scorer per reported metric, all sharing the same averaging mode.
scoring = {
    'f1': make_scorer(f1_score, average=average),
    'recall': make_scorer(recall_score, average=average),
    'precision': make_scorer(precision_score, average=average),
}

for i in range(100, 2000, 100):
    # NOTE(review): the vectorizer is fitted on every comment before the folds
    # are split, so the vocabulary and IDF weights also see the held-out rows.
    # Putting the vectorizer and the model in one Pipeline would avoid that,
    # but it would change the reported scores -- confirm before switching.
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    # A single 10-fold run produces all three metrics from the same fitted
    # models, instead of repeating the whole cross-validation once per metric.
    # The sparse matrix is passed as is; no dense copy is needed.
    scores = cross_validate(model, data, target, cv=10, scoring=scoring)
    f1 = scores['test_f1'].mean()
    recall = scores['test_recall'].mean()
    precision = scores['test_precision'].mean()

    print('{}: f1({}), precision({}), recall({})'.format(
        i, round(f1, 4), round(precision, 4), round(recall, 4)))

100: f1(0.8134), precision(0.8555), recall(0.8275)
200: f1(0.8952), precision(0.9208), recall(0.8975)
300: f1(0.8912), precision(0.9181), recall(0.895)
400: f1(0.9104), precision(0.9322), recall(0.9133)
500: f1(0.9147), precision(0.9356), recall(0.9175)
600: f1(0.9147), precision(0.9356), recall(0.9175)
700: f1(0.9147), precision(0.9356), recall(0.9175)
800: f1(0.9147), precision(0.9356), recall(0.9175)
900: f1(0.9144), precision(0.935), recall(0.9175)
1000: f1(0.9144), precision(0.935), recall(0.9175)
1100: f1(0.9144), precision(0.935), recall(0.9175)
1200: f1(0.9144), precision(0.935), recall(0.9175)
1300: f1(0.9144), precision(0.935), recall(0.9175)
1400: f1(0.9144), precision(0.935), recall(0.9175)
1500: f1(0.9144), precision(0.935), recall(0.9175)
1600: f1(0.9144), precision(0.935), recall(0.9175)
1700: f1(0.9144), precision(0.935), recall(0.9175)
1800: f1(0.9144), precision(0.935), recall(0.9175)
1900: f1(0.9144), precision(0.935), recall(0.9175)


### Evaluate features with a linear SVM

In [6]:
# Score a linear SVM on TF-IDF features while the vocabulary cap
# (max_features) grows from 100 to 1900 in steps of 100.
model = svm.LinearSVC(C=2.15)
average = 'macro'

# One scorer per reported metric, all sharing the same averaging mode.
scoring = {
    'f1': make_scorer(f1_score, average=average),
    'recall': make_scorer(recall_score, average=average),
    'precision': make_scorer(precision_score, average=average),
}

for i in range(100, 2000, 100):
    # NOTE(review): the vectorizer is fitted on every comment before the folds
    # are split, so the vocabulary and IDF weights also see the held-out rows.
    # Putting the vectorizer and the model in one Pipeline would avoid that,
    # but it would change the reported scores -- confirm before switching.
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    # A single 10-fold run produces all three metrics from the same fitted
    # models. LinearSVC is created without a random_state, so separate runs
    # per metric are not guaranteed to score identical fits.
    # The sparse matrix is passed as is; no dense copy is needed.
    scores = cross_validate(model, data, target, cv=10, scoring=scoring)
    f1 = scores['test_f1'].mean()
    recall = scores['test_recall'].mean()
    precision = scores['test_precision'].mean()

    print('{}: f1({}), precision({}), recall({})'.format(
        i, round(f1, 4), round(precision, 4), round(recall, 4)))

100: f1(0.8749), precision(0.9041), recall(0.8758)
200: f1(0.9251), precision(0.9443), recall(0.9258)
300: f1(0.9266), precision(0.9456), recall(0.9308)
400: f1(0.9219), precision(0.9428), recall(0.9267)
500: f1(0.9271), precision(0.9428), recall(0.9308)
600: f1(0.9248), precision(0.9386), recall(0.9283)
700: f1(0.9294), precision(0.943), recall(0.9317)
800: f1(0.9289), precision(0.9408), recall(0.9317)
900: f1(0.9359), precision(0.9479), recall(0.9392)
1000: f1(0.9287), precision(0.9439), recall(0.9317)
1100: f1(0.9287), precision(0.9439), recall(0.9317)
1200: f1(0.9287), precision(0.9439), recall(0.9317)
1300: f1(0.9287), precision(0.9439), recall(0.9317)
1400: f1(0.9287), precision(0.9439), recall(0.9317)
1500: f1(0.9287), precision(0.9439), recall(0.9317)
1600: f1(0.9287), precision(0.9439), recall(0.9317)
1700: f1(0.9287), precision(0.9439), recall(0.9317)
1800: f1(0.9287), precision(0.9439), recall(0.9317)
1900: f1(0.9287), precision(0.9439), recall(0.9317)
