In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from nltk.corpus import stopwords

comments = pd.read_csv('video_comments.tsv', sep='\t')
stopwords = stopwords.words("portuguese")

In [37]:
comments.sample(10)

Unnamed: 0,professor,ritmo,comment
241,Expert 6,Não,"Falta transferência de peso, braço/ombro esque..."
117,Expert 3,Não,O participante está quase no ritmo da música m...
219,Expert 5,Sim,O participante se apresenta no ritmo durante t...
23,Expert 1,Sim,"Olhar menos para o chão, soltar os braços e re..."
111,Expert 3,Não,O participante ouve a música e consegue identi...
244,Expert 6,Não,"Falta transferência de peso, braço/ombro esque..."
124,Expert 3,Não,A participante está no compasso da música e re...
159,Expert 4,Sim,O aluno ainda não parece ter muita confiança n...
20,Expert 1,Sim,"O aluno se atrapalhou algumas vezes, mas teve ..."
42,Expert 1,Sim,"O passo deve ser mais arrastado, sem ponta de ..."


### binary target

In [38]:
def classFit(x):
    if x['ritmo'] == "Sim":
        return 1
    else:
        return -1
    
comments['class'] = comments.apply(classFit,axis=1)

### filter text

In [39]:
import re

def wc(x): 
    try:
        return len(re.findall(r'\w+', x['comment']))
    except:
        return 0
    
comments['wc'] = comments.apply(wc,axis=1)
comments = comments[comments['wc'] > 1]

target = comments['class'].values

### evaluate features with M NB

In [42]:
model = MultinomialNB(alpha=0.001)

for i in range(100,2000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')')

100: f1(0.5698), acc(0.6797), precision(0.6536), recall(0.5727)
200: f1(0.5271), acc(0.632), precision(0.5844), recall(0.5545)
300: f1(0.5125), acc(0.6109), precision(0.5769), recall(0.5645)
400: f1(0.5273), acc(0.6254), precision(0.5946), recall(0.5655)
500: f1(0.4933), acc(0.6081), precision(0.5539), recall(0.5282)
600: f1(0.4981), acc(0.6116), precision(0.5568), recall(0.5373)
700: f1(0.5003), acc(0.6079), precision(0.5529), recall(0.5473)
800: f1(0.4975), acc(0.601), precision(0.5418), recall(0.5473)
900: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1000: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1100: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1200: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1300: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1400: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1500: f1(0.4914), acc(0.5899), precision(0.5331), recall(0.5473)
1600: f1(0.4914), acc(0.5899), preci

### evaluate features with Linear SVM

In [43]:
model = svm.LinearSVC(C=2.15)

for i in range(100,2000,100):
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    f1 = cross_val_score(model, data.toarray(), target, cv=10, scoring='f1').mean()
    acc = cross_val_score(model, data.toarray(), target, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, data.toarray(), target, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, data.toarray(), target, cv=10, scoring='precision').mean()
    
    print(str(i) + ': ' + 'f1(' + str(round(f1,4)) 
          + '), acc(' + str(round(acc,4)) 
          + '), precision(' + str(round(precision,4)) 
          + '), recall(' + str(round(recall,4)) + ')')

100: f1(0.5204), acc(0.6299), precision(0.5609), recall(0.5491)
200: f1(0.5297), acc(0.6187), precision(0.5356), recall(0.6)
300: f1(0.5151), acc(0.6075), precision(0.5137), recall(0.5791)
400: f1(0.5122), acc(0.6149), precision(0.5155), recall(0.5709)
500: f1(0.5033), acc(0.6011), precision(0.5212), recall(0.5527)
600: f1(0.5133), acc(0.6008), precision(0.5153), recall(0.5791)
700: f1(0.4983), acc(0.5977), precision(0.5113), recall(0.5518)
800: f1(0.526), acc(0.6255), precision(0.5249), recall(0.5891)
900: f1(0.5265), acc(0.626), precision(0.5257), recall(0.59)
1000: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1100: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1200: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1300: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1400: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1500: f1(0.5286), acc(0.6295), precision(0.5283), recall(0.59)
1600: f1(0.5286), acc(0.6295), precision(0.5283), rec