In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Load the tab-separated comments file into a DataFrame.
comments = pd.read_csv('video_comments.tsv', sep='\t')
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object to
# the result of `.words("portuguese")`. Later cells pass this value to
# TfidfVectorizer(stop_words=...), so the name is kept as-is; the corpus object is
# only reachable again by re-running the import.
stopwords = stopwords.words("portuguese")

In [2]:
# Preview ten randomly drawn rows (no random_state is set, so the rows vary per run).
comments.sample(n=10)

Unnamed: 0,professor,ritmo,comment
246,Expert 6,Não,"Falta transferência de peso, braço/ombro esque..."
76,Expert 2,Não,Uso desnecessário dos braços. Acelera muito a ...
44,Expert 1,Sim,
98,Expert 3,Não,"O movimento não tem qualidade, é um movimento ..."
8,Expert 1,Não,Falta ela se soltar mais e sentir a música. E...
212,Expert 5,Não,Participante nao se apresenta no ritmo e nao i...
69,Expert 2,Sim,Parece que ainda acelera um pouco. Uso latera...
140,Expert 3,Não,Falta relaxamento das articulações (tornozelos...
213,Expert 5,Sim,Participante se apresenta no ritmo desde o pri...
46,Expert 1,Sim,Cuidar transferência de peso. Transfira todo o...


### Filter text: keep only comments with more than one word

In [3]:
import re

def wc(x):
    """Count the word tokens in a row's ``comment`` field.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        The text is looked up under the ``'comment'`` key.

    Returns
    -------
    int
        Number of runs of regex word characters found in the comment, or 0
        when the lookup fails or the value is not a string (for example a
        NaN/None placeholder for a missing comment).
    """
    # Only the failure modes of the lookup itself are treated as "no comment";
    # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        text = x['comment']
    except (KeyError, IndexError, TypeError):
        return 0

    # Missing comments arrive as non-string values; they contain no words.
    if not isinstance(text, str):
        return 0

    return len(re.findall(r'\w+', text))
    
# Attach a per-row word count, then keep only the rows whose comment has more
# than one word (rows for which `wc` returned 0 or 1 are dropped).
word_counts = comments.apply(wc, axis=1)
comments = comments.assign(wc=word_counts)
comments = comments.loc[comments['wc'].gt(1)]

# Labels for the classifiers below: the `professor` column of the kept rows.
target = comments['professor'].values

### Evaluate the number of TF-IDF features with Multinomial Naive Bayes

In [8]:
# Score MultinomialNB on TF-IDF features while the vocabulary size grows from
# 10 to 140 terms in steps of 10.
model = MultinomialNB(alpha=0.001)
average = 'macro'

# One scorer per reported metric, so a single cross-validation pass yields all
# three values instead of re-fitting the model once per metric.
scoring = {
    'f1': make_scorer(f1_score, average=average),
    'precision': make_scorer(precision_score, average=average),
    'recall': make_scorer(recall_score, average=average),
}

for i in range(10, 150, 10):
    # NOTE(review): the vectorizer is fitted on all comments before the folds
    # are split, so the vocabulary/IDF weights also see the held-out rows.
    # Consider wrapping vectorizer + model in a Pipeline to avoid this.
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    # Densify once and run 10-fold cross-validation once for all metrics.
    scores = cross_validate(model, data.toarray(), target, cv=10,
                            scoring=scoring, return_train_score=False)
    f1 = scores['test_f1'].mean()
    recall = scores['test_recall'].mean()
    precision = scores['test_precision'].mean()

    print(str(i) + ': ' + 'f1(' + str(round(f1,4))
          + '), precision(' + str(round(precision,4))
          + '), recall(' + str(round(recall,4)) + ')')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


10: f1(0.5528), precision(0.6017), recall(0.5825)
20: f1(0.6618), precision(0.7053), recall(0.6867)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


30: f1(0.7199), precision(0.7756), recall(0.7375)
40: f1(0.7332), precision(0.7934), recall(0.7458)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


50: f1(0.7175), precision(0.7643), recall(0.735)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


60: f1(0.6984), precision(0.7451), recall(0.7192)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


70: f1(0.7089), precision(0.7546), recall(0.7317)
80: f1(0.7576), precision(0.7944), recall(0.7808)
90: f1(0.807), precision(0.8512), recall(0.82)
100: f1(0.8134), precision(0.8555), recall(0.8275)
110: f1(0.8131), precision(0.8503), recall(0.8267)
120: f1(0.8321), precision(0.8723), recall(0.8408)
130: f1(0.8459), precision(0.8822), recall(0.8542)
140: f1(0.8773), precision(0.9043), recall(0.88)


### Evaluate the number of TF-IDF features with a linear SVM (LinearSVC)

In [9]:
# Score LinearSVC on TF-IDF features while the vocabulary size grows from
# 10 to 140 terms in steps of 10.
model = svm.LinearSVC(C=2.15)
average = 'macro'

# One scorer per reported metric, so a single cross-validation pass yields all
# three values instead of re-fitting the model once per metric.
scoring = {
    'f1': make_scorer(f1_score, average=average),
    'precision': make_scorer(precision_score, average=average),
    'recall': make_scorer(recall_score, average=average),
}

for i in range(10, 150, 10):
    # NOTE(review): the vectorizer is fitted on all comments before the folds
    # are split, so the vocabulary/IDF weights also see the held-out rows.
    # Consider wrapping vectorizer + model in a Pipeline to avoid this.
    data = TfidfVectorizer(max_features=i, strip_accents='unicode', stop_words=stopwords).fit_transform(comments.comment)

    # Densify once and run 10-fold cross-validation once for all metrics.
    scores = cross_validate(model, data.toarray(), target, cv=10,
                            scoring=scoring, return_train_score=False)
    f1 = scores['test_f1'].mean()
    recall = scores['test_recall'].mean()
    precision = scores['test_precision'].mean()

    print(str(i) + ': ' + 'f1(' + str(round(f1,4))
          + '), precision(' + str(round(precision,4))
          + '), recall(' + str(round(recall,4)) + ')')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


10: f1(0.5922), precision(0.6701), recall(0.6125)


  'precision', 'predicted', average, warn_for)


20: f1(0.6982), precision(0.7569), recall(0.72)
30: f1(0.7602), precision(0.8011), recall(0.7708)
40: f1(0.7936), precision(0.8389), recall(0.8008)
50: f1(0.8007), precision(0.832), recall(0.81)
60: f1(0.8013), precision(0.8338), recall(0.8092)
70: f1(0.8229), precision(0.8572), recall(0.8258)
80: f1(0.8441), precision(0.8749), recall(0.8525)
90: f1(0.8767), precision(0.8989), recall(0.8825)
100: f1(0.8749), precision(0.9041), recall(0.8758)
110: f1(0.8668), precision(0.8919), recall(0.8742)
120: f1(0.8928), precision(0.9154), recall(0.8958)
130: f1(0.8932), precision(0.9133), recall(0.895)
140: f1(0.8992), precision(0.9234), recall(0.8983)
