In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

In [2]:
df_bossa = pd.read_csv('./data/bossa_nova.csv')

df_funk = pd.read_csv('./data/funk.csv')

df_gospel = pd.read_csv('./data/gospel.csv')

df_sertanejo = pd.read_csv('./data/sertanejo.csv')

In [3]:
# cria base de dados única

df_bossa['genre'] = 'bossa'
df_funk['genre'] = 'funk'
df_gospel['genre'] = 'gospel'
df_sertanejo['genre'] = 'sertanejo'

frames = [df_bossa, df_funk, df_gospel, df_sertanejo]

base_dados = pd.concat(frames)

In [4]:
def preprocessamento(df):
  # converte letras para lowercase
  df['lyric'] = df['lyric'].apply(lambda x: x.lower())

  # remove pontuação
  df['lyric'] = df['lyric'].str.replace(r'[^\w\s]', '')

  # remove \n|\r|\n\r
  df['lyric'] = df['lyric'].str.replace(r'\n|\r|\n\r', ' ')

  # remove stopwords
  stop = set(stopwords.words('portuguese'))
  df['lyric'] = df['lyric'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  # convertendo label 'genre' para números
  le = LabelEncoder()
  df['genre'] = le.fit(df['genre']).transform(df['genre'])
    
  return df

df = preprocessamento(base_dados)

In [21]:
# normalizacao 
from sklearn.feature_extraction.text import TfidfVectorizer
count_vec = TfidfVectorizer()
count_vec.set_params(stop_words=None, max_features=30000, min_df=4, ngram_range=(1, 2))
counts = count_vec.fit_transform(df.lyric)

scale = MinMaxScaler()
df_numeric = scale.fit_transform(counts.toarray())
df_numeric = pd.DataFrame(df_numeric)

In [48]:
x_train, x_test, y_train, y_test = train_test_split(df_numeric, df.genre, test_size = .1, random_state=200)

In [49]:
lr = LogisticRegression()
fit = lr.fit(x_train, y_train)
pred = fit.predict(x_test)
accuracy = accuracy_score(y_test, pred)

accuracy



0.871875

In [50]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
fit = nb.fit(x_train, y_train)
pred = fit.predict(x_test)
accuracy = accuracy_score(y_test, pred)

accuracy

0.840625

In [51]:
from sklearn.linear_model import SGDClassifier
sgdc = SGDClassifier(loss='hinge', penalty='l2',
                     alpha=1e-3, random_state=42)
fit = sgdc.fit(x_train, y_train)
pred = fit.predict(x_test)
accuracy = accuracy_score(y_test, pred)

accuracy



0.865625

In [52]:
from sklearn.neural_network import MLPClassifier
decision = MLPClassifier(alpha=1)
fit = decision.fit(x_train, y_train)
pred = fit.predict(x_test)
accuracy = accuracy_score(y_test, pred)

accuracy

0.86875

In [53]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(9)
fit = decision.fit(x_train, y_train)
pred = fit.predict(x_test)
accuracy = accuracy_score(y_test, pred)

accuracy

0.86875