In [12]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn. naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [13]:
#get data

In [14]:
df_weeknd = pd.read_csv('../data/df_weeknd.csv')
df_queen = pd.read_csv('../data/df_queen.csv')
corpus_weeknd = list(df_weeknd['song_line'])
corpus_queen = list(df_queen['song_line'])

In [15]:
labels = (['weeknd']*len(corpus_weeknd))+(['queen']*len(corpus_queen))
corpus = list(df_weeknd['song_line'])+list(df_queen['song_line'])

In [16]:
Xtrain, Xtest, ytrain, ytest = train_test_split(corpus, labels, random_state=42)

In [17]:
len(Xtrain), len(Xtest), len(ytrain), len(ytest)

(1074, 359, 1074, 359)

In [42]:
# build model
def mul_model(text,label):
    """
    Takes in list of songs
    trains model on it with labels,
    and returns trained model
    """
    print('Training model...\n')
    cv = CountVectorizer(stop_words='english', ngram_range=(2,3))
    tf = TfidfTransformer()
    nb = MultinomialNB(alpha=1, fit_prior=True, class_prior=[0.31, 0.69])
    model = make_pipeline(cv,tf,nb)
    model.fit(text, label)
    return model

In [43]:
#predict
def mul_predict(m, test):
    """
    Takes the pre-trained pipeline model and predicts new artist.
    """
    predict = m.predict(test)
    probs = m.predict_proba(test).round(2)
    return predict

In [44]:
def get_score(pred, true):
    positive = 0
    negative = 0
    for m, n in zip(pred, true):
        if m==n:
            positive = positive+1
        elif m!=n:
            negative = negative+1
    return positive/(negative+positive), positive, negative
m = train_model(Xtrain, ytrain)
pred = predict(m, Xtest)            
get_score(pred, ytest)        

Training model...



(0.9442896935933147, 339, 20)

In [45]:
m.score(Xtrain,ytrain)

0.9981378026070763

In [46]:
m.predict_log_proba(Xtrain)

array([[-3.20403378e+00, -4.14452195e-02],
       [-1.38243001e-01, -2.04706760e+00],
       [-1.16756850e-01, -2.20547219e+00],
       ...,
       [-2.52614913e-04, -8.28377062e+00],
       [-3.65037775e-03, -5.61474926e+00],
       [-5.82980293e-03, -5.14768557e+00]])

In [47]:
m.predict_proba(Xtrain).round(2)

array([[0.04, 0.96],
       [0.87, 0.13],
       [0.89, 0.11],
       ...,
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01]])

In [48]:
m.get_params()

{'memory': None,
 'steps': [('countvectorizer',
   CountVectorizer(ngram_range=(2, 3), stop_words='english')),
  ('tfidftransformer', TfidfTransformer()),
  ('multinomialnb', MultinomialNB(alpha=1))],
 'verbose': False,
 'countvectorizer': CountVectorizer(ngram_range=(2, 3), stop_words='english'),
 'tfidftransformer': TfidfTransformer(),
 'multinomialnb': MultinomialNB(alpha=1),
 'countvectorizer__analyzer': 'word',
 'countvectorizer__binary': False,
 'countvectorizer__decode_error': 'strict',
 'countvectorizer__dtype': numpy.int64,
 'countvectorizer__encoding': 'utf-8',
 'countvectorizer__input': 'content',
 'countvectorizer__lowercase': True,
 'countvectorizer__max_df': 1.0,
 'countvectorizer__max_features': None,
 'countvectorizer__min_df': 1,
 'countvectorizer__ngram_range': (2, 3),
 'countvectorizer__preprocessor': None,
 'countvectorizer__stop_words': 'english',
 'countvectorizer__strip_accents': None,
 'countvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'countvectorizer__tok