In [1]:
from rap_db import *
from rap_clean import *
from rap_viz import line, verse_graph
from nltk.stem import *
from nltk import pos_tag
#import nltk
#nltk.download()
from textblob import TextBlob
import pandas as pd
import numpy as np
from copy import copy
from sklearn.model_selection import train_test_split as tr_ts_spl


In [2]:
doom = art_load(['Doom'])['Doom']
chief = art_load(['Chief Keef'])['Chief Keef']

In [3]:
#verse count for each
print(len(doom.uniq_art_verses))
print(len(chief.uniq_art_verses))

112
148


In [24]:
class line_data():
    def __init__(self, in_line_obj):
        self.line_obj = in_line_obj    
        self.gen_line_stem()
        self.gen_line_metrics()

    #want to run in multinomial and bernouli ways (one with frequency one with there not there binary)
    def gen_line_stem(self):
        stemmer = SnowballStemmer("english")
        words_stm = [stemmer.stem(w.lower()) for w in self.line_obj.words_as_strings if stemmer.stem(w.lower()) not in stopwords.words('english')]
        self.all_stemmed_words = list(filter(None, words_stm))
        self.unique_stemmed_words = set(self.all_stemmed_words)
        
    #may want to add stemming if accuracy shitty
    def gen_line_metrics(self):
        #get the vowel strings needed
        ex_vwls = self.line_obj.vowel_sounds
        nr_vwls = [v[:2] for v in ex_vwls]
        brd_vwls = [v[:1] for v in ex_vwls]
        #then vowel sounds for unique words, do it this way to not remake word objects
        check = set()
        ex_vwls_uniqs = []
        for w in self.line_obj.word_objs:
            if w.text.lower() not in check:
                check = check|{w.text.lower()}
                ex_vwls_uniqs.extend(list(zip(*w.matches))[1])
        nr_vwls_uniqs = [v[:2] for v in ex_vwls_uniqs]
        brd_vwls_uniqs = [v[:1] for v in ex_vwls_uniqs]
        
        #these are used a lot
        wrds = self.line_obj.words_as_strings
        unq_wrds = self.line_obj.uniq_words_as_strings
        wrd_cnt = len(wrds)
        unq_wrd_cnt = len(unq_wrds)
        blobs = TextBlob(" ".join(wrds)).sentiment
        
        #word based metrics
        self.metrics={'avg_wrd_len':sum(map(len,wrds))/wrd_cnt,
        'avg_unq_wrd_len':sum(map(len,unq_wrds))/unq_wrd_cnt,
        'unq_wrds_rat':unq_wrd_cnt/wrd_cnt,
                      
        #vowel based metrics
            #average vowel sounds per word
        'avg_wrd_vwls':len(ex_vwls)/wrd_cnt,
            #average vowel sounds per unique word
        'avg_unq_wrd_vwls':len(ex_vwls_uniqs)/unq_wrd_cnt,
            #average unique vowel sounds per word
        'avg_wrd_brd_unq_vwls':len(set(brd_vwls))/wrd_cnt,
        'avg_wrd_nr_unq_vwls':len(set(nr_vwls))/wrd_cnt,
        'avg_wrd_ex_unq_vwls':len(set(ex_vwls))/wrd_cnt,
            #average unique vowel sounds per unique word
        'avg_unq_wrd_brd_unq_vwls':len(set(brd_vwls_uniqs))/unq_wrd_cnt,
        'avg_unq_wrd_nr_unq_vwls':len(set(nr_vwls_uniqs))/unq_wrd_cnt,
        'avg_unq_wrd_ex_unq_vwls':len(set(ex_vwls_uniqs))/unq_wrd_cnt,
                      
        #specialized metrics
        'pol':blobs.polarity,
        'subj':blobs.subjectivity,
        'uniq_pos_rat': len(set(list(zip(*pos_tag(wrds)))[1]))/wrd_cnt,
        'uniq_pos_unq_wrd_rat': len(set(list(zip(*pos_tag(unq_wrds)))[1]))/unq_wrd_cnt}
        return self.metrics.keys()

In [40]:
#used to be 5 functions. Was running through lines 3-4 times and only need to do it once
#delete comments when happy
def artist_to_data(art_obj, inp_pop=False, inp_exc_line=True, inp_opto_type='near'):#opto stuff
    #sample to get keys for line data metrics, not needed later
    _ld = line_data(verse_graph(art_obj.uniq_art_verses[0], '', '').ver_as_lines[0])
    #make a DF for general language metrics
    metric_df = pd.DataFrame(columns = list(_ld.gen_line_metrics())+['artist'])
    #make a DF for pure word training
    lingustic_df = pd.DataFrame(columns=['artist','text','unique_text'])
    #iterate through artists unique art verses
    for v in art_obj.uniq_art_verses:
        #make a verse graph for each verse
        verse_g = verse_graph(v, art_obj.name, '')
        #opto verse
        verse_g.opto_matches(pop=inp_pop, exc_line=inp_exc_line, opto_type=inp_opto_type, record=False)
        #iterate through each line in verse graph
        for v_line in verse_g.ver_as_lines:
            #if the line has actually registered lines
            if v_line.word_objs:
                #make a line_data object from the line object and setup the dictionaries to feed to DF
                line_data_obj = line_data(v_line)
                app_dic = copy(line_data_obj.metrics)
                app_dic.update({'artist':art_obj.name})
                #append to metric DF
                metric_df = metric_df.append(app_dic, ignore_index=True)
                #append to lingustic DF
                lingustic_df = lingustic_df.append({'text':line_data_obj.all_stemmed_words, 'unique_text':line_data_obj.unique_stemmed_words, 'artist':art_obj.name}, ignore_index=True)  
    return metric_df, lingustic_df

In [41]:
def train_test_validate_split(art_lines, rs1=42, rs2=41):
    y = art_lines['artist']
    x = art_lines.ix[:, art_lines.columns.difference(['artist'])]
    x_tr, _x_ts, y_tr, _y_ts = tr_ts_spl(x, y, test_size=0.4, random_state=rs1)
    x_ts, x_vl, y_ts, y_vl = tr_ts_spl(_x_ts, _y_ts, test_size=0.5, random_state=rs2)
    return {'x_train':x_tr,'y_train':y_tr,'x_test':x_ts,'y_test':y_ts,'x_val':x_vl,'y_val':y_vl}

In [44]:
d_met, d_lin = artist_to_data(doom, inp_pop=2, inp_exc_line=False, inp_opto_type='exact')

In [49]:
c_met, c_lin = artist_to_data(chief, inp_pop=False, inp_exc_line=True, inp_opto_type='near')

In [50]:
comb_data = train_test_validate_split(d_met.append(c_met))



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



In [54]:
from sklearn.svm import SVC
svm_mod = SVC()
svm_mod.fit(comb_data['x_train'], comb_data['y_train'])
svm_mod.score(comb_data['x_test'], comb_data['y_test'])

0.65506653019447292

What you did
built basic data sorting functions, ran SVM with no tuning 67% accuracy


What you need to do next
seet up lingusitc data set trainngi
TEST NEW IMPLMENTATION LOOKS GOOD

Notes

Long term
Train models using two different training methadologies
1. text bag of words (simply look at words in textand classify using a naive bayes, random forest, SVM)
2. make a row for every line based on the whiteboarded lingustic measures (def use svm, maybe random forest, maybe KNN)