In [34]:
import pandas as pd
# Load the artist/lyrics table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/df/artist_song.csv')
# Preview the first rows.
df.head(25)

Unnamed: 0,artist,lyrics,song_strings_lists
0,bastille_,"Ooh, I'm not ready\nOoh, I'm not ready\nOoh, I...","['ooh', 'ready', 'ooh', 'ready', 'ooh', 'ready..."
1,rolling_blackouts_coastal_fever,Stepping into the town\nIgnore the stink of bu...,"['step', 'town', 'ignore', 'stink', 'burn', 'l..."
2,rolling_blackouts_coastal_fever,"Stand on the morning, head like a weather vane...","['stand', 'morning', 'head', 'like', 'weather'..."
3,bastille_,"Yeah, would I lie to you baby?\nYeah, would I?...","['yeah', 'lie', 'baby', 'yeah', 'yeah', 'lie',..."
4,rolling_blackouts_coastal_fever,I see a problem\nI'm not willing to address\nA...,"['problem', 'willing', 'address', 'pleasure', ..."
5,bastille_,"We're nocturnal, we're nocturnal\nWe're noctur...","['nocturnal', 'nocturnal', 'nocturnal', 'noctu..."
6,bastille_,"Roll the window down, let the air blow 'round ...","['roll', 'window', 'let', 'air', 'blow', 'roun..."
7,bastille_,"Don't talk to strangers.\n\nOh, in the strange...","['talk', 'stranger', 'oh', 'strange', 'dream',..."
8,bastille_,"""What the f*ck are y'all doing is the question...","['question', 'question', 'animal', 'break', 'c..."
9,bastille_,When I watch the world burn\nAll I think about...,"['watch', 'world', 'burn', 'think', 'watch', '..."


### Feature Engineering ("word engineering")
* download and save your corpus
* create labels for the corpus
* transform your lyrics into BOW document vectors
* delete some words - cf curse of dimensionality
* How can we decide what words to delete?!
    * ML solution - coefficients of the words, correlation matrix
    * all feature selection techniques should apply
    * domain expertise solution - knowing a bit about language
    * remove stop words *(e.g. 'i am very similar to something which is also similar to me and that thing is batman' - important words here: similar, something, batman; stop words such as i, am, to, is are removed)*
    * standardize plural / singular differentiation - stemming
    * more tomorrow

### Preprocessing with Spacy

In [35]:
import spacy

In [36]:
# Load the spaCy pipeline once; it is handed to clean_text when the lyrics are processed.
nlp = spacy.load('en_core_web_md')

In [37]:
# use the Spacy library to apply tokenization, 
# stemming or lemmatization when building your Bag Of Words feature matrix


# function from the lecture
def clean_text(corpus, model):
    """Reduce one text to its lowercased, lemmatized content words.

    The text is run through ``model``; tokens flagged as stop words and
    tokens that are not purely alphabetic are discarded. Only lemmas are
    used (no separate stemming step).

    params: corpus - a string (e.g. the lyrics of one song)
            model - a spacy model, called as ``model(corpus)``

    returns: one string holding the kept lemmas, separated by ", "
    """
    kept_lemmas = [
        token.lemma_.lower()
        for token in model(corpus)
        if token.is_alpha and not token.is_stop
    ]
    # Re-assemble the lemmas into a single string per document.
    return ", ".join(kept_lemmas)

In [38]:
# ### apply with a for-loop:
# clean_lyrics = []
# for corpus in df['lyrics'].tolist():
#     c = clean_text(corpus, nlp)
#     clean_lyrics.append(c)

# df['clean_lyrics'] = clean_lyrics

In [39]:
# Clean the lyrics of every song with spaCy and keep the result in a new column.
df['song_spacy'] = df['lyrics'].apply(lambda lyrics: clean_text(lyrics, nlp))

In [40]:
# Quick dimension check: (rows, columns).
df.shape

(111, 4)

In [41]:
# Inspect the first two rows, including the newly added song_spacy column.
df.head(2)

Unnamed: 0,artist,lyrics,song_strings_lists,song_spacy
0,bastille_,"Ooh, I'm not ready\nOoh, I'm not ready\nOoh, I...","['ooh', 'ready', 'ooh', 'ready', 'ooh', 'ready...","ooh, ready, ooh, ready, ooh, ready, ooh, ready..."
1,rolling_blackouts_coastal_fever,Stepping into the town\nIgnore the stink of bu...,"['step', 'town', 'ignore', 'stink', 'burn', 'l...","step, town, ignore, stink, burn, leather, youn..."


### Train-test splitting

In [42]:
# Features: the pre-tokenized lyrics column from the CSV; target: the artist name.
X, y = df['song_strings_lists'], df['artist']

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the songs for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)  # stratify=y

# Show the shape of each of the four parts.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((88,), (23,), (88,), (23,))

In [44]:
X_train # the vectorizer expects an iterable with one string per document

59     ['awaken', 'satin', 'sheet', 'velvet', 'mornin...
33     ['miss', 'thought', 'far', 'away', 'whistle', ...
3      ['yeah', 'lie', 'baby', 'yeah', 'yeah', 'lie',...
106    ['miss', 'thought', 'far', 'away', 'whistle', ...
86     ['big', 'decision', 'hope', 'card', 'fall', 'k...
                             ...                        
107    ['face', 'hear', 'voice', 'know', 'good', 'fun...
67     ['career', 'want', 'get', 'career', 'sell', 'p...
64     ['love', 'leave', 'rhythm', 'evening', 'chase'...
47     ['get', 'thankful', 'get', 'thankful', 'heat',...
44     ['lose', 'wanna', 'start', 'new', 'break', 'he...
Name: song_strings_lists, Length: 88, dtype: object

### transform lyrics into BOW document vectors
### Fit a count vectorizer and apply the tf-idf transformer using the TfidfVectorizer
This does both steps (CountVectorizer and TfidfTransformer) in one.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
# NOTE: despite the name `cv`, this is a TfidfVectorizer, not a CountVectorizer.
cv = TfidfVectorizer(stop_words='english')


In [47]:
# Learn the vocabulary and idf weights from the training songs only, then
# reuse that fitted vectorizer for the test songs. Fitting once is enough:
# a second fit on X_train would only repeat the same work.
new_corpus_x_train = cv.fit_transform(X_train)
new_corpus_x_test = cv.transform(X_test)

In [48]:
# Mapping of each learned term to its column index in the tf-idf matrix.
cv.vocabulary_

{'awaken': 62,
 'satin': 997,
 'sheet': 1046,
 'velvet': 1270,
 'morning': 758,
 'gaze': 505,
 'stinking': 1141,
 'street': 1153,
 'black': 107,
 'glass': 509,
 'reflect': 935,
 'lose': 701,
 'team': 1193,
 'place': 857,
 'fat': 427,
 'land': 648,
 'right': 967,
 'face': 413,
 'sight': 1065,
 'tongue': 1227,
 'like': 672,
 'shine': 1049,
 'sword': 1181,
 'suit': 1161,
 'armour': 48,
 'powder': 881,
 'blue': 115,
 'run': 987,
 'wild': 1317,
 'boy': 127,
 'good': 517,
 'different': 330,
 'long': 695,
 'lunch': 709,
 'slowly': 1093,
 'massage': 722,
 'neck': 770,
 'cloud': 223,
 'descend': 315,
 'watch': 1295,
 'news': 777,
 'let': 660,
 'feeling': 438,
 'die': 328,
 'phone': 845,
 'ring': 968,
 'ticket': 1214,
 'look': 697,
 'past': 831,
 'stone': 1144,
 'gate': 504,
 'corridor': 253,
 'wind': 1321,
 'carpet': 170,
 'drape': 359,
 'candlelight': 159,
 'oil': 794,
 'portrait': 877,
 'hang': 548,
 'enter': 394,
 'mahogany': 713,
 'room': 981,
 'rope': 982,
 'lie': 666,
 'golden': 516,
 'bo

In [49]:
# Sparse tf-idf matrix for the training songs (one row per song, one column per term).
new_corpus_x_train

<88x1358 sparse matrix of type '<class 'numpy.float64'>'
	with 3768 stored elements in Compressed Sparse Row format>

In [50]:
# Sparse tf-idf matrix for the test songs; same columns as the training matrix.
new_corpus_x_test

<23x1358 sparse matrix of type '<class 'numpy.float64'>'
	with 852 stored elements in Compressed Sparse Row format>

In [51]:
# scikit-learn >= 1.0 offers get_feature_names_out(); the older
# get_feature_names() was removed in 1.2, so prefer the new one when present.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# toarray() gives a plain ndarray instead of the numpy.matrix returned by todense().
df_cv_x_train = pd.DataFrame(new_corpus_x_train.toarray(), columns=feature_names)
df_cv_x_train.head()

Unnamed: 0,accident,ache,achilles,act,action,add,addicted,address,admit,adore,...,yard,yeah,year,yearn,yellow,yes,yonder,young,youth,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.364463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.072798,0.024843,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# scikit-learn >= 1.0 offers get_feature_names_out(); the older
# get_feature_names() was removed in 1.2, so prefer the new one when present.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# toarray() gives a plain ndarray instead of the numpy.matrix returned by todense().
df_cv_x_test = pd.DataFrame(new_corpus_x_test.toarray(), columns=feature_names)
df_cv_x_test.head()

Unnamed: 0,accident,ache,achilles,act,action,add,addicted,address,admit,adore,...,yard,yeah,year,yearn,yellow,yes,yonder,young,youth,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.2114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Drop the original row labels so the targets are indexed 0..n-1 and line up
# with the feature frames when they are joined.
y_train, y_test = (labels.reset_index(drop=True) for labels in (y_train, y_test))


In [54]:
# Attach the targets as an extra column on each feature frame.
df_cv_x_train = df_cv_x_train.assign(y_train=y_train)
df_cv_x_test = df_cv_x_test.assign(y_test=y_test)

In [55]:
# Check that the label column was appended to the test frame.
df_cv_x_test.head()

Unnamed: 0,accident,ache,achilles,act,action,add,addicted,address,admit,adore,...,yeah,year,yearn,yellow,yes,yonder,young,youth,zone,y_test
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rolling_blackouts_coastal_fever
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rolling_blackouts_coastal_fever
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bastille_
3,0.0,0.2114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rolling_blackouts_coastal_fever
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rolling_blackouts_coastal_fever


In [56]:
# Persist both frames. With index_label=False the row index is still written,
# only its header field is left out.
for frame, csv_path in (
    (df_cv_x_train, 'data/df/df_train_count_vectorizer.csv'),
    (df_cv_x_test, 'data/df/df_test_count_vectorizer.csv'),
):
    frame.to_csv(csv_path, index_label=False)

### The rest - standard ML pipeline
* train test splitting
* choose a model - RFC, LR, NB, etc.
* train and measure it
* cross validating