## Unpickle my DFs

In [1]:
import pandas as pd
import re
import pickle
import string

# Display data tools
from IPython.display import display
# Raise pandas' display limits so long and wide frames are shown with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from nltk.corpus import stopwords

In [2]:
import nltk
# Fetch the 'punkt' package into the local nltk_data directory
# (NLTK reports it as already up to date when it is present).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jcnachman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import nltk
# Fetch the NLTK 'stopwords' corpus; its English word list is read in the Stopwords section.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jcnachman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Pickle in preprocessed test reviews:

In [4]:
# Load the preprocessed test reviews from a pickle file stored next to the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load pickles you created or trust.
Test_df = pd.read_pickle("./Test_df.pkl")
# Display the frame (columns: Author, Artist, Album_Name, Genre, Score, Contents).
Test_df

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
0,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...
1,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...
2,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...
3,Taylor M. Clark,Common,Like Water for Chocolate,Rap,8.7,man about five years ago i was so damn white ...
4,Philip Sherburne,Aphex Twin,Selected Ambient Works Volume II,Electronic,10.0,an album that changed ambient music forever...
5,Philip Sherburne,The KLF,Chill Out,Electronic,8.9,the klfs sample heavy dreamscape one of th...
6,Sheldon Pearce,Dr. Dre,The Chronic,Rap,10.0,the timeless debut from dr dre a histor...
7,Stuart Berman,Fontaines D.C.,Dogrel,Rock,8.0,the steely dublin post punk band infuse the bi...
8,Eric Harvey,Peter Gabriel,So,Rock,9.1,peter gabriels art pop masterpiece a tur...


### Pickle in preprocessed modeled df

In [5]:
# Load the preprocessed reviews used for topic modeling from a pickle file next to the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load pickles you created or trust.
modeled_df = pd.read_pickle("./result.pkl")
# Display the frame (columns: Author, Artist, Album_Name, Genre, Score, Contents).
modeled_df

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
0,Sheldon Pearce,BTS,Map of the Soul: 7,Pop/R&B,6.3,the kpop groups latest is part memoir part fa...
1,Ruth Saxelby,Various Artists,Kulør 006,Electronic,7.5,the danish dj courtesys label pivots from cope...
2,Cat Zhang,Sunny Jain,Wild Wild East,Experimental,6.8,the composer and jazz multi instrumentalist ex...
3,Andy Beta,AceMoMA,A New Dawn,Electronic,7.6,rising new_york fixtures acemo and moma ready ...
4,Paul A. Thompson,Boldy James,The Price of Tea in China,Rap,8.0,on their latest collaboration the detroit rap...
...,...,...,...,...,...,...
1174,Larry Fitzmaurice,Diplo,Europa EP,Electronic,4.2,despite trending toward pop in recent years t...
1175,Eric Torres,IAMDDB,Swervvvvv.5,Rap,7.3,on her supposedly final mixtape before she iss...
1176,Stephen Kearse,Offset,Father of 4,Rap,6.9,though its undermined by its inertia and lack ...
1177,Olivia Horn,Julia Jacklin,Crushing,Rock,7.7,on an album that mines failed relationships fo...


### Stopwords

In [6]:
# List the stock English stopwords that ship with NLTK.
# Go through nltk.corpus explicitly: a later cell rebinds the bare name `stopwords`
# to a plain list, so `stopwords.words(...)` would raise AttributeError if this
# cell were re-run after it.
set(nltk.corpus.stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
# Extend NLTK's English stopword list with extra words chosen for this review corpus.
# NOTE: this rebinds `stopwords` (until now the object imported from nltk.corpus)
# to a plain list of strings.
newStopWords = ['like', 'didnt', 'wasnt', 'music', 'theyve']
stopwords = nltk.corpus.stopwords.words('english') + newStopWords

## Topic Modeling with gensim
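- From the [Topic Modeling](http://localhost:8888/notebooks/curriculum/project-04/topic-modeling/JoannaN_LDA_Exercise.ipynb) exercise notebook (note: the link points at a local Jupyter server, so it only resolves on the machine where that server is running)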

In [9]:
# gensim
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO)
# Configures the root logger to emit "<timestamp> : <level> : <message>" lines at INFO and above.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
# Bag-of-words vectorizer over unigrams and bigrams.
# Tokens are runs of two or more lowercase ASCII letters; the extended stopword list is dropped.
count_vectorizer = CountVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b",
)

# Learn the vocabulary from the modeled review texts.
count_vectorizer.fit(modeled_df["Contents"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
                tokenizer=None, vocabulary=None)

In [11]:
# Count terms per review, then flip the sparse matrix so that
# rows are terms and columns are documents.
doc_word = count_vectorizer.transform(modeled_df["Contents"]).T

In [12]:
# Preview the per-document counts of the first five vocabulary terms.
# Slice the sparse matrix *before* densifying: calling .toarray() on the full
# terms x documents matrix allocates every cell just to display five rows.
pd.DataFrame(
    doc_word[:5].toarray(),
    index=count_vectorizer.get_feature_names()[:5],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,...,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1118,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1
162,1163,1164,1165,1166,1167,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177,1178
aaa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
aaa powerline,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
aaa secret,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
aacm,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
aacm owner,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# (number of vocabulary terms, number of documents) — terms are rows after the transpose.
doc_word.shape

(478842, 1179)

### Convert to gensim

In [14]:
# Wrap the sparse count matrix as a gensim corpus; each column is treated as one document.
corpus = matutils.Sparse2Corpus(doc_word, documents_columns=True)

##### Map matrix rows to words (tokens)


In [15]:
# Save a mapping (dict) of row id to word (token) for later use by gensim

In [16]:
# Invert the vectorizer's term -> id vocabulary into an id -> term lookup for gensim.
id2word = {term_id: term for term, term_id in count_vectorizer.vocabulary_.items()}

In [17]:
# Number of id -> term entries in the mapping.
len(id2word)

478842

## Non-Negative Matrix Factorization (NMF)

● NMF models are interpretable (unlike PCA)

● Easy to interpret means easy to explain! 

[NMF AWS](https://s3.amazonaws.com/assets.datacamp.com/production/course_3161/slides/ch4_slides.pdf)

In [18]:
from sklearn.decomposition import NMF
import numpy as np
import random

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

import pprint

In [19]:
# Short row labels: the first 30 characters of each review followed by an ellipsis.
ex_label = [text[:30] + "..." for text in modeled_df.Contents]
# Show only a sample; echoing the whole list prints one line per review.
ex_label[:10]

['the kpop groups latest is part...',
 'the danish dj courtesys label ...',
 'the composer and jazz multi in...',
 'rising new_york fixtures acemo...',
 'on their latest collaboration ...',
 'the latest in springsteens vin...',
 'on his solo album  the london ...',
 'marco del rios song are full o...',
 '   type o negatives third albu...',
 'the band new lp is a rosily op...',
 'the hardcore quartets   minute...',
 'on their second album  the met...',
 'grimes first project as a bona...',
 'the uk singer songwriters   al...',
 'the afghan whigs leaders debut...',
 'assisted by the falls late mar...',
 'the lanky london outlaw with c...',
 'the duo continue their drift f...',
 'two archival releases highligh...',
 'maya bons diaristic bedroom ro...',
 'with its mix of rustic lullabi...',
 'after a mixtape staking his cl...',
 'using a fanciful palette of ch...',
 'the veteran djs first solo ep ...',
 'the pop star returns with a gr...',
 'the new_york city based concep...',
 'the instru

### Run NMF with parameters from vectorizer_list:
`[7, 0.19, 9638]` — i.e. `min_df=7`, `max_df=0.19`, which yields a vocabulary of 9638 terms


In [22]:
# Binary (presence/absence) counts, keeping terms that occur in at least 7
# documents and in at most 19% of them.
vectorizer = CountVectorizer(
    stop_words=stopwords,
    binary=True,
    min_df=7,
    max_df=.19,
)
# NOTE: this rebinds `doc_word`; from here on rows are documents and columns are terms.
doc_word = vectorizer.fit_transform(modeled_df["Contents"])

In [23]:
# Echo the fitted vectorizer's configuration.
vectorizer

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.19, max_features=None, min_df=7,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [24]:
# (number of documents, number of vocabulary terms).
# Read the shape straight from the sparse matrix instead of densifying it
# into a DataFrame only to ask for its dimensions.
doc_word.shape

(1179, 9638)

In [25]:
# Document frequency per term: the vectorizer is binary, so each column sum is
# the number of documents containing that term.
# Summing the sparse matrix avoids building a dense DataFrame, and the sort is
# dropped because describe() does not depend on row order.
pd.Series(
    np.asarray(doc_word.sum(axis=0)).ravel(),
    index=vectorizer.get_feature_names(),
).describe()

count    9638.00000
mean       27.52791
std        32.57091
min         7.00000
25%         9.00000
50%        15.00000
75%        30.00000
max       224.00000
dtype: float64

Customized run: `vectorizer = CountVectorizer(stop_words=stopwords, binary=True, min_df=7, max_df=.19)`:
    
There are 9638 words.

**Average** word shows up in *~28 docs*.

**Median (50%)** word shows up in *15 docs*.

### 8 topics/components

In [26]:
# Factorize the document-term matrix into 8 topics.
# A fixed random_state keeps the factorization reproducible across kernel restarts.
nmf_model = NMF(n_components=8, random_state=42)
# doc_topic: one row per document, one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)

In [27]:
# Topic-by-term weights from the fitted NMF model, rounded for display.
# Row labels are generated from the fitted model so they always match the
# number of components instead of being hard-coded for exactly eight topics.
topic_word = pd.DataFrame(
    nmf_model.components_.round(3),
    index=[
        "component_{}".format(topic_number)
        for topic_number in range(1, nmf_model.components_.shape[0] + 1)
    ],
    columns=vectorizer.get_feature_names(),
)
topic_word

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandons,abiding,abilities,ability,able,abound,abrasive,abrupt,abruptly,absence,absent,absolute,absolutely,absorb,absorbed,absorbing,abstract,abstraction,absurd,absurdist,absurdity,abundance,abuse,abused,abusive,abyss,ac,academic,academy,accelerating,accent,accents,accentuates,accept,acceptance,accepted,accepting,access,accessibility,accessible,accident,accidental,accidentally,acclaim,acclaimed,accommodate,accompanied,accompanies,accompaniment,accompany,accompanying,accomplished,accordance,according,accordingly,accordion,account,accounts,accumulated,accurate,accurately,accused,ache,achieve,achieved,achievement,achieves,achieving,aching,acid,acknowledge,acknowledged,acknowledges,acknowledging,acknowledgment,acoustic,acquired,act,acting,action,actions,active,actively,activism,activist,activity,actor,actors,actress,acts,actual,actually,acute,ad,adam,adams,adapted,add,added,addiction,adding,addition,additional,addres,addresed,addreses,addresing,adds,adept,adjacent,administration,admirable,admirably,admiration,admire,admiring,admission,admit,admits,admitted,admitting,adolescence,adolescent,adopted,adopting,adopts,adoration,adored,adrenaline,adrian,adrift,adult,adulthood,adults,advance,advanced,advantage,adventure,adventures,adventurous,adventurousness,advertising,advice,advocate,aesthetic,aesthetically,aesthetics,affair,affairs,affect,affecting,affection,affinity,affirmation,affirmations,affirming,afford,afloat,aforementioned,afraid,africa,african,africas,afro,afrobeat,afropop,afterlife,aftermath,afternoon,age,aged,agency,agenda,agent,ages,aggression,aggressive,aggressively,aggro,agile,aging,agnostic,ago,agony,agree,agreed,ah,ahead,ai,aid,aided,aids,aim,aimed,aiming,aimless,aims,aint,air,airport,airy,aka,akin,al,alabama,alanis,alarm,albeit,albini,albumthe,alchemist,alchemy,alex,alexandra,algorithm,algorithms,ali,alias,aliases,alice,alien,alienated,alienation,aliens,aligns,alike,alive,allegedly,allegiance,allegory,allen,allos_
angelesbout,allos_angeleslong,allos_angelest,allow,allowed,allowing,allows,alludes,allure,alluring,allusions,almighty,alone,along,alongside,...,weekend,weekly,weeknd,weeks,weeping,weepy,weighed,weight,weightless,weighty,weird,weirder,weirdly,weirdness,weirdo,weirdos,welcome,welcoming,wellos_angeless,went,werent,west,western,wests,wet,weve,whatever,whats,wheel,wheels,wheezy,whenever,whereas,wherein,wherever,whether,whiff,whimsical,whimsy,whip,whiplash,whipped,whipping,whips,whirlwind,whiskey,whisper,whispered,whispering,whispers,whispery,whistle,whistles,white,whites,whitney,whod,whole,wholesome,wholly,whos,whove,wide,widely,wider,widescreen,widespread,wielding,wields,wife,wild,wildly,willfully,william,williams,willie,willing,willingness,willos_angeleslways,wilson,wilsons,win,wind,winding,window,windows,winds,wine,wing,wings,wink,winking,winner,winning,wins,winsome,winter,wire,wires,wiry,wisdom,wise,wisely,wiser,wish,wishes,wishing,wispy,wistful,wit,witch,within,witness,witty,wizard,wobble,wobbly,woke,wolf,woman,womans,women,womens,wonder,wondered,wonderful,wonderfully,wondering,wonders,wondrous,wont,woo,wood,wooden,woods,woodstock,woodwind,woodwinds,woozy,wop,word,wordless,wordplay,wore,worked,worker,workers,working,workout,workouts,works,worlds,worldview,worldwide,worn,worried,worries,worry,worse,worship,worshipping,worst,worth,worthwhile,worthy,wouldnt,wouldve,wound,wounded,wounds,woven,wow,wrap,wrapped,wrapping,wreckage,wrenching,wrestles,wrestling,wrings,wrist,write,writer,writers,writes,writing,written,wrld,wrong,wrote,wrought,wry,wu,xcx,xs,xx,xxxtentacion,ya,yacht,yall,yang,yard,yeah,yeahs,yearn,yearning,yearns,yeezus,yell,yelling,yellow,yells,yelp,yelps,yes,yesterday,yield,yielded,yielding,yields,yin,yip,yo,yoga,yore,youd,youknow,youll,younger,youngest,youngs,yous,youth,youthful,youths,youtube,youve,yung,zane,zealand,zeitgeist,zen,zeppelin,zero,zeroes,zine,zombie,zone,zoo,zs
component_1,0.026,0.124,0.098,0.02,0.0,0.028,0.008,0.071,0.128,0.009,0.0,0.0,0.031,0.059,0.032,0.021,0.125,0.0,0.024,0.001,0.0,0.026,0.032,0.016,0.071,0.002,0.065,0.007,0.057,0.004,0.015,0.0,0.058,0.0,0.006,0.0,0.0,0.0,0.027,0.063,0.034,0.022,0.0,0.03,0.143,0.011,0.083,0.0,0.004,0.007,0.091,0.0,0.048,0.036,0.061,0.002,0.008,0.298,0.021,0.023,0.0,0.039,0.033,0.004,0.0,0.047,0.0,0.0,0.084,0.129,0.0,0.025,0.052,0.088,0.065,0.0,0.035,0.029,0.002,0.212,0.026,0.327,0.0,0.056,0.0,0.028,0.0,0.023,0.009,0.001,0.069,0.025,0.027,0.062,0.084,0.215,0.0,0.047,0.025,0.039,0.009,0.037,0.032,0.133,0.062,0.043,0.093,0.059,0.056,0.0,0.0,0.025,0.0,0.0,0.01,0.0,0.0,0.041,0.003,0.0,0.048,0.03,0.022,0.019,0.006,0.0,0.023,0.046,0.0,0.0,0.008,0.041,0.011,0.004,0.004,0.094,0.0,0.0,0.001,0.019,0.0,0.063,0.0,0.0,0.0,0.012,0.014,0.005,0.0,0.028,0.0,0.013,0.098,0.048,0.0,0.03,0.018,0.047,0.002,0.048,0.003,0.028,0.0,0.042,0.004,0.0,0.0,0.0,0.0,0.0,0.041,0.0,0.055,0.387,0.05,0.076,0.017,0.008,0.049,0.0,0.0,0.0,0.0,0.0,0.072,0.0,0.102,0.029,0.096,0.146,0.052,0.108,0.0,0.001,0.002,0.012,0.047,0.048,0.0,0.0,0.078,0.021,0.0,0.0,0.031,0.0,0.0,0.123,0.002,0.0,0.0,0.029,0.009,0.009,0.0,0.0,0.0,0.0,0.0,0.0,0.009,0.0,0.0,0.106,0.032,0.0,0.016,0.065,0.023,0.03,0.15,0.069,0.0,0.0,0.013,0.031,0.01,0.029,0.061,0.112,0.067,0.018,0.0,0.055,0.0,0.0,0.0,0.297,0.403,0.134,...,0.063,0.0,0.0,0.313,0.018,0.0,0.0,0.08,0.01,0.01,0.083,0.0,0.032,0.0,0.035,0.038,0.064,0.0,0.215,0.491,0.233,0.148,0.125,0.0,0.046,0.0,0.245,0.181,0.074,0.01,0.0,0.029,0.021,0.042,0.017,0.123,0.048,0.024,0.0,0.0,0.0,0.0,0.0,0.0,0.038,0.014,0.012,0.075,0.0,0.022,0.0,0.034,0.0,0.369,0.026,0.0,0.271,0.184,0.0,0.054,0.008,0.0,0.122,0.062,0.05,0.0,0.053,0.004,0.0,0.282,0.103,0.05,0.037,0.023,0.097,0.098,0.004,0.0,0.046,0.039,0.0,0.021,0.033,0.024,0.061,0.045,0.001,0.038,0.012,0.012,0.0,0.033,0.0,0.046,0.007,0.037,0.125,0.014,0.008,0.0,0.116,0.067,0.0,0.0,0.026,0.0,0.0,0.014,0.0,0.0,0.008,0.083,0.003,0.015,0.006,0.0,0.0,0.018,0.009,0.14,0.012,0.027,
0.037,0.241,0.063,0.035,0.004,0.112,0.022,0.008,0.125,0.001,0.027,0.036,0.0,0.03,0.0,0.0,0.014,0.0,0.167,0.027,0.0,0.0,0.387,0.0,0.023,0.477,0.0,0.0,0.075,0.168,0.0,0.036,0.05,0.085,0.0,0.009,0.045,0.0,0.0,0.158,0.134,0.001,0.115,0.135,0.043,0.143,0.048,0.009,0.007,0.001,0.038,0.051,0.0,0.012,0.0,0.0,0.007,0.004,0.054,0.261,0.111,0.177,0.129,0.296,0.506,0.0,0.135,0.626,0.05,0.028,0.01,0.0,0.036,0.01,0.0,0.012,0.034,0.006,0.0,0.006,0.053,0.009,0.0,0.135,0.0,0.0,0.0,0.011,0.051,0.0,0.0,0.041,0.009,0.03,0.07,0.031,0.0,0.0,0.0,0.0,0.02,0.026,0.0,0.001,0.208,0.066,0.226,0.06,0.07,0.049,0.044,0.014,0.0,0.036,0.137,0.0,0.0,0.0,0.026,0.009,0.032,0.027,0.0,0.023,0.031,0.061,0.001,0.029
component_2,0.006,0.067,0.02,0.014,0.029,0.0,0.02,0.141,0.035,0.011,0.043,0.0,0.1,0.085,0.053,0.0,0.0,0.0,0.0,0.0,0.05,0.042,0.002,0.025,0.017,0.005,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.008,0.056,0.012,0.0,0.0,0.0,0.004,0.012,0.005,0.039,0.0,0.041,0.017,0.0,0.0,0.0,0.047,0.002,0.0,0.019,0.044,0.023,0.022,0.029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031,0.0,0.001,0.02,0.0,0.002,0.346,0.0,0.0,0.0,0.008,0.0,0.046,0.0,0.063,0.0,0.031,0.0,0.035,0.008,0.0,0.0,0.034,0.0,0.0,0.014,0.105,0.009,0.154,0.015,0.0,0.0,0.0,0.0,0.097,0.003,0.0,0.122,0.017,0.002,0.009,0.009,0.0,0.002,0.018,0.0,0.061,0.0,0.019,0.0,0.0,0.0,0.0,0.011,0.0,0.0,0.0,0.0,0.015,0.0,0.027,0.005,0.028,0.003,0.0,0.024,0.027,0.015,0.005,0.0,0.0,0.0,0.001,0.0,0.0,0.005,0.067,0.0,0.0,0.003,0.005,0.19,0.012,0.049,0.054,0.009,0.0,0.044,0.028,0.038,0.011,0.007,0.0,0.006,0.0,0.005,0.022,0.0,0.066,0.008,0.025,0.0,0.0,0.0,0.017,0.004,0.294,0.0,0.0,0.0,0.0,0.027,0.013,0.065,0.018,0.0,0.02,0.02,0.0,0.219,0.012,0.0,0.0,0.0,0.066,0.013,0.001,0.0,0.013,0.004,0.0,0.0,0.019,0.042,0.0,0.036,0.019,0.138,0.229,0.051,0.045,0.0,0.0,0.041,0.08,0.0,0.031,0.001,0.024,0.02,0.005,0.0,0.0,0.0,0.161,0.081,0.0,0.142,0.0,0.011,0.0,0.0,0.039,0.026,0.002,0.002,0.001,0.0,0.031,0.0,0.0,0.057,0.031,0.04,0.034,0.0,0.005,0.053,0.0,0.0,0.014,0.405,0.152,...,0.019,0.0,0.01,0.0,0.014,0.0,0.0,0.038,0.047,0.0,0.068,0.014,0.028,0.004,0.013,0.002,0.122,0.033,0.061,0.014,0.0,0.087,0.009,0.031,0.024,0.011,0.065,0.114,0.007,0.005,0.021,0.012,0.003,0.0,0.0,0.216,0.005,0.014,0.0,0.026,0.018,0.004,0.029,0.02,0.014,0.0,0.031,0.021,0.004,0.06,0.0,0.045,0.022,0.049,0.0,0.006,0.0,0.251,0.002,0.015,0.083,0.0,0.03,0.067,0.019,0.022,0.005,0.019,0.036,0.0,0.0,0.015,0.016,0.002,0.0,0.0,0.033,0.02,0.003,0.013,0.0,0.0,0.082,0.036,0.036,0.012,0.0,0.019,0.008,0.0,0.0,0.028,0.0,0.006,0.018,0.008,0.021,0.01,0.038,0.01,0.0,0.002,0.022,0.0,0.039,0.0,0.0,0.107,0.14,0.0,0.013,0.081,0.018,0.0,0.004,0.029,0.03,0.0,0.028,0.0,0.006,0.0,0.0,0.024,0.0,0.039,0.052,0.0,0.007,0.005,0.032,0
.014,0.016,0.012,0.0,0.0,0.0,0.027,0.139,0.009,0.111,0.096,0.0,0.0,0.12,0.032,0.0,0.236,0.028,0.02,0.13,0.034,0.0,0.037,0.033,0.0,0.001,0.02,0.0,0.007,0.017,0.04,0.008,0.0,0.046,0.057,0.0,0.096,0.007,0.0,0.015,0.0,0.063,0.032,0.049,0.01,0.0,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087,0.0,0.0,0.0,0.0,0.017,0.0,0.036,0.0,0.0,0.007,0.0,0.001,0.0,0.011,0.001,0.01,0.017,0.0,0.029,0.0,0.0,0.006,0.0,0.0,0.013,0.0,0.0,0.0,0.018,0.053,0.005,0.001,0.0,0.0,0.01,0.023,0.028,0.017,0.025,0.0,0.0,0.0,0.006,0.076,0.013,0.0,0.056,0.019,0.002,0.013,0.0,0.0,0.017,0.0,0.036,0.024,0.0,0.01,0.084,0.0,0.0
component_3,0.0,0.015,0.0,0.003,0.0,0.0,0.015,0.186,0.105,0.011,0.0,0.003,0.011,0.017,0.019,0.026,0.001,0.0,0.004,0.002,0.044,0.0,0.024,0.045,0.007,0.001,0.044,0.006,0.028,0.001,0.0,0.004,0.011,0.0,0.051,0.0,0.0,0.0,0.015,0.0,0.002,0.05,0.001,0.034,0.0,0.0,0.002,0.0,0.014,0.0,0.026,0.001,0.0,0.026,0.018,0.079,0.0,0.0,0.006,0.0,0.022,0.027,0.005,0.001,0.0,0.042,0.0,0.044,0.023,0.009,0.013,0.006,0.0,0.0,0.0,0.004,0.0,0.005,0.007,0.015,0.0,0.094,0.0,0.071,0.009,0.0,0.031,0.032,0.014,0.028,0.019,0.0,0.0,0.042,0.098,0.187,0.001,0.192,0.0,0.0,0.013,0.084,0.0,0.034,0.023,0.063,0.0,0.016,0.017,0.037,0.018,0.055,0.018,0.054,0.007,0.0,0.014,0.016,0.014,0.011,0.0,0.0,0.0,0.0,0.0,0.019,0.0,0.02,0.009,0.01,0.003,0.0,0.0,0.002,0.0,0.018,0.007,0.0,0.0,0.032,0.06,0.0,0.005,0.0,0.013,0.004,0.014,0.007,0.186,0.0,0.017,0.051,0.014,0.0,0.009,0.012,0.021,0.0,0.006,0.0,0.052,0.001,0.025,0.037,0.07,0.039,0.026,0.045,0.035,0.042,0.016,0.025,0.0,0.193,0.0,0.0,0.022,0.011,0.035,0.015,0.024,0.01,0.003,0.024,0.0,0.0,0.191,0.01,0.0,0.005,0.0,0.072,0.007,0.018,0.006,0.011,0.05,0.032,0.017,0.034,0.038,0.379,0.0,0.0,0.022,0.014,0.011,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.072,0.0,0.011,0.0,0.014,0.0,0.023,0.0,0.0,0.006,0.0,0.006,0.0,0.0,0.014,0.0,0.017,0.064,0.021,0.024,0.02,0.029,0.0,0.055,0.011,0.036,0.041,0.051,0.0,0.0,0.0,0.0,0.042,0.081,0.127,0.218,...,0.004,0.0,0.034,0.068,0.003,0.003,0.012,0.056,0.017,0.001,0.078,0.068,0.0,0.002,0.028,0.003,0.148,0.003,0.156,0.2,0.035,0.271,0.013,0.046,0.016,0.017,0.042,0.178,0.006,0.002,0.041,0.02,0.032,0.011,0.001,0.161,0.015,0.013,0.0,0.019,0.033,0.0,0.001,0.032,0.021,0.006,0.018,0.0,0.008,0.0,0.02,0.005,0.0,0.2,0.001,0.017,0.0,0.159,0.0,0.037,0.07,0.0,0.038,0.032,0.038,0.0,0.013,0.009,0.005,0.024,0.066,0.0,0.004,0.0,0.051,0.039,0.0,0.007,0.038,0.0,0.0,0.071,0.067,0.003,0.0,0.013,0.026,0.029,0.0,0.032,0.0,0.0,0.011,0.058,0.014,0.0,0.03,0.0,0.0,0.0,0.019,0.034,0.006,0.007,0.098,0.011,0.0,0.0,0.0,0.109,0.0,0.226,0.032,0.0,0.022,0.0,0.01,0.063,0.0,0.0,0.0,0.097,0.0
,0.074,0.0,0.016,0.0,0.049,0.015,0.0,0.072,0.046,0.003,0.0,0.007,0.0,0.0,0.0,0.092,0.012,0.068,0.0,0.088,0.0,0.092,0.001,0.0,0.304,0.012,0.008,0.309,0.039,0.0,0.005,0.014,0.0,0.01,0.001,0.153,0.029,0.017,0.107,0.019,0.025,0.038,0.151,0.015,0.0,0.0,0.0,0.0,0.025,0.0,0.008,0.0,0.0,0.0,0.003,0.07,0.0,0.04,0.015,0.139,0.0,0.017,0.068,0.006,0.053,0.027,0.0,0.017,0.0,0.091,0.031,0.029,0.002,0.064,0.067,0.0,0.07,0.012,0.011,0.081,0.001,0.0,0.0,0.002,0.024,0.02,0.041,0.037,0.016,0.022,0.0,0.102,0.031,0.005,0.0,0.0,0.023,0.012,0.0,0.078,0.0,0.0,0.046,0.174,0.078,0.087,0.045,0.0,0.0,0.047,0.049,0.0,0.161,0.012,0.085,0.038,0.007,0.057,0.005,0.0,0.007,0.0,0.0,0.021,0.075,0.0,0.09
component_4,0.015,0.001,0.036,0.013,0.0,0.025,0.008,0.105,0.023,0.019,0.008,0.011,0.018,0.087,0.031,0.01,0.0,0.0,0.0,0.0,0.092,0.041,0.024,0.0,0.028,0.03,0.023,0.0,0.021,0.006,0.0,0.0,0.0,0.0,0.011,0.0,0.004,0.047,0.09,0.018,0.018,0.0,0.001,0.009,0.029,0.012,0.004,0.0,0.0,0.004,0.083,0.069,0.024,0.023,0.02,0.0,0.009,0.0,0.048,0.007,0.057,0.025,0.0,0.002,0.013,0.01,0.055,0.0,0.0,0.0,0.033,0.0,0.051,0.0,0.025,0.0,0.04,0.016,0.014,0.492,0.0,0.067,0.03,0.042,0.0,0.004,0.0,0.014,0.0,0.0,0.01,0.011,0.036,0.0,0.086,0.114,0.017,0.0,0.026,0.0,0.0,0.02,0.0,0.006,0.04,0.0,0.0,0.058,0.025,0.068,0.097,0.152,0.058,0.023,0.0,0.013,0.025,0.005,0.01,0.013,0.024,0.001,0.071,0.012,0.043,0.03,0.049,0.023,0.0,0.021,0.021,0.0,0.041,0.0,0.014,0.097,0.055,0.008,0.0,0.0,0.001,0.029,0.017,0.036,0.023,0.0,0.066,0.0,0.023,0.0,0.054,0.002,0.012,0.024,0.157,0.056,0.016,0.047,0.011,0.029,0.021,0.0,0.025,0.054,0.0,0.0,0.0,0.0,0.0,0.0,0.029,0.049,0.032,0.212,0.001,0.0,0.003,0.0,0.012,0.022,0.0,0.0,0.0,0.006,0.029,0.0,0.113,0.037,0.0,0.0,0.006,0.063,0.009,0.02,0.014,0.003,0.019,0.0,0.0,0.034,0.054,0.105,0.167,0.012,0.03,0.028,0.004,0.0,0.04,0.0,0.011,0.013,0.0,0.005,0.0,0.018,0.068,0.03,0.0,0.0,0.0,0.0,0.0,0.002,0.015,0.013,0.03,0.0,0.017,0.0,0.197,0.0,0.0,0.004,0.0,0.021,0.0,0.0,0.0,0.0,0.027,0.128,0.031,0.042,0.053,0.0,0.0,0.502,0.289,0.063,...,0.0,0.017,0.001,0.0,0.011,0.013,0.028,0.166,0.036,0.066,0.061,0.0,0.0,0.002,0.0,0.0,0.063,0.02,0.0,0.005,0.0,0.0,0.0,0.0,0.047,0.102,0.029,0.195,0.028,0.025,0.003,0.0,0.019,0.002,0.019,0.146,0.0,0.056,0.02,0.0,0.021,0.025,0.0,0.001,0.0,0.004,0.115,0.006,0.022,0.0,0.024,0.002,0.0,0.053,0.015,0.026,0.0,0.232,0.0,0.0,0.17,0.026,0.05,0.0,0.021,0.017,0.011,0.0,0.036,0.046,0.078,0.008,0.012,0.021,0.009,0.0,0.046,0.016,0.022,0.04,0.0,0.0,0.076,0.032,0.14,0.04,0.065,0.024,0.006,0.028,0.058,0.035,0.0,0.0,0.0,0.05,0.027,0.039,0.006,0.005,0.14,0.06,0.0,0.039,0.171,0.019,0.012,0.019,0.098,0.095,0.017,0.151,0.05,0.014,0.0,0.019,0.026,0.02,0.044,0.295,0.025,0.135,0.038,0
.199,0.009,0.0,0.0,0.064,0.138,0.03,0.252,0.0,0.0,0.004,0.075,0.0,0.009,0.004,0.0,0.001,0.401,0.0,0.075,0.0,0.042,0.006,0.021,0.07,0.003,0.0,0.118,0.021,0.0,0.0,0.031,0.0,0.076,0.046,0.033,0.012,0.001,0.059,0.105,0.022,0.026,0.03,0.0,0.005,0.035,0.043,0.008,0.016,0.007,0.047,0.0,0.044,0.064,0.062,0.006,0.032,0.0,0.139,0.06,0.064,0.224,0.722,0.37,0.001,0.138,0.303,0.006,0.075,0.0,0.025,0.0,0.007,0.0,0.038,0.014,0.0,0.004,0.025,0.046,0.014,0.004,0.098,0.035,0.0,0.002,0.016,0.0,0.0,0.0,0.026,0.0,0.022,0.002,0.0,0.0,0.004,0.004,0.027,0.011,0.0,0.004,0.132,0.1,0.053,0.034,0.001,0.0,0.0,0.064,0.01,0.0,0.015,0.238,0.001,0.009,0.01,0.01,0.029,0.0,0.0,0.02,0.0,0.006,0.006,0.008,0.0
component_5,0.0,0.005,0.035,0.0,0.0,0.02,0.0,0.161,0.037,0.009,0.0,0.0,0.029,0.031,0.0,0.031,0.0,0.012,0.049,0.041,0.103,0.03,0.0,0.005,0.0,0.021,0.0,0.002,0.0,0.04,0.0,0.033,0.013,0.017,0.0,0.017,0.031,0.02,0.0,0.006,0.0,0.038,0.01,0.049,0.0,0.022,0.0,0.0,0.065,0.007,0.055,0.001,0.057,0.043,0.082,0.053,0.014,0.008,0.0,0.025,0.01,0.007,0.028,0.014,0.001,0.0,0.008,0.077,0.002,0.0,0.045,0.0,0.006,0.0,0.0,0.0,0.0,0.0,0.02,0.2,0.0,0.106,0.0,0.03,0.0,0.042,0.04,0.011,0.026,0.026,0.022,0.033,0.0,0.127,0.047,0.0,0.004,0.043,0.016,0.008,0.046,0.107,0.045,0.0,0.136,0.098,0.097,0.057,0.0,0.0,0.0,0.103,0.039,0.018,0.014,0.015,0.0,0.003,0.008,0.015,0.0,0.0,0.0,0.005,0.0,0.0,0.0,0.027,0.012,0.003,0.003,0.0,0.0,0.007,0.02,0.02,0.0,0.001,0.009,0.021,0.016,0.008,0.0,0.056,0.005,0.0,0.0,0.0,0.05,0.0,0.006,0.022,0.0,0.0,0.011,0.0,0.0,0.0,0.012,0.0,0.0,0.041,0.0,0.0,0.087,0.231,0.02,0.041,0.034,0.008,0.009,0.0,0.008,0.096,0.0,0.0,0.003,0.0,0.009,0.0,0.0,0.022,0.0,0.006,0.009,0.004,0.114,0.0,0.0,0.0,0.0,0.0,0.023,0.0,0.032,0.001,0.008,0.016,0.007,0.0,0.007,0.0,0.284,0.034,0.086,0.0,0.108,0.017,0.0,0.0,0.029,0.027,0.015,0.0,0.003,0.023,0.014,0.001,0.057,0.017,0.027,0.012,0.0,0.118,0.088,0.009,0.006,0.0,0.0,0.059,0.078,0.0,0.028,0.009,0.059,0.007,0.048,0.048,0.033,0.027,0.134,0.083,0.001,0.0,0.008,0.006,0.011,0.0,0.175,0.125,...,0.0,0.0,0.0,0.0,0.011,0.018,0.024,0.023,0.021,0.0,0.053,0.022,0.003,0.0,0.02,0.002,0.069,0.035,0.283,0.0,0.0,0.086,0.16,0.0,0.001,0.01,0.053,0.0,0.0,0.0,0.001,0.017,0.033,0.037,0.016,0.161,0.011,0.041,0.007,0.0,0.0,0.004,0.012,0.009,0.005,0.0,0.03,0.01,0.008,0.053,0.006,0.0,0.0,0.256,0.0,0.024,0.0,0.249,0.0,0.065,0.0,0.0,0.167,0.002,0.053,0.018,0.0,0.007,0.0,0.027,0.077,0.06,0.007,0.098,0.097,0.002,0.011,0.033,0.006,0.008,0.004,0.004,0.298,0.066,0.024,0.023,0.106,0.0,0.025,0.019,0.0,0.006,0.0,0.054,0.003,0.0,0.0,0.075,0.03,0.014,0.041,0.012,0.0,0.0,0.0,0.03,0.025,0.002,0.013,0.0,0.0,0.537,0.035,0.0,0.028,0.017,0.033,0.0,0.0,0.0,0.0,0.032,0.0,0.154,0.0,0.062,0.014
,0.0,0.008,0.011,0.0,0.007,0.025,0.015,0.049,0.015,0.038,0.029,0.048,0.0,0.156,0.139,0.0,0.0,0.09,0.009,0.0,0.175,0.001,0.007,0.447,0.174,0.0,0.0,0.057,0.0,0.0,0.0,0.006,0.057,0.0,0.0,0.164,0.022,0.048,0.007,0.0,0.01,0.0,0.0,0.045,0.0,0.022,0.007,0.029,0.003,0.0,0.014,0.0,0.0,0.0,0.0,0.046,0.0,0.044,0.0,0.058,0.0,0.0,0.014,0.008,0.0,0.0,0.0,0.0,0.005,0.0,0.0,0.011,0.0,0.023,0.004,0.0,0.0,0.0,0.081,0.0,0.0,0.0,0.0,0.042,0.0,0.0,0.01,0.089,0.002,0.047,0.009,0.008,0.016,0.023,0.0,0.019,0.039,0.019,0.036,0.0,0.059,0.0,0.0,0.006,0.0,0.0,0.028,0.006,0.055,0.044,0.0,0.001,0.013,0.0,0.019,0.0,0.05,0.004,0.0,0.0,0.07,0.025,0.0
component_6,0.004,0.0,0.021,0.049,0.045,0.009,0.026,0.135,0.078,0.021,0.002,0.0,0.0,0.0,0.0,0.044,0.061,0.075,0.03,0.012,0.0,0.0,0.021,0.0,0.0,0.0,0.038,0.065,0.009,0.004,0.0,0.004,0.009,0.0,0.137,0.0,0.0,0.052,0.029,0.04,0.0,0.102,0.023,0.0,0.0,0.0,0.0,0.152,0.01,0.058,0.0,0.021,0.0,0.0,0.102,0.0,0.005,0.05,0.005,0.007,0.0,0.0,0.002,0.054,0.0,0.003,0.0,0.013,0.059,0.013,0.0,0.019,0.0,0.0,0.012,0.0,0.0,0.015,0.006,0.0,0.063,0.131,0.133,0.031,0.0,0.078,0.001,0.035,0.066,0.015,0.0,0.011,0.023,0.101,0.059,0.051,0.011,0.037,0.0,0.043,0.0,0.052,0.163,0.0,0.021,0.022,0.0,0.029,0.022,0.065,0.005,0.0,0.007,0.011,0.035,0.0,0.0,0.0,0.0,0.04,0.0,0.088,0.0,0.0,0.004,0.01,0.026,0.0,0.0,0.083,0.013,0.062,0.028,0.0,0.0,0.068,0.063,0.018,0.0,0.0,0.015,0.0,0.0,0.01,0.011,0.012,0.0,0.093,0.032,0.062,0.075,0.053,0.0,0.015,0.079,0.05,0.0,0.0,0.039,0.0,0.016,0.001,0.048,0.0,0.025,0.057,0.016,0.056,0.017,0.016,0.002,0.04,0.0,0.0,0.015,0.045,0.011,0.006,0.003,0.0,0.091,0.001,0.013,0.0,0.0,0.015,0.124,0.0,0.018,0.0,0.047,0.039,0.007,0.0,0.0,0.035,0.087,0.025,0.007,0.0,0.018,0.087,0.228,0.017,0.009,0.0,0.0,0.0,0.022,0.168,0.0,0.022,0.0,0.012,0.0,0.0,0.0,0.002,0.003,0.0,0.009,0.0,0.0,0.0,0.0,0.009,0.012,0.012,0.01,0.074,0.009,0.018,0.037,0.009,0.023,0.041,0.0,0.0,0.092,0.238,0.0,0.0,0.0,0.001,0.0,0.052,0.031,0.063,0.0,0.331,...,0.038,0.189,0.006,0.174,0.0,0.008,0.022,0.082,0.0,0.0,0.0,0.0,0.006,0.0,0.0,0.0,0.007,0.01,0.208,0.311,0.201,0.113,0.002,0.001,0.021,0.018,0.0,0.153,0.094,0.018,0.0,0.063,0.008,0.0,0.063,0.254,0.0,0.004,0.024,0.013,0.001,0.0,0.002,0.0,0.022,0.052,0.026,0.003,0.006,0.056,0.0,0.028,0.0,0.314,0.005,0.0,0.158,0.138,0.08,0.0,0.151,0.0,0.098,0.087,0.082,0.009,0.061,0.0,0.015,0.104,0.0,0.116,0.021,0.0,0.0,0.016,0.0,0.028,0.028,0.026,0.01,0.08,0.0,0.0,0.165,0.0,0.0,0.047,0.044,0.024,0.0,0.003,0.012,0.119,0.024,0.071,0.0,0.0,0.0,0.0,0.006,0.0,0.025,0.006,0.016,0.012,0.033,0.003,0.0,0.039,0.006,0.339,0.012,0.083,0.0,0.0,0.0,0.009,0.032,0.696,0.007,0.664,0.083,0.024,0.0,0.0,0.00
2,0.0,0.006,0.003,0.062,0.0,0.0,0.0,0.0,0.011,0.0,0.0,0.0,0.049,0.117,0.0,0.018,0.034,0.024,0.0,0.0,0.212,0.001,0.0,0.012,0.098,0.1,0.12,0.054,0.015,0.0,0.0,0.032,0.0,0.01,0.016,0.104,0.0,0.062,0.143,0.003,0.0,0.011,0.002,0.007,0.0,0.025,0.0,0.001,0.003,0.0,0.0,0.0,0.0,0.011,0.319,0.131,0.05,0.046,0.311,0.403,0.011,0.058,0.483,0.006,0.047,0.0,0.022,0.033,0.0,0.0,0.044,0.0,0.013,0.001,0.0,0.0,0.0,0.085,0.075,0.018,0.006,0.0,0.0,0.0,0.011,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.003,0.001,0.0,0.0,0.0,0.0,0.016,0.116,0.091,0.097,0.0,0.0,0.0,0.038,0.003,0.0,0.0,0.01,0.0,0.015,0.005,0.0,0.0,0.059,0.024,0.0,0.026,0.0,0.016,0.004,0.0
component_7,0.0,0.014,0.093,0.011,0.0,0.0,0.023,0.037,0.027,0.025,0.077,0.06,0.012,0.0,0.0,0.026,0.004,0.0,0.003,0.012,0.033,0.0,0.125,0.013,0.035,0.0,0.032,0.0,0.028,0.012,0.057,0.0,0.0,0.0,0.01,0.0,0.0,0.043,0.007,0.0,0.0,0.023,0.037,0.112,0.046,0.0,0.005,0.01,0.007,0.0,0.0,0.031,0.0,0.01,0.029,0.032,0.0,0.06,0.013,0.001,0.115,0.004,0.0,0.027,0.045,0.0,0.0,0.017,0.104,0.021,0.007,0.037,0.0,0.094,0.0,0.024,0.001,0.0,0.003,0.009,0.002,0.358,0.022,0.019,0.091,0.05,0.0,0.0,0.0,0.0,0.009,0.0,0.0,0.135,0.264,0.261,0.0,0.014,0.05,0.009,0.0,0.077,0.098,0.096,0.032,0.076,0.0,0.0,0.0,0.023,0.0,0.0,0.011,0.028,0.0,0.005,0.0,0.0,0.004,0.0,0.083,0.131,0.095,0.068,0.0,0.031,0.02,0.0,0.0,0.0,0.0,0.081,0.0,0.0,0.017,0.027,0.086,0.035,0.059,0.016,0.006,0.005,0.041,0.005,0.0,0.01,0.01,0.0,0.187,0.0,0.0,0.015,0.0,0.032,0.0,0.0,0.018,0.0,0.0,0.028,0.074,0.0,0.002,0.057,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.0,0.045,0.214,0.033,0.0,0.002,0.057,0.025,0.101,0.173,0.01,0.096,0.001,0.036,0.046,0.104,0.005,0.008,0.0,0.0,0.04,0.001,0.001,0.007,0.004,0.009,0.056,0.019,0.014,0.014,0.0,0.145,0.0,0.0,0.075,0.0,0.0,0.0,0.0,0.009,0.0,0.037,0.005,0.0,0.0,0.052,0.0,0.01,0.0,0.033,0.019,0.0,0.0,0.045,0.006,0.054,0.044,0.0,0.0,0.077,0.032,0.011,0.003,0.07,0.013,0.027,0.007,0.0,0.034,0.0,0.0,0.043,0.0,0.022,0.067,0.0,0.151,0.186,0.0,...,0.042,0.0,0.0,0.0,0.001,0.013,0.018,0.0,0.0,0.0,0.214,0.025,0.0,0.043,0.039,0.021,0.097,0.014,0.048,0.149,0.148,0.094,0.01,0.0,0.0,0.017,0.212,0.124,0.025,0.004,0.0,0.035,0.024,0.0,0.0,0.106,0.0,0.0,0.036,0.023,0.017,0.003,0.006,0.0,0.04,0.024,0.025,0.0,0.026,0.073,0.0,0.017,0.029,0.122,0.002,0.0,0.008,0.115,0.016,0.0,0.215,0.033,0.129,0.039,0.0,0.001,0.0,0.017,0.0,0.0,0.21,0.026,0.007,0.036,0.0,0.0,0.107,0.031,0.01,0.016,0.033,0.016,0.036,0.049,0.08,0.002,0.033,0.03,0.002,0.051,0.064,0.0,0.0,0.0,0.02,0.002,0.048,0.042,0.0,0.031,0.0,0.0,0.012,0.002,0.057,0.0,0.0,0.0,0.062,0.012,0.036,0.062,0.0,0.0,0.015,0.0,0.004,0.0,0.064,0.071,0.037,0.095,0.0,0.086,0.0,0.003,0.018,0.047,0.0
19,0.0,0.091,0.027,0.059,0.042,0.079,0.051,0.002,0.0,0.0,0.018,0.263,0.039,0.003,0.05,0.185,0.0,0.012,0.053,0.01,0.005,0.041,0.082,0.023,0.011,0.0,0.025,0.0,0.015,0.064,0.046,0.03,0.086,0.012,0.0,0.046,0.061,0.077,0.015,0.0,0.0,0.016,0.017,0.0,0.045,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.26,0.034,0.0,0.0,0.044,0.146,0.0,0.103,0.259,0.0,0.047,0.01,0.0,0.0,0.0,0.012,0.0,0.026,0.0,0.007,0.006,0.047,0.024,0.0,0.0,0.001,0.0,0.061,0.046,0.041,0.069,0.077,0.024,0.111,0.001,0.0,0.0,0.0,0.028,0.007,0.025,0.0,0.0,0.014,0.121,0.061,0.092,0.157,0.0,0.008,0.059,0.308,0.07,0.125,0.007,0.057,0.0,0.0,0.0,0.0,0.0,0.02,0.015,0.0,0.06,0.0,0.034,0.018,0.0
component_8,0.002,0.0,0.0,0.001,0.005,0.0,0.0,0.0,0.068,0.0,0.0,0.0,0.004,0.002,0.151,0.001,0.129,0.0,0.006,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.001,0.003,0.055,0.143,0.0,0.149,0.001,0.0,0.0,0.141,0.0,0.242,0.071,0.005,0.0,0.067,0.0,0.0,0.012,0.007,0.003,0.0,0.0,0.01,0.009,0.0,0.0,0.135,0.005,0.0,0.0,0.0,0.0,0.005,0.002,0.011,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146,0.0,0.0,0.003,0.034,0.15,0.028,0.0,0.007,0.0,0.003,0.0,0.0,0.0,0.008,0.0,0.0,0.0,0.232,0.0,0.074,0.002,0.0,0.0,0.0,0.007,0.0,0.004,0.0,0.0,0.0,0.005,0.002,0.001,0.0,0.009,0.0,0.0,0.009,0.003,0.008,0.002,0.005,0.002,0.0,0.01,0.053,0.0,0.231,0.0,0.0,0.001,0.0,0.152,0.009,0.0,0.0,0.0,0.011,0.002,0.027,0.0,0.149,0.005,0.0,0.143,0.0,0.013,0.003,0.0,0.069,0.0,0.0,0.109,0.003,0.001,0.0,0.0,0.0,0.013,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.002,0.0,0.004,0.0,0.0,0.0,0.0,0.001,0.0,0.059,0.168,0.0,0.005,0.0,0.002,0.004,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.004,0.083,0.0,0.004,0.0,0.0,0.0,0.0,0.002,0.0,0.019,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.007,0.0,0.0,0.001,0.003,0.005,0.0,0.003,0.006,0.0,0.0,0.145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.002,0.011,0.0,0.008,0.0,0.0,0.064,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045,0.174,...,0.011,0.0,0.013,0.0,0.0,0.0,0.0,0.0,0.005,0.002,0.0,0.004,0.001,0.069,0.0,0.0,0.0,0.0,0.003,0.075,0.05,0.047,0.01,0.077,0.0,0.0,0.0,0.018,0.0,0.008,0.0,0.0,0.019,0.0,0.014,0.02,0.0,0.005,0.004,0.003,0.0,0.072,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.21,0.0,0.005,0.139,0.122,0.009,0.008,0.0,0.272,0.0,0.0,0.0,0.007,0.058,0.0,0.0,0.003,0.0,0.0,0.0,0.0,0.087,0.002,0.004,0.0,0.0,0.001,0.0,0.0,0.001,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015,0.0,0.0,0.002,0.149,0.062,0.0,0.01,0.0,0.0,0.0,0.001,0.0,0.0,0.006,0.0,0.167,0.0,0.146,0.009,0.073,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.007,0.003,0.001,0.0,0.0,0.0,0.0,0.0,0.059,0.007,0.001,0.0,0.0,0.0,0.166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.141,0.0,0.0,0.004,0.246,0.013,0.072,0.0,0.046,0.0,0.001,0.006,0.0,0.0,0.004,0.001,0.0,0
.01,0.0,0.251,0.0,0.012,0.144,0.071,0.077,0.154,0.0,0.008,0.0,0.0,0.0,0.003,0.0,0.138,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0,0.259,0.0,0.005,0.053,0.09,0.0,0.061,0.0,0.004,0.0,0.0,0.0,0.0,0.002,0.0,0.003,0.0,0.0,0.001,0.0,0.01,0.002,0.006,0.0,0.0,0.002,0.0,0.0,0.0,0.069,0.007,0.01,0.148,0.0,0.007,0.003,0.0,0.001,0.0,0.0,0.016,0.0,0.0,0.0,0.0,0.0,0.005,0.0,0.019,0.06,0.0,0.016,0.001,0.003,0.005,0.0,0.0,0.0,0.178,0.0,0.0,0.0,0.014,0.0,0.0


In [28]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row holds one topic's per-term weights and
        must support ``argsort()`` (e.g. a NumPy array).
    feature_names : sequence of str
        Term for each column position of ``model.components_``.
    no_top_words : int
        How many top terms to print per topic.
    topic_names : sequence of str, optional
        Human-readable topic labels. A missing or falsy entry, including a
        list shorter than the number of topics, falls back to the numeric
        ``Topic <ix>`` header instead of raising ``IndexError``.
    """
    for ix, topic in enumerate(model.components_):
        # Guard the lookup so a partial list of names does not raise IndexError.
        label = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending; the reversed slice walks the largest weights first.
        top_ids = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_ids]))

In [29]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
display_topics(
    nmf_model,
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names(),
    15,  # number of words in topic
)


Topic  0
became, recording, came, version, told, took, seemed, known, wrote, original, playing, era, night, played, going

Topic  1
electronic, synths, dance, techno, club, producers, beats, ambient, dancefloor, dj, ep, rhythms, mix, rave, cut

Topic  2
rapper, beats, lil, rapping, hiphop, shit, produced, star, verses, flow, baby, verse, features, guest, trying

Topic  3
songwriter, someone, writing, shes, emotional, songwriting, feeling, singing, folk, alone, acoustic, room, closer, relationship, feelings

Topic  4
piece, jazz, composer, notes, pieces, musicians, based, group, within, ambient, compositions, form, melody, familiar, deep

Topic  5
woman, girl, women, shes, culture, video, wrote, stars, sex, girls, told, singles, today, written, rolling

Topic  6
guitarist, fans, drummer, metal, guitars, indie, noise, lead, show, hardcore, group, bassist, wave, scene, used

Topic  7
indie, help, group, songwriting, means, major, approach, finds, modern, whole, guest, similar, latter, in

In [30]:
# Document-topic weight table: one row per review (labelled by ex_label) and one
# column per component, rounded to three decimals.  Column names are derived from
# doc_topic's width so this cell keeps working if the number of components changes.
H = pd.DataFrame(doc_topic.round(3),
                 index=ex_label,
                 columns=["component_" + str(n) for n in range(1, doc_topic.shape[1] + 1)])
H = H.reset_index()  # move the review label into a regular 'index' column
H

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,the kpop groups latest is part...,0.000,0.131,0.547,0.000,0.140,0.359,0.092,0.028
1,the danish dj courtesys label ...,0.030,0.571,0.000,0.130,0.032,0.000,0.000,0.000
2,the composer and jazz multi in...,0.000,0.000,0.173,0.000,0.575,0.176,0.000,0.000
3,rising new_york fixtures acemo...,0.008,0.667,0.119,0.000,0.000,0.000,0.000,0.016
4,on their latest collaboration ...,0.022,0.164,0.493,0.046,0.180,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...
1174,despite trending toward pop in...,0.000,0.215,0.368,0.000,0.074,0.000,0.000,0.148
1175,on her supposedly final mixtap...,0.000,0.140,0.375,0.039,0.081,0.007,0.000,0.030
1176,though its undermined by its i...,0.000,0.001,0.511,0.241,0.037,0.104,0.000,0.000
1177,on an album that mines failed ...,0.060,0.027,0.014,0.809,0.000,0.000,0.000,0.055


### Masks for specific key where component_1 > Other Components

In [31]:
# Flag reviews whose component_1 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask1 = H['component_1'] > (
    H.filter(like='component_')
     .drop(columns='component_1')
     .max(axis=1, skipna=False)
)
H[mask1]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
5,the latest in springsteens vin...,0.822,0.0,0.018,0.16,0.169,0.0,0.0,0.134
50,buffy sainte maries cosmic ...,0.982,0.039,0.0,0.398,0.325,0.37,0.0,0.0
97,a pure and quiet record fro...,1.07,0.0,0.0,0.596,0.0,0.326,0.076,0.015
122,on the first posthumous album ...,0.373,0.0,0.0,0.292,0.147,0.0,0.0,0.0
180,carole kings tapestry the ...,1.198,0.0,0.0,0.697,0.0,0.059,0.0,0.0
206,the australian singer songwrit...,0.167,0.0,0.0,0.105,0.162,0.138,0.0,0.0
213,a new reissue of the illusory ...,0.256,0.123,0.038,0.179,0.196,0.0,0.0,0.0
234,this five disc set which incl...,0.35,0.121,0.057,0.0,0.0,0.012,0.136,0.052
238,the rolling stones album ...,1.522,0.52,0.0,0.0,0.0,0.0,0.0,0.0
243,five album recorded in the lat...,0.525,0.007,0.0,0.032,0.45,0.0,0.0,0.023


In [32]:
# H[mask1].count()

In [33]:
# result.index[H[mask1].index]

In [34]:
# # Print the articles for those reviews
# index_pos1 = result.index[H[mask1].index]

#### Do these component_1 albums appear to be similar?

In [35]:
# result.loc[index_pos1].head()

In [36]:
# Pitchfork_component1 = result.loc[index_pos1]
# Pitchfork_component1.head()

### Masks for specific key where component_2 > Other Components

In [43]:
# Flag reviews whose component_2 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask2 = H['component_2'] > (
    H.filter(like='component_')
     .drop(columns='component_2')
     .max(axis=1, skipna=False)
)
H[mask2]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
1,the danish dj courtesys label ...,0.03,0.571,0.0,0.13,0.032,0.0,0.0,0.0
3,rising new_york fixtures acemo...,0.008,0.667,0.119,0.0,0.0,0.0,0.0,0.016
6,on his solo album the london ...,0.0,0.318,0.005,0.297,0.0,0.075,0.038,0.015
22,using a fanciful palette of ch...,0.0,0.366,0.0,0.216,0.081,0.021,0.063,0.0
23,the veteran djs first solo ep ...,0.0,0.684,0.0,0.184,0.05,0.06,0.0,0.001
25,the new_york city based concep...,0.0,0.327,0.111,0.161,0.2,0.051,0.0,0.0
30,on his fourth album kevin par...,0.137,0.485,0.119,0.326,0.073,0.0,0.181,0.0
38,the debut from the london musi...,0.0,0.917,0.0,0.0,0.427,0.039,0.0,0.0
41,on his debut solo album under ...,0.0,0.244,0.083,0.049,0.092,0.041,0.18,0.0
42,nicolas jaar returns to his da...,0.004,0.375,0.098,0.0,0.016,0.017,0.118,0.0


In [44]:
# H[mask2].count()

In [45]:
# result.index[H[mask2].index]

In [46]:
# index_pos2 = result.index[H[mask2].index]

In [47]:
# # Print the articles for those reviews
# result.iloc[index_pos2].head()

In [48]:
# Pitchfork_component2 = result.iloc[index_pos2]
# Pitchfork_component2.head()

### Masks for specific key where component_3 > Other Components

In [55]:
# Flag reviews whose component_3 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask3 = H['component_3'] > (
    H.filter(like='component_')
     .drop(columns='component_3')
     .max(axis=1, skipna=False)
)
H[mask3]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,the kpop groups latest is part...,0.0,0.131,0.547,0.0,0.14,0.359,0.092,0.028
4,on their latest collaboration ...,0.022,0.164,0.493,0.046,0.18,0.0,0.0,0.0
21,after a mixtape staking his cl...,0.0,0.104,0.655,0.0,0.035,0.026,0.0,0.046
34,on his collaboration with kenn...,0.0,0.122,0.656,0.0,0.0,0.03,0.0,0.022
35,the rising brooklyn rapper new...,0.0,0.105,0.43,0.072,0.122,0.0,0.04,0.0
39,the enigmatic rapper debut lp ...,0.091,0.014,0.318,0.296,0.084,0.0,0.0,0.0
47,the memphis rapper offers hard...,0.004,0.0,0.445,0.063,0.0,0.0,0.038,0.075
48,the earl sweatshirt collaborat...,0.014,0.071,0.392,0.115,0.186,0.0,0.0,0.0
55,working with lil peep collabor...,0.0,0.097,0.694,0.021,0.0,0.005,0.07,0.0
57,the pop punk stalwarts resist ...,0.094,0.0,0.124,0.0,0.01,0.071,0.123,0.105


In [56]:
# H[mask3].count()

In [57]:
# result.index[H[mask3].index]

In [58]:
# index_pos3 = result.index[H[mask3].index]

In [59]:
# # Print the articles for those reviews
# Pitchfork_component3 = result.iloc[index_pos3]
# Pitchfork_component3.head()

### Masks for specific key where component_4 > Other Components

In [66]:
# Flag reviews whose component_4 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask4 = H['component_4'] > (
    H.filter(like='component_')
     .drop(columns='component_4')
     .max(axis=1, skipna=False)
)
H[mask4]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
9,the band new lp is a rosily op...,0.005,0.151,0.007,0.513,0.0,0.191,0.144,0.159
14,the afghan whigs leaders debut...,0.028,0.132,0.033,0.143,0.072,0.0,0.119,0.0
16,the lanky london outlaw with c...,0.043,0.059,0.0,0.354,0.131,0.122,0.098,0.0
19,maya bons diaristic bedroom ro...,0.0,0.055,0.018,0.313,0.02,0.081,0.0,0.0
24,the pop star returns with a gr...,0.024,0.0,0.267,0.468,0.0,0.246,0.045,0.025
27,the oklahoma roots songwriter ...,0.026,0.017,0.015,0.532,0.056,0.115,0.0,0.056
29,setting aside the retro r&b va...,0.13,0.049,0.035,0.303,0.001,0.029,0.039,0.021
33,on her second album the los_a...,0.007,0.227,0.116,0.363,0.164,0.037,0.012,0.046
36,chicagos lili trifilio writes ...,0.0,0.029,0.079,0.296,0.0,0.164,0.069,0.05
37,originally recorded a decade a...,0.0,0.054,0.0,0.264,0.251,0.0,0.001,0.0


In [67]:
# H[mask4].count()

In [68]:
# result.index[H[mask4].index]

In [69]:
# index_pos4 = result.index[H[mask4].index]

In [70]:
# # Print the articles for those reviews
# Pitchfork_component4 = result.iloc[index_pos4]
# Pitchfork_component4

### Masks for specific key where component_5 > Other Components

In [77]:
# Flag reviews whose component_5 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask5 = H['component_5'] > (
    H.filter(like='component_')
     .drop(columns='component_5')
     .max(axis=1, skipna=False)
)
H[mask5]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
2,the composer and jazz multi in...,0.0,0.0,0.173,0.0,0.575,0.176,0.0,0.0
7,marco del rios song are full o...,0.0,0.062,0.0,0.094,0.207,0.017,0.19,0.0
15,assisted by the falls late mar...,0.119,0.115,0.035,0.084,0.175,0.033,0.062,0.031
18,two archival releases highligh...,0.401,0.0,0.0,0.0,0.742,0.0,0.0,0.004
26,the instrumental duo go all in...,0.133,0.019,0.03,0.113,0.274,0.0,0.023,0.0
31,a new classical sextet known f...,0.035,0.054,0.063,0.012,0.469,0.0,0.0,0.029
46,in pauline oliveros coined ...,0.0,0.0,0.0,0.0,0.843,0.108,0.0,0.023
51,after touring together the ps...,0.013,0.054,0.11,0.0,0.191,0.051,0.082,0.076
63,with unexpected musical contra...,0.0,0.0,0.032,0.279,0.384,0.0,0.0,0.0
70,on the baltimore musicians fir...,0.0,0.075,0.0,0.176,0.36,0.0,0.015,0.037


In [78]:
# result.index[H[mask5].index]

In [79]:
# index_pos5 = result.index[H[mask5].index]

In [80]:
# # Print the articles for those reviews
# Pitchfork_component5 = result.loc[index_pos5]
# Pitchfork_component5

### Masks for specific key where component_6 > Other Components

In [86]:
# Flag reviews whose component_6 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask6 = H['component_6'] > (
    H.filter(like='component_')
     .drop(columns='component_6')
     .max(axis=1, skipna=False)
)
H[mask6]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
12,grimes first project as a bona...,0.0,0.017,0.209,0.251,0.117,0.551,0.146,0.0
13,the uk singer songwriters al...,0.296,0.138,0.0,0.299,0.0,0.441,0.164,0.09
52,with cover song and a couple o...,0.2,0.0,0.0,0.0,0.101,0.213,0.059,0.014
58,the pop star attempts to be al...,0.0,0.093,0.165,0.183,0.0,0.28,0.0,0.049
73,shakiras big crossover reco...,0.0,0.0,0.0,0.0,0.028,2.337,0.0,0.0
105,written from her own perspecti...,0.0,0.084,0.098,0.299,0.0,0.404,0.0,0.111
129,the cryptic youtubers new albu...,0.0,0.179,0.0,0.046,0.169,0.231,0.111,0.0
145,the second album written prod...,0.061,0.274,0.162,0.0,0.0,0.333,0.001,0.018
240,mariah carey the culturalos_a...,0.332,0.0,0.131,0.11,0.214,0.345,0.0,0.247
264,steely dan from their early c...,0.293,0.085,0.0,0.274,0.0,0.358,0.231,0.119


In [87]:
# result.index[H[mask6].index]

In [88]:
# index_pos6 = result.index[H[mask6].index]

In [89]:
# # Print the articles for those reviews
# Pitchfork_component6 = result.loc[index_pos6]
# Pitchfork_component6

### Masks for specific key where component_7 > Other Components

In [96]:
# Flag reviews whose component_7 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask7 = H['component_7'] > (
    H.filter(like='component_')
     .drop(columns='component_7')
     .max(axis=1, skipna=False)
)
H[mask7]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
8,type o negatives third albu...,0.211,0.0,0.0,0.0,0.0,0.707,1.302,0.0
10,the hardcore quartets minute...,0.0,0.0,0.0,0.059,0.033,0.098,0.401,0.019
11,on their second album the met...,0.0,0.12,0.0,0.062,0.101,0.0,0.324,0.129
17,the duo continue their drift f...,0.0,0.099,0.049,0.176,0.0,0.005,0.234,0.072
32,on their fourth solo release i...,0.0,0.077,0.0,0.298,0.176,0.114,0.316,0.0
43,on their dark and impressionis...,0.107,0.0,0.0,0.225,0.083,0.031,0.243,0.072
54,alongside producer gil norton ...,0.008,0.083,0.132,0.034,0.057,0.0,0.281,0.032
80,scottish indie pop musician pa...,0.127,0.07,0.0,0.286,0.012,0.0,0.328,0.031
87,these unruly british punks sou...,0.052,0.091,0.074,0.089,0.061,0.0,0.254,0.024
93,wolf parades latest album play...,0.055,0.07,0.041,0.124,0.0,0.0,0.281,0.111


In [97]:
# result.index[H[mask7].index]

In [98]:
# index_pos7 = result.index[H[mask7].index]

In [99]:
# # Print the articles for those reviews
# Pitchfork_component7 = result.loc[index_pos7]
# Pitchfork_component7

### Masks for specific key where component_8 > Other Components

In [106]:
# Flag reviews whose component_8 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask8 = H['component_8'] > (
    H.filter(like='component_')
     .drop(columns='component_8')
     .max(axis=1, skipna=False)
)
H[mask8]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
20,with its mix of rustic lullabi...,0.076,0.102,0.0,0.117,0.073,0.0,0.204,0.224
366,with harmonized guitars sprin...,0.1,0.102,0.172,0.0,0.0,0.159,0.175,0.239
444,an essential greatest hits ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.983
836,pavements divisive finalos_...,0.37,0.0,0.0,0.0,0.0,0.0,0.675,2.366


In [107]:
# result.index[H[mask8].index]

In [108]:
# index_pos8 = result.index[H[mask8].index]

In [109]:
# # Print the articles for those reviews
# Pitchfork_component8 = result.iloc[index_pos8]
# Pitchfork_component8

## Pull in Test_df

In [133]:
Test_df

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
0,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...
1,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...
2,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...
3,Taylor M. Clark,Common,Like Water for Chocolate,Rap,8.7,man about five years ago i was so damn white ...
4,Philip Sherburne,Aphex Twin,Selected Ambient Works Volume II,Electronic,10.0,an album that changed ambient music forever...
5,Philip Sherburne,The KLF,Chill Out,Electronic,8.9,the klfs sample heavy dreamscape one of th...
6,Sheldon Pearce,Dr. Dre,The Chronic,Rap,10.0,the timeless debut from dr dre a histor...
7,Stuart Berman,Fontaines D.C.,Dogrel,Rock,8.0,the steely dublin post punk band infuse the bi...
8,Eric Harvey,Peter Gabriel,So,Rock,9.1,peter gabriels art pop masterpiece a tur...


### Apply dimensionality reduction on Test_df

In [136]:
# Vectorize the test reviews with count_vectorizer (presumably fitted earlier in
# the notebook; transform() reuses its vocabulary), then transpose so that rows
# are terms and columns are documents.
# NOTE(review): `doc_word1` is rebound further down to vectorizer.transform(...)
# (documents as rows); re-running cells out of order will mix the two layouts.
doc_word1 = count_vectorizer.transform(Test_df.Contents).transpose()
# doc_word1 = count_vectorizer.fit_transform(Test_df.Contents).transpose()

In [137]:
# Term-by-document count table: one row per vocabulary term, one column per test review.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it.
pd.DataFrame(
    doc_word1.toarray(),
    index=(count_vectorizer.get_feature_names_out()
           if hasattr(count_vectorizer, "get_feature_names_out")
           else count_vectorizer.get_feature_names()),
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
aaa,0,0,0,0,0,0,0,0,0
aaa powerline,0,0,0,0,0,0,0,0,0
aaa secret,0,0,0,0,0,0,0,0,0
aacm,0,0,0,0,0,0,0,0,0
aacm owner,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
zwei album,0,0,0,0,0,0,0,0,0
zwei grammvia,0,0,0,0,0,0,0,0,0
zz,0,0,0,0,0,0,0,0,0
zz top,0,0,0,0,0,0,0,0,0


### Convert to gensim

In [139]:
## Convert sparse matrix of counts to a gensim corpus
# When the cells run in order, doc_word1 is the transposed (term x document)
# count matrix here, so each column is one document.
# NOTE(review): `matutils` must already be imported (presumably
# `from gensim import matutils`) — the import is not visible in this section.
corpus = matutils.Sparse2Corpus(doc_word1)

### Map matrix rows to words (tokens)

In [140]:
# Save a mapping (dict) of row id to word (token) for later use by gensim

In [141]:
# Invert the vectorizer's term -> column-id vocabulary into a column-id -> term lookup.
id2word = {col_id: term for term, col_id in count_vectorizer.vocabulary_.items()}

In [142]:
len(id2word)

478842

### NMF

In [143]:
# Short row labels for the test reviews: the first 30 characters plus an ellipsis.
ex_label1 = ["".join((text[:30], "...")) for text in Test_df.Contents]
ex_label1

['henry steinway’s second full l...',
 'after years of searching  aria...',
 '  set of arena friendly song a...',
 'man  about five years ago i wa...',
 '   an album that changed ambie...',
 '   the klfs sample heavy dream...',
 '   the timeless   debut from d...',
 'the steely dublin post punk ba...',
 '   peter gabriels   art pop ma...']

In [144]:
## Run NMF with parameters from vectorizer_list[17]:[5, 0.19, 12602]
# Re-vectorize the test reviews with `vectorizer` (the one whose feature names
# are passed to display_topics alongside nmf_model).  This rebinds `doc_word1`,
# which earlier held the transposed count_vectorizer matrix; rows are now documents.
doc_word1 = vectorizer.transform(Test_df.Contents)

In [145]:
# doc_topic1 = nmf_model.fit_transform(doc_word1)
# Use transform (not fit_transform) so the test reviews are projected onto the
# topics nmf_model has already learned instead of refitting the model on them.
doc_topic1 = nmf_model.transform(doc_word1)

In [146]:
doc_topic1[1]

array([0.07864642, 0.06934046, 0.15795363, 0.38117965, 0.0826694 ,
       0.06587239, 0.01167511, 0.05478524])

In [147]:
doc_word1.shape

(9, 9638)

In [148]:
doc_word.shape

(1179, 9638)

In [149]:
doc_topic1.shape

(9, 8)

In [150]:
doc_topic.shape

(1179, 8)

In [151]:
# Document-topic weight table for the test reviews: one row per review (labelled
# by ex_label1) and one column per component, rounded to three decimals.  Column
# names are derived from doc_topic1's width so this cell keeps working if the
# number of components changes.
H1 = pd.DataFrame(doc_topic1.round(3),
                  index=ex_label1,
                  columns=["component_" + str(n) for n in range(1, doc_topic1.shape[1] + 1)])
H1 = H1.reset_index()  # move the review label into a regular 'index' column
H1

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,henry steinway’s second full l...,0.031,0.315,0.245,0.064,0.0,0.018,0.06,0.055
1,after years of searching aria...,0.079,0.069,0.158,0.381,0.083,0.066,0.012,0.055
2,set of arena friendly song a...,0.102,0.106,0.149,0.036,0.039,0.07,0.21,0.086
3,man about five years ago i wa...,0.074,0.065,0.304,0.0,0.103,0.145,0.0,0.006
4,an album that changed ambie...,0.426,0.811,0.039,0.115,0.397,0.41,0.364,0.125
5,the klfs sample heavy dream...,0.582,0.671,0.151,0.028,0.687,0.258,0.33,0.054
6,the timeless debut from d...,0.605,0.0,0.817,0.034,0.122,0.355,0.324,0.106
7,the steely dublin post punk ba...,0.0,0.07,0.147,0.16,0.179,0.102,0.195,0.093
8,peter gabriels art pop ma...,0.401,0.183,0.185,0.247,0.535,0.532,0.385,0.205


### A reminder of what the Topics are

In [152]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
display_topics(
    nmf_model,
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names(),
    15,  # number of words in topic
)


Topic  0
became, recording, came, version, told, took, seemed, known, wrote, original, playing, era, night, played, going

Topic  1
electronic, synths, dance, techno, club, producers, beats, ambient, dancefloor, dj, ep, rhythms, mix, rave, cut

Topic  2
rapper, beats, lil, rapping, hiphop, shit, produced, star, verses, flow, baby, verse, features, guest, trying

Topic  3
songwriter, someone, writing, shes, emotional, songwriting, feeling, singing, folk, alone, acoustic, room, closer, relationship, feelings

Topic  4
piece, jazz, composer, notes, pieces, musicians, based, group, within, ambient, compositions, form, melody, familiar, deep

Topic  5
woman, girl, women, shes, culture, video, wrote, stars, sex, girls, told, singles, today, written, rolling

Topic  6
guitarist, fans, drummer, metal, guitars, indie, noise, lead, show, hardcore, group, bassist, wave, scene, used

Topic  7
indie, help, group, songwriting, means, major, approach, finds, modern, whole, guest, similar, latter, in

### Where is component_1/topic 0 greater than all the others?

In [153]:
# Flag test reviews whose component_1 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask11 = H1['component_1'] > (
    H1.filter(like='component_')
      .drop(columns='component_1')
      .max(axis=1, skipna=False)
)
H1[mask11]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8


In [154]:
# Translate the row positions selected by mask11 (H1 carries a fresh 0..n-1
# index after reset_index) into the matching Test_df index labels.
index_pos11 = Test_df.index[H1.index[mask11]]

In [155]:
# index_pos11 holds index *labels* taken from Test_df.index, so select by label;
# .iloc would treat them as positions and only agrees while the index is 0..n-1.
Test_df.loc[index_pos11]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents


### Where is component 2/topic 1 greater than all the others?

In [156]:
# Flag test reviews whose component_2 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask22 = H1['component_2'] > (
    H1.filter(like='component_')
      .drop(columns='component_2')
      .max(axis=1, skipna=False)
)
H1[mask22]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,henry steinway’s second full l...,0.031,0.315,0.245,0.064,0.0,0.018,0.06,0.055
4,an album that changed ambie...,0.426,0.811,0.039,0.115,0.397,0.41,0.364,0.125


In [157]:
# Translate the row positions selected by mask22 (H1 carries a fresh 0..n-1
# index after reset_index) into the matching Test_df index labels.
index_pos12 = Test_df.index[H1.index[mask22]]

In [158]:
# index_pos12 holds index *labels* taken from Test_df.index, so select by label;
# .iloc would treat them as positions and only agrees while the index is 0..n-1.
Test_df.loc[index_pos12]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
0,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...
4,Philip Sherburne,Aphex Twin,Selected Ambient Works Volume II,Electronic,10.0,an album that changed ambient music forever...


### Where is component 3/topic 2 greater than all the others?

In [159]:
# Flag test reviews whose component_3 weight is strictly larger than every other
# component's weight, i.e. larger than the row-wise maximum of the rest.
# skipna=False: a NaN among the other weights yields False, as chained '>' would.
mask33 = H1['component_3'] > (
    H1.filter(like='component_')
      .drop(columns='component_3')
      .max(axis=1, skipna=False)
)
H1[mask33]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
3,man about five years ago i wa...,0.074,0.065,0.304,0.0,0.103,0.145,0.0,0.006
6,the timeless debut from d...,0.605,0.0,0.817,0.034,0.122,0.355,0.324,0.106


In [160]:
# Collect the Test_df index entries for the reviews selected by mask33
index_pos13 = Test_df.index[H1.loc[mask33].index]

In [161]:
# Display the Test_df rows at the positions collected in index_pos13
Test_df.iloc[index_pos13]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
3,Taylor M. Clark,Common,Like Water for Chocolate,Rap,8.7,man about five years ago i was so damn white ...
6,Sheldon Pearce,Dr. Dre,The Chronic,Rap,10.0,the timeless debut from dr dre a histor...


### Where is component 4/topic 3 greater than all the others?

In [162]:
# Flag the reviews whose component_4 weight is strictly greater than every other component weight
mask44 = (
    H1[['component_1', 'component_2', 'component_3', 'component_5',
        'component_6', 'component_7', 'component_8']]
    .lt(H1['component_4'], axis=0)
    .all(axis=1)
)
H1[mask44]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
1,after years of searching aria...,0.079,0.069,0.158,0.381,0.083,0.066,0.012,0.055


In [163]:
# Collect the Test_df index entries for the reviews selected by mask44
index_pos14 = Test_df.index[H1.loc[mask44].index]

In [164]:
# Display the Test_df rows at the positions collected in index_pos14
Test_df.iloc[index_pos14]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
1,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...


### Where is component 5/topic 4 greater than all the others?

In [165]:
# Flag the reviews whose component_5 weight is strictly greater than every other component weight
mask55 = (
    H1[['component_1', 'component_2', 'component_3', 'component_4',
        'component_6', 'component_7', 'component_8']]
    .lt(H1['component_5'], axis=0)
    .all(axis=1)
)
H1[mask55]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
5,the klfs sample heavy dream...,0.582,0.671,0.151,0.028,0.687,0.258,0.33,0.054
8,peter gabriels art pop ma...,0.401,0.183,0.185,0.247,0.535,0.532,0.385,0.205


In [166]:
# Collect the Test_df index entries for the reviews selected by mask55
index_pos15 = Test_df.index[H1.loc[mask55].index]

In [167]:
# Display the Test_df rows at the positions collected in index_pos15
Test_df.iloc[index_pos15]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
5,Philip Sherburne,The KLF,Chill Out,Electronic,8.9,the klfs sample heavy dreamscape one of th...
8,Eric Harvey,Peter Gabriel,So,Rock,9.1,peter gabriels art pop masterpiece a tur...


### Where is component 6/topic 5 greater than all the others?

In [168]:
# Flag the reviews whose component_6 weight is strictly greater than every other component weight
mask66 = (
    H1[['component_1', 'component_2', 'component_3', 'component_4',
        'component_5', 'component_7', 'component_8']]
    .lt(H1['component_6'], axis=0)
    .all(axis=1)
)
H1[mask66]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8


In [169]:
# Disabled lookup -- presumably because mask66 matched no reviews in the output above (confirm before re-enabling)
# # Print the articles for those reviews
# index_pos16 = Test_df.index[H1[mask66].index]

In [170]:
# Disabled: index_pos16 is never assigned because the lookup that would define it is commented out
# Test_df.iloc[index_pos16]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents


### Where is component 7/topic 6 greater than all the others?

In [171]:
# Flag the reviews whose component_7 weight is strictly greater than every other component weight
mask77 = (
    H1[['component_1', 'component_2', 'component_3', 'component_4',
        'component_5', 'component_6', 'component_8']]
    .lt(H1['component_7'], axis=0)
    .all(axis=1)
)
H1[mask77]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
2,set of arena friendly song a...,0.102,0.106,0.149,0.036,0.039,0.07,0.21,0.086
7,the steely dublin post punk ba...,0.0,0.07,0.147,0.16,0.179,0.102,0.195,0.093


In [215]:
# Collect the Test_df index entries for the reviews selected by mask77
index_pos17 = Test_df.index[H1.loc[mask77].index]

In [216]:
# Display the Test_df rows at the positions collected in index_pos17
Test_df.iloc[index_pos17]

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
2,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...
7,Stuart Berman,Fontaines D.C.,Dogrel,Rock,8.0,the steely dublin post punk band infuse the bi...


### Where is component 8/topic 7 greater than all the others?

In [172]:
# Flag the reviews whose component_8 weight is strictly greater than every other component weight
mask88 = (
    H1[['component_1', 'component_2', 'component_3', 'component_4',
        'component_5', 'component_6', 'component_7']]
    .lt(H1['component_8'], axis=0)
    .all(axis=1)
)
H1[mask88]

Unnamed: 0,index,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8


In [173]:
# Disabled lookup -- presumably because mask88 matched no reviews in the output above (confirm before re-enabling)
# NOTE(review): this line indexes `result`, whereas the sibling cells index `Test_df` -- confirm which is intended
# # Print the articles for those reviews
# index_pos18 = result.index[H1[mask88].index]

In [174]:
# Disabled: index_pos18 is never assigned because the lookup that would define it is commented out
# Test_df.iloc[index_pos18]

## Calculating the cosine similarities

In [175]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

### Write a for loop for cosine similarity:
- For each test album (e.g. Ariana Grande's album, `doc_topic1[1]`), compare its topic vector with every row of `doc_topic` (rows 0 through X) and return the albums that are the most similar

In [176]:
# Index labels of the modeled reviews, as a plain Python list
Pitch_list = [label for label in modeled_df.index]

In [177]:
# Index labels of the test reviews, as a plain Python list
Test_list = [label for label in Test_df.index]

In [178]:
# Cast every modeled-review label to int, replacing the contents of the existing list in place
Pitch_list[:] = [int(label) for label in Pitch_list]

In [179]:
# modeled_df index entries for the rows of H selected by mask4.
# NOTE(review): H and mask4 are defined earlier in the notebook; list_compare is not used in the cells that follow -- confirm it is still needed
list_compare = list(modeled_df.index[H[mask4].index])

### Get all cosine similarities for test vs. modeled

In [180]:
# Index positions of the test albums (built from Test_df.index)
Test_list

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [181]:
# Index positions of the modeled albums (built from modeled_df.index); display disabled because the list is long
# List from 0 - 1186
# NOTE(review): the CS_df output further down tops out at ModelID 1178 -- confirm the upper bound stated above
# Pitch_list

In [182]:
# Cosine similarity between every test review and every modeled review.
# The outer loop walks the modeled reviews and the inner loop walks the test
# reviews, so in the resulting frame TestID varies fastest.
cos_list = []

for model_id in Pitch_list:
    for test_id in Test_list:
        try:
            # Passing the two topic vectors together yields a 2x2 matrix whose
            # off-diagonal entries hold the test-vs-model similarity.
            pair_similarity = cosine_similarity((doc_topic1[test_id], doc_topic[model_id])).round(3)
        except Exception as err:
            # Keep going on a bad pair, but say which pair failed and why.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
            print('Skipping test {} vs. model {}: {!r}'.format(test_id, model_id, err))
            continue
        cos_list.append((test_id, model_id, pair_similarity))

# Put it in a DF: one row per (test, model) pair, with the 2x2 matrix in the last column
CS_df = pd.DataFrame(cos_list, columns=['TestID', 'ModelID', 'CosineSimiliarity'])

In [183]:
# TestID holds the test review's index number
# ModelID holds the modeled review's index number
# CosineSimiliarity holds the 2x2 similarity matrix computed for that pair
CS_df

Unnamed: 0,TestID,ModelID,CosineSimiliarity
0,0,0,"[[1.0, 0.663], [0.663, 1.0]]"
1,1,0,"[[1.0, 0.438], [0.438, 1.0]]"
2,2,0,"[[1.0, 0.669], [0.669, 1.0]]"
3,3,0,"[[1.0, 0.959], [0.959, 1.0]]"
4,4,0,"[[1.0, 0.464], [0.464, 1.0]]"
...,...,...,...
10606,4,1178,"[[1.0, 0.151], [0.151, 1.0]]"
10607,5,1178,"[[1.0, 0.237], [0.237, 1.0]]"
10608,6,1178,"[[1.0, 0.719], [0.719, 1.0]]"
10609,7,1178,"[[1.0, 0.59], [0.59, 1.0]]"


### I just want the second number in each array of the `CosineSimiliarity` column


In [184]:
# Flatten the first element of every matrix row, pair by pair.
# Each 2x2 matrix contributes two values: its diagonal entry, then the similarity.
csdf_list = [
    matrix_row[0]
    for pair_matrix in CS_df.CosineSimiliarity
    for matrix_row in pair_matrix
]

In [185]:
# Every second entry of csdf_list (starting at position 1) is a pair's similarity score;
# overwrite the matrix column with those scalars
CS_df['CosineSimiliarity'] = csdf_list[1::2]

In [186]:
# Keepme is a second name for the same CS_df object (no copy is made),
# so any later change to one is visible through the other
Keepme = CS_df
Keepme

Unnamed: 0,TestID,ModelID,CosineSimiliarity
0,0,0,0.663
1,1,0,0.438
2,2,0,0.669
3,3,0,0.959
4,4,0,0.464
...,...,...,...
10606,4,1178,0.151
10607,5,1178,0.237
10608,6,1178,0.719
10609,7,1178,0.590


### Return the top 3 album recommendations for each test

In [187]:
# For each test review keep the three highest similarity scores;
# ModelID is moved into the index so it stays attached to each score
Test_rec = (
    Keepme
    .set_index(['ModelID'])
    .groupby(['TestID'])['CosineSimiliarity']
    .nlargest(3)
)

In [188]:
# Wrap the top-3 similarity Series in a DataFrame
Test_rec_df = pd.DataFrame(Test_rec)

In [189]:
# test_reset is another name for Test_rec_df (same object, not a copy)
test_reset = Test_rec_df

In [190]:
# Move the index levels back into ordinary columns (TestID and ModelID in the output below)
Test_reset_df = test_reset.reset_index()

In [191]:
# Display the top-3 recommendations per test review
Test_reset_df

Unnamed: 0,TestID,ModelID,CosineSimiliarity
0,0,302,0.969
1,0,853,0.969
2,0,273,0.956
3,1,507,0.978
4,1,635,0.962
5,1,611,0.958
6,2,1129,0.931
7,2,379,0.923
8,2,1032,0.916
9,3,309,0.965


In [192]:
# Spot check: similarity matrix for test review 2 against modeled review 911
cosine_similarity((doc_topic1[2], doc_topic[911])).round(3)

array([[1.   , 0.862],
       [0.862, 1.   ]])

In [193]:
# Disabled: would show test review 2, the test side of the spot check above
# Test_df.iloc[2]

In [194]:
# Disabled: would show modeled review 911, the model side of the spot check above
# modeled_df.iloc[911]

In [195]:
# Fetch the full review row behind each recommendation pair, in Test_reset_df row order:
# the test review from Test_df and the matched review from modeled_df
test_results = [Test_df.iloc[testid] for testid in Test_reset_df.TestID]
model_results = [modeled_df.iloc[modelid] for modelid in Test_reset_df.ModelID]

In [196]:
# Stack the collected test rows into a frame and turn its row labels into a regular column
test_results_2 = (
    pd.DataFrame(test_results)
    .reset_index()
)

In [197]:
# Discard the 'index' column
test_results_3 = test_results_2.drop('index', axis='columns')

In [217]:
# Reset the index again, turning the current row labels into an 'index' column
# (it shows up as index_x in the merged output below)
test_results_4 = test_results_3.reset_index()
# test_results_4

In [199]:
# Stack the collected modeled rows into a frame and turn its row labels into a regular column
model_results_2 = (
    pd.DataFrame(model_results)
    .reset_index()
)

In [200]:
# Discard the 'index' column
model_results_3 = model_results_2.drop('index', axis='columns')

In [218]:
# Reset the index again, turning the current row labels into an 'index' column
# (it shows up as index_y in the merged output below)
model_results_4 = model_results_3.reset_index()
# model_results_4

### Join test_results_4 and model_results_4 on index

In [202]:
# Pair each test row with its recommendation row by matching row labels;
# overlapping column names get the default _x (test) / _y (model) suffixes
mergedDF = test_results_4.merge(
    model_results_4,
    left_index=True,
    right_index=True,
)
# mergedDF

Unnamed: 0,index_x,Author_x,Artist_x,Album_Name_x,Genre_x,Score_x,Contents_x,index_y,Author_y,Artist_y,Album_Name_y,Genre_y,Score_y,Contents_y
0,0,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...,0,Noah Yoo,Clams Casino,Moon Trip Radio,Rap,7.7,on his second album the cloud rap innovator h...
1,1,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...,1,Michelle Kim,J-E-T-S,ZOOSPA,Electronic,7.5,the first collaborative full length by produce...
2,2,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,henry steinway’s second full length largely ab...,2,Noah Yoo,TNGHT,II EP,Electronic,7.7,after&bowing out at the peak of the trap wave ...
3,3,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...,3,Kevin Lozano,Lower Dens,The Competition,Rock,6.7,the baltimore synth pop band latest might be t...
4,4,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...,4,Katherine St. Asaph,Clairo,Immunity,Pop/R&B,8.0,on her debut album the young viral star moves...
5,5,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,after years of searching ariana grande has fo...,5,Matthew Strauss,Bon Iver,"i,i",Rock,8.8,on his fourth album justin vernon reassembles...
6,6,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...,6,Ian Cohen,Foals,Everything Not Saved Will Be Lost Part 1,Rock,6.0,the uk rockers summon all their crowd pleasing...
7,7,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...,7,Evan Rytlewski,Third Eye Blind,Screamer,Rock,6.9,on the band sixth album frontman stephan jenk...
8,8,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,set of arena friendly song about california ...,8,Madison Bloom,Show Me the Body,Dog Whistle,Rock,6.8,the hardcore trio grapples with the ongoing de...
9,9,Taylor M. Clark,Common,Like Water for Chocolate,Rap,8.7,man about five years ago i was so damn white ...,9,Eric Torres,Various Artists,Charlie's Angels (Original Motion Picture Soun...,No genre,5.4,ariana grande executive produces the latest en...


#### Drop certain columns 

In [203]:
# Remove the helper index columns and the review text from both sides of the merge
fixed_merge = mergedDF.drop(columns=[
    'index_x', 'Contents_x',
    'index_y', 'Contents_y',
])

#### Fix column names

In [204]:
# Replace the merge suffixes with readable ones:
# _x columns came from the test reviews, _y columns from the modeled reviews
fixed_renamed = fixed_merge.rename(columns={
    'Author_x': 'Author_test',
    'Artist_x': 'Artist_test',
    'Album_Name_x': 'Album_Name_test',
    'Genre_x': 'Genre_test',
    'Score_x': 'Score_test',
    'Author_y': 'Author_model',
    'Artist_y': 'Artist_model',
    'Album_Name_y': 'Album_Name_model',
    'Genre_y': 'Genre_model',
    'Score_y': 'Score_model',
})

fixed_renamed

Unnamed: 0,Author_test,Artist_test,Album_Name_test,Genre_test,Score_test,Author_model,Artist_model,Album_Name_model,Genre_model,Score_model
0,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,Noah Yoo,Clams Casino,Moon Trip Radio,Rap,7.7
1,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,Michelle Kim,J-E-T-S,ZOOSPA,Electronic,7.5
2,Larry Fitzmaurice,RL Grime,Nova,Electronic,4.1,Noah Yoo,TNGHT,II EP,Electronic,7.7
3,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,Kevin Lozano,Lower Dens,The Competition,Rock,6.7
4,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,Katherine St. Asaph,Clairo,Immunity,Pop/R&B,8.0
5,Jillian Mapes,Ariana Grande,Sweetener,Pop/R&B,8.1,Matthew Strauss,Bon Iver,"i,i",Rock,8.8
6,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,Ian Cohen,Foals,Everything Not Saved Will Be Lost Part 1,Rock,6.0
7,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,Evan Rytlewski,Third Eye Blind,Screamer,Rock,6.9
8,Rob Mitchum,Red Hot Chili Peppers,Stadium Arcadium,Rock,4.7,Madison Bloom,Show Me the Body,Dog Whistle,Rock,6.8
9,Taylor M. Clark,Common,Like Water for Chocolate,Rap,8.7,Eric Torres,Various Artists,Charlie's Angels (Original Motion Picture Soun...,No genre,5.4


## Recommender Based on Album Names??

https://github.com/nicolasfguillaume/Recommender-Systems-Making-Movies-Recommendation/blob/master/MovieLens%20(NMF)%20v1.ipynb

In [205]:
# Peek at the first rows of the modeled reviews
modeled_df.head()

Unnamed: 0,Author,Artist,Album_Name,Genre,Score,Contents
0,Sheldon Pearce,BTS,Map of the Soul: 7,Pop/R&B,6.3,the kpop groups latest is part memoir part fa...
1,Ruth Saxelby,Various Artists,Kulør 006,Electronic,7.5,the danish dj courtesys label pivots from cope...
2,Cat Zhang,Sunny Jain,Wild Wild East,Experimental,6.8,the composer and jazz multi instrumentalist ex...
3,Andy Beta,AceMoMA,A New Dawn,Electronic,7.6,rising new_york fixtures acemo and moma ready ...
4,Paul A. Thompson,Boldy James,The Price of Tea in China,Rap,8.0,on their latest collaboration the detroit rap...


In [206]:
# Predictor variable: album names of the test reviews
X = Test_df['Album_Name'].values
# Target variable: album names of the modeled reviews
y = modeled_df['Album_Name'].values

In [None]:
# TODO: Take 3 example albums, get reviews, and put them in and see which ones they were most similar to 