# Dictionary Methods - Solutions to Challenges

# Part 0: Basic dictionary methods

## 0.1 Pre-processing

In [1]:
#import the necessary packages
import pandas as pd
import nltk
nltk.download('punkt')
from nltk import word_tokenize
import string

#read the Music Reviews corpus into a Pandas dataframe
df = pd.read_csv("../day-2/data/BDHSI2016_music_reviews.csv", encoding='utf-8', sep = '\t')

#view the dataframe
df

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,album,artist,genre,release_date,critic,score,body
0,Don't Panic,All Time Low,Pop/Rock,2012-10-09 00:00:00,Kerrang!,74.0,While For Baltimore proves they can still writ...
1,Fear and Saturday Night,Ryan Bingham,Country,2015-01-20 00:00:00,Uncut,70.0,There's nothing fake about the purgatorial nar...
2,The Way I'm Livin',Lee Ann Womack,Country,2014-09-23 00:00:00,Q Magazine,84.0,All life's disastrous lows are here on a caree...
3,Doris,Earl Sweatshirt,Rap,2013-08-20 00:00:00,Pitchfork,82.0,"With Doris, Odd Future’s Odysseus is finally b..."
4,Giraffe,Echoboy,Rock,2003-02-25 00:00:00,AllMusic,71.0,Though Giraffe is definitely Echoboy's most im...
...,...,...,...,...,...,...,...
4996,Outer South,Conor Oberst And The Mystic Valley Band,Indie,2009-05-05 00:00:00,Slant Magazine,67.0,The result is an album that's unfortunately ba...
4997,On An Island,David Gilmour,Rock,2006-03-07 00:00:00,E! Online,67.0,"In the end, Island makes Dave sound like he's ..."
4998,Movement,Gossip,Indie,2003-05-06 00:00:00,Uncut,81.0,Beth Ditto's remarkable gospel holler and ferv...
4999,Locked Down,Dr. John,Pop/Rock,2012-04-03 00:00:00,PopMatters,86.0,"Dr. John is Dr. John. He's a star, and is on f..."


In [2]:
# Remove digits from `body` column:
df['body'] = df['body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

### Challenge - SOLUTION
Let's review preprocessing using the `df` we just created. This is a little different from yesterday's practice using strings and lists, but the essentials are the same. To see the key new things you'll likely want to use, refer to the example of removing digits from the previous cell--especially note the list comprehension and this useful strategy: 
`df['column'].apply(lambda x: function(x))`. 

To preprocess `df`, take these steps:
* Create a new column in `df` called `body_tokens` that contains a lower cased version of `df['body']`. 
* Tokenize the `body_tokens` column of `df` using one of the methods we worked with yesterday. 
* Remove punctuation from `body_tokens`. 
* Create a new column that contains the length of the token list in each row. We will use this later to normalize the dictionary counts. 
* Reflect: What other pre-processing steps might we use?

In [3]:
# First create a new column called "body_tokens"
# transform to lowercase by applying x.lower() OR the string function str.lower()
df['body_tokens'] = df['body'].apply(lambda x: x.lower())

# view output
df['body_tokens'] 

0       while for baltimore proves they can still writ...
1       there's nothing fake about the purgatorial nar...
2       all life's disastrous lows are here on a caree...
3       with doris, odd future’s odysseus is finally b...
4       though giraffe is definitely echoboy's most im...
                              ...                        
4996    the result is an album that's unfortunately ba...
4997    in the end, island makes dave sound like he's ...
4998    beth ditto's remarkable gospel holler and ferv...
4999    dr. john is dr. john. he's a star, and is on f...
5000    their work, especially that displayed on refin...
Name: body_tokens, Length: 5001, dtype: object

In [4]:
#tokenize
df['body_tokens'] = df['body_tokens'].apply(word_tokenize)

#view output
print(df['body_tokens'])

0       [while, for, baltimore, proves, they, can, sti...
1       [there, 's, nothing, fake, about, the, purgato...
2       [all, life, 's, disastrous, lows, are, here, o...
3       [with, doris, ,, odd, future, ’, s, odysseus, ...
4       [though, giraffe, is, definitely, echoboy, 's,...
                              ...                        
4996    [the, result, is, an, album, that, 's, unfortu...
4997    [in, the, end, ,, island, makes, dave, sound, ...
4998    [beth, ditto, 's, remarkable, gospel, holler, ...
4999    [dr., john, is, dr., john, ., he, 's, a, star,...
5000    [their, work, ,, especially, that, displayed, ...
Name: body_tokens, Length: 5001, dtype: object


In [5]:
punctuations = list(string.punctuation)

#remove punctuation. Note the list comprehension used with lambda x
df['body_tokens'] = df['body_tokens'].apply(lambda token: [char for char in token if char not in punctuations])

#view output
print(df['body_tokens'])

0       [while, for, baltimore, proves, they, can, sti...
1       [there, 's, nothing, fake, about, the, purgato...
2       [all, life, 's, disastrous, lows, are, here, o...
3       [with, doris, odd, future, ’, s, odysseus, is,...
4       [though, giraffe, is, definitely, echoboy, 's,...
                              ...                        
4996    [the, result, is, an, album, that, 's, unfortu...
4997    [in, the, end, island, makes, dave, sound, lik...
4998    [beth, ditto, 's, remarkable, gospel, holler, ...
4999    [dr., john, is, dr., john, he, 's, a, star, an...
5000    [their, work, especially, that, displayed, on,...
Name: body_tokens, Length: 5001, dtype: object


In [6]:
# Get total token count for each row:
df['token_count'] = df['body_tokens'].apply(lambda x: len(x))

print(df[['body_tokens','token_count']])

                                            body_tokens  token_count
0     [while, for, baltimore, proves, they, can, sti...           38
1     [there, 's, nothing, fake, about, the, purgato...           28
2     [all, life, 's, disastrous, lows, are, here, o...           13
3     [with, doris, odd, future, ’, s, odysseus, is,...           18
4     [though, giraffe, is, definitely, echoboy, 's,...           51
...                                                 ...          ...
4996  [the, result, is, an, album, that, 's, unfortu...           27
4997  [in, the, end, island, makes, dave, sound, lik...           17
4998  [beth, ditto, 's, remarkable, gospel, holler, ...           25
4999  [dr., john, is, dr., john, he, 's, a, star, an...           18
5000  [their, work, especially, that, displayed, on,...           28

[5001 rows x 2 columns]


**Reflection:**<br>
There are many other preprocessing steps, including stopword removal, normalizing text (removing URLs and hashtags), stripping whitespace, counting word frequencies, and removing infrequent words.

## 0.2. Creating dictionary counts

In [7]:
pos_sent = open("../day-2/data/positive_words.txt", encoding='utf-8').read()
neg_sent = open("../day-2/data/negative_words.txt", encoding='utf-8').read()

In [8]:
#remember the split function? We'll split on the newline character (\n) to create a list
positive_words=pos_sent.split('\n')
negative_words=neg_sent.split('\n')

In [9]:
#count number of words in each list
print(len(positive_words))
print(len(negative_words))

2231
3906


### Challenge - SOLUTION
1. Create a column with the number of positive words, and another with the proportion of positive words
2. Create a column with the number of negative words, and another with the proportion of negative words
3. Print the average proportion of negative and positive words by genre
4. Compare this to the average score by genre

*Note:* You won't be able to do this challenge (or anything else in this section) if you didn't complete the first challenge above to preprocess `df['body']` into `df['body_tokens']`. If you skipped that part or got stuck, copy and run the solution from `solutions/dictionary-methods-solutions.ipynb` before moving on.

In [10]:
# Best way to do this: A list comprehension in an apply statement!
df['pos_num'] = df['body_tokens'].apply(lambda x: len([word for word in x if word in positive_words]))
df['neg_num'] = df['body_tokens'].apply(lambda x: len([word for word in x if word in negative_words]))

In [11]:
# Another way: Create functions to count matches!
# Not really necessary for a case this simple, but easy to modify for more complex cases.

def count_pos_words(tokens):
    '''
    Counts number of positive words in a preprocessed (tokenized, etc.) text, 
    using already-defined positive sentiment dictionary `positive_words`. 
    
    Args:
        tokens: preprocessed, tokenized input text
        positive_words: global var, list of positive sentiment words (sentiment dictionary)
    Returns:
        count: number of times any word from positive_words occurs in tokens (input text)
    '''
    
    global positive_words
    
    count = 0 # initialize counter
    
    # Loop over input text; if word occurs in dictionary, add 1 to count
    for word in tokens:
        if word in positive_words:
            count += 1
    
    return count
    
def count_neg_words(tokens):
    '''
    Counts number of negative words in a preprocessed (tokenized, etc.) text, 
    using already-defined negative sentiment dictionary `negative_words`.
    
    Args:
        tokens: preprocessed, tokenized input text
        negative_words: global var, list of negative sentiment words (sentiment dictionary)
    Returns:
        count: number of times any word from negative_words occurs in tokens (input text)
    '''
    
    global negative_words
    
    count = 0 # initialize counter
    
    # Loop over input text; if word occurs in dictionary, add 1 to count
    for word in tokens:
        if word in negative_words:
            count += 1
    
    return count

# Use the functions via df[col].apply()
df['pos_num'] = df['body_tokens'].apply(count_pos_words)
df['neg_num'] = df['body_tokens'].apply(count_neg_words)

In [12]:
df['pos_prop'] = df['pos_num']/df['token_count']
df['neg_prop'] = df['neg_num']/df['token_count']
df['pos_lean'] = df['pos_prop']-df['neg_prop']
df.drop('release_date', axis=1, inplace=True)
df

Unnamed: 0,album,artist,genre,critic,score,body,body_tokens,token_count,pos_num,neg_num,pos_prop,neg_prop,pos_lean
0,Don't Panic,All Time Low,Pop/Rock,Kerrang!,74.0,While For Baltimore proves they can still writ...,"[while, for, baltimore, proves, they, can, sti...",38,1,0,0.026316,0.000000,0.026316
1,Fear and Saturday Night,Ryan Bingham,Country,Uncut,70.0,There's nothing fake about the purgatorial nar...,"[there, 's, nothing, fake, about, the, purgato...",28,0,3,0.000000,0.107143,-0.107143
2,The Way I'm Livin',Lee Ann Womack,Country,Q Magazine,84.0,All life's disastrous lows are here on a caree...,"[all, life, 's, disastrous, lows, are, here, o...",13,0,1,0.000000,0.076923,-0.076923
3,Doris,Earl Sweatshirt,Rap,Pitchfork,82.0,"With Doris, Odd Future’s Odysseus is finally b...","[with, doris, odd, future, ’, s, odysseus, is,...",18,0,1,0.000000,0.055556,-0.055556
4,Giraffe,Echoboy,Rock,AllMusic,71.0,Though Giraffe is definitely Echoboy's most im...,"[though, giraffe, is, definitely, echoboy, 's,...",51,2,4,0.039216,0.078431,-0.039216
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,Outer South,Conor Oberst And The Mystic Valley Band,Indie,Slant Magazine,67.0,The result is an album that's unfortunately ba...,"[the, result, is, an, album, that, 's, unfortu...",27,0,3,0.000000,0.111111,-0.111111
4997,On An Island,David Gilmour,Rock,E! Online,67.0,"In the end, Island makes Dave sound like he's ...","[in, the, end, island, makes, dave, sound, lik...",17,3,0,0.176471,0.000000,0.176471
4998,Movement,Gossip,Indie,Uncut,81.0,Beth Ditto's remarkable gospel holler and ferv...,"[beth, ditto, 's, remarkable, gospel, holler, ...",25,2,0,0.080000,0.000000,0.080000
4999,Locked Down,Dr. John,Pop/Rock,PopMatters,86.0,"Dr. John is Dr. John. He's a star, and is on f...","[dr., john, is, dr., john, he, 's, a, star, an...",18,1,0,0.055556,0.000000,0.055556


In [14]:
grouped = df.groupby(by = 'genre')
grouped['pos_lean'].mean().sort_values(ascending=False)

genre
Folk                      0.073189
Jazz                      0.060588
R&B;                      0.050918
Indie                     0.048024
Alternative/Indie Rock    0.046710
Rock                      0.044023
Electronic                0.043299
Pop/Rock                  0.042071
Pop                       0.040085
Country                   0.039038
Dance                     0.037849
Rap                       0.036232
Name: pos_lean, dtype: float64

In [15]:
grouped['score'].mean().sort_values(ascending=False)

genre
Jazz                      77.631579
Folk                      75.900000
Indie                     74.400897
Country                   74.071429
Alternative/Indie Rock    73.928571
Electronic                73.140351
Pop/Rock                  73.033782
R&B;                      72.366071
Rap                       72.173554
Rock                      70.754292
Dance                     70.146341
Pop                       64.608054
Name: score, dtype: float64

In [16]:
genre_list = list(df['genre'].unique())

for genre in genre_list:
    pos_mean = df[df['genre']==genre]['pos_prop'].mean()
    neg_mean = df[df['genre']==genre]['neg_prop'].mean()
    score_mean = df[df['genre']==genre]['score'].mean()
    print("Average sentiment and score for", str(genre) + ":\n", 
          "pos", str(round(pos_mean, 4)) + ", neg", str(round(neg_mean, 4)) + ", score", str(round(score_mean, 1)))
    print()

Average sentiment and score for Pop/Rock:
 pos 0.0776, neg 0.0356, score 73.0

Average sentiment and score for Country:
 pos 0.0721, neg 0.0331, score 74.1

Average sentiment and score for Rap:
 pos 0.071, neg 0.0347, score 72.2

Average sentiment and score for Rock:
 pos 0.0802, neg 0.0362, score 70.8

Average sentiment and score for Indie:
 pos 0.0853, neg 0.0373, score 74.4

Average sentiment and score for Electronic:
 pos 0.0786, neg 0.0353, score 73.1

Average sentiment and score for Pop:
 pos 0.0697, neg 0.0296, score 64.6

Average sentiment and score for Folk:
 pos 0.0965, neg 0.0233, score 75.9

Average sentiment and score for R&B;:
 pos 0.0745, neg 0.0236, score 72.4

Average sentiment and score for Alternative/Indie Rock:
 pos 0.0806, neg 0.0339, score 73.9

Average sentiment and score for Dance:
 pos 0.0781, neg 0.0402, score 70.1

Average sentiment and score for Jazz:
 pos 0.0873, neg 0.0267, score 77.6



In [17]:
# Check out correlation (turns out to be rather weak):
import numpy as np
np.corrcoef(df['pos_lean'], df['score'])

array([[1.      , 0.076243],
       [0.076243, 1.      ]])

## 0.3. Sentiment analysis using scikit-learn

In [18]:
#import the function CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer()

#create our document term matrix as a pandas dataframe
dtm_df = pd.DataFrame(countvec.fit_transform(df.body).toarray(), columns=countvec.get_feature_names(), index = df.index)

In [19]:
#create a columns variable that is a list of all column names
columns = list(dtm_df)

In [20]:
#create a new variable that contains only column names that are in our postive words list
pos_columns = [word for word in columns if word in positive_words]

In [21]:
#create a dtm from our dtm_df that keeps only positive sentiment columns
dtm_pos = dtm_df[pos_columns]

In [22]:
#count the number of positive words for each document
dtm_pos['pos_count'] = dtm_pos.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_pos['pos_count'] = dtm_pos.sum(axis=1)


### Challenge - SOLUTION
1. Do the same for negative words.  
2. Calculate the proportion of negative and positive words for each document.

In [23]:
#create a new variable that contains only column names that are in our negative words list
neg_columns = [word for word in columns if word in negative_words]

#create a dtm from our dtm_df that keeps only negative sentiment columns
dtm_neg = dtm_df[neg_columns]

#count the number of negative words for each document
dtm_neg['neg_count'] = dtm_neg.sum(axis=1)

# Check out results
dtm_neg['neg_count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_neg['neg_count'] = dtm_neg.sum(axis=1)


0       0
1       3
2       1
3       1
4       4
       ..
4996    3
4997    0
4998    0
4999    0
5000    0
Name: neg_count, Length: 5001, dtype: int64

In [24]:
# Compute ratio
dtm_pos['pos_proportion'] = dtm_pos['pos_count']/dtm_df.sum(axis=1)
dtm_neg['neg_proportion'] = dtm_neg['neg_count']/dtm_df.sum(axis=1)

# Compare manual version with scikit learn CountVectorizer results:
print(dtm_pos['pos_proportion'])
df['pos_prop']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_pos['pos_proportion'] = dtm_pos['pos_count']/dtm_df.sum(axis=1)


0       0.030303
1       0.000000
2       0.000000
3       0.000000
4       0.046512
          ...   
4996    0.000000
4997    0.187500
4998    0.095238
4999    0.062500
5000    0.178571
Name: pos_proportion, Length: 5001, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_neg['neg_proportion'] = dtm_neg['neg_count']/dtm_df.sum(axis=1)


0       0.026316
1       0.000000
2       0.000000
3       0.000000
4       0.039216
          ...   
4996    0.000000
4997    0.176471
4998    0.080000
4999    0.055556
5000    0.178571
Name: pos_prop, Length: 5001, dtype: float64

# Part 1: Weighting dictionaries

## 1.1 Read concreteness score dictionary

In [25]:
con_score = pd.read_csv('../day-2/data/Concreteness_ratings_Brysbaert_et_al.csv')

## 1.2. Merging a DTM with a weighted dictionary

In [26]:
text_list = []
#open and read the novels, save them as variables
austen_string = open('../day-2/data/Austen_PrideAndPrejudice.txt', encoding='utf-8').read()
alcott_string = open('../day-2/data/Alcott_GarlandForGirls.txt', encoding='utf-8').read()

#append each novel to the list
text_list.append(austen_string)
text_list.append(alcott_string)

countvec = CountVectorizer(stop_words="english")

novels_df = pd.DataFrame(countvec.fit_transform(text_list).toarray(), columns=countvec.get_feature_names())

In [27]:
columns=list(novels_df)
columns_con = [word for word in columns if word in list(con_score['Word'])]

In [28]:
novels_df_con = novels_df[columns_con]

In [29]:
df = novels_df_con.transpose()
df.rename(columns={0: 'Austen', 1: 'Alcott'}, inplace=True)

In [30]:
#Rename the index 'Word', and reset the index, so the words become a column in our dataframe and we get a new index.
df.index.names = ['Word']
df.reset_index(inplace=True)

In [31]:
#merge with our dictionary dataframe, called 'con_score'
df = df.merge(con_score, on = 'Word')

## 1.3. Weighting term frequencies by the concreteness score

In [32]:
df['austen_con_score'] = df['Austen'] * df['Conc.M']
df['alcott_con_score'] = df['Alcott'] * df['Conc.M']

### Challenge - SOLUTION

Calculate and print the average concreteness score for each text. Careful! Think through this before you implement it. You want the average score, normalized over all the words in the text. 

*Hint:* Think about these two dataframes you have in memory: `df`, which has concreteness-scored words and their counts per novel; and `novels_df`, which has all words per novel.

In [33]:
# We'll devide the sum of the concreteness score by the total word count for each novel
print("Mean Concreteness for Austen's 'Pride and Prejudice'")
print(df['austen_con_score'].sum()/df['Austen'].sum())
print()
print("Mean Concreteness for Alcott's 'A Garland for Girls'")
print(df['alcott_con_score'].sum()/df['Alcott'].sum())

Mean Concreteness for Austen's 'Pride and Prejudice'
2.783289058278108

Mean Concreteness for Alcott's 'A Garland for Girls'
3.1534507874015745


### Challenge - SOLUTION
Print the most concrete and abstract terms in Austen and in Alcott. Don't worry about term frequencies; just look at the raw score of words present in each novel.<br>
*Hint:* You can't simply sort on the column `austen_con_score` and so on. Why not? What are your next steps?

In [34]:
#Create a new dataframe that keeps only words that have a non-zero value in Alcott
df_alcott = df[df['Alcott']>0]
#Sort on 'Conc.M' and print in descending order for most concrete words
df_alcott[['Word', 'Conc.M', 'Alcott']].sort_values(by=['Conc.M', 'Alcott'], ascending = False)

Unnamed: 0,Word,Conc.M,Alcott
2692,house,5.00,65
6033,water,5.00,32
470,bed,5.00,25
590,boots,5.00,17
2139,fish,5.00,17
...,...,...,...
1891,especially,1.28,12
5094,somewhat,1.28,5
4705,sanctimonious,1.28,1
2671,hope,1.25,40


In [35]:
#Create a new dataframe that keeps only words that have a non-zero value in Austen
df_austen = df[df['Austen']>0]
df_austen[['Word', 'Conc.M', 'Austen']].sort_values(by=['Conc.M', 'Austen'], ascending = False)

Unnamed: 0,Word,Conc.M,Austen
2692,house,5.00,108
413,ball,5.00,36
5198,stairs,5.00,24
470,bed,5.00,6
921,clock,5.00,6
...,...,...,...
2671,hope,1.25,121
23,absurdity,1.25,1
109,advantageously,1.24,2
2873,infinitely,1.22,4


# Bonus: Weighting words with TF-IDF<a id='tfidf'></a>

In [36]:
# Let's use our Music Reviews corpus for this. Read into Pandas DataFrame:
df = pd.read_csv("../day-2/data/BDHSI2016_music_reviews.csv", encoding='utf-8', sep = '\t')

# Clean out numbers:
df['body'] = df['body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

In [37]:
#import the function TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()

#create the dtm, but with cells weigthed by the tf-idf score.
tfidf_df = pd.DataFrame(tfidfvec.fit_transform(df['body']).toarray(), columns=tfidfvec.get_feature_names())

#view results
tfidf_df

Unnamed: 0,aa,aaaa,aahs,aaliyah,aaron,ab,abandon,abandoned,abandoning,abc,...,zone,zones,zoo,zooey,zoomer,zu,zydeco,álbum,être,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Identifying Distinctive Words

In [38]:
#creat dataset with document index and genre
df_genre = df['genre'].to_frame()

In [39]:
#merge this into the dtm_tfidf_df
merged_df = df_genre.join(tfidf_df, how = 'right', lsuffix='_x')

In [40]:
#pull out the reviews for three genres, Rap, Alternative/Indie Rock, and Jazz
dtm_rap = merged_df[merged_df['genre_x']=='Rap']
dtm_indie = merged_df[merged_df['genre_x']=='Alternative/Indie Rock']
dtm_jazz = merged_df[merged_df['genre_x']=='Jazz']

#print the words with the highest TF-IDF scores for each genre
print('Rap Words')
print(dtm_rap.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()
print('Indie Words')
print(dtm_indie.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()
print('Jazz Words')
print(dtm_jazz.max(numeric_only=True).sort_values(ascending=False)[0:20])

Rap Words
blank             0.854475
waste             0.755918
amiable           0.730963
awesomely         0.717079
joyless           0.687687
beastie           0.672439
same              0.672392
sucker            0.663760
vanguard          0.661978
tight             0.653993
lamest            0.639377
derivativeness    0.636271
authentic         0.627192
diverse           0.623373
sermon            0.621175
pushin            0.617699
mastermind        0.609213
neat              0.608922
we                0.600755
lift              0.591821
dtype: float64

Indie Words
underplayed    0.516717
prisoner       0.512087
jezabels       0.512087
careworn       0.509386
folk           0.509321
fourth         0.480502
heyday         0.469035
their          0.458950
riffed         0.458182
bet            0.456164
victory        0.449289
exhausted      0.445969
bigger         0.441849
babelfished    0.431543
lightweight    0.428857
exercised      0.428857
powerhouse     0.422192
worn          

### Challenge - SOLUTION

Compare the distinctive words for two artists in the data.

Note: the artists should have a number of reviews, so check your frequency counts to identify artists.

*Hint:* Copy and paste the above code and modify it as needed.

In [41]:
print(df['artist'][::250]) # Random look at artists to choose from

0             All Time Low
250        Fujiya & Miyagi
500                 Katy B
750            Negativland
1000                Tricky
1250        Emmylou Harris
1500                Shamir
1750              Thursday
2000                 Psapp
2250             Cornelius
2500                Vessel
2750                Clutch
3000              Ice Cube
3250              Autechre
3500            Kanye West
3750              The Fall
4000           Gaz Coombes
4250    Noah and the Whale
4500        Mumford & Sons
4750            Bobby Conn
5000      Stars Of The Lid
Name: artist, dtype: object


In [43]:
#creat dataset with document index and artist, merge
merged_df_artist = df['artist'].to_frame()
merged_df_artist = merged_df_artist.join(tfidf_df, how = 'right', lsuffix='_x')

#view result
merged_df_artist

Unnamed: 0,artist_x,aa,aaaa,aahs,aaliyah,aaron,ab,abandon,abandoned,abandoning,...,zone,zones,zoo,zooey,zoomer,zu,zydeco,álbum,être,über
0,All Time Low,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ryan Bingham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Lee Ann Womack,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Earl Sweatshirt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Echoboy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,Conor Oberst And The Mystic Valley Band,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,David Gilmour,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,Gossip,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,Dr. John,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Define artists to identify distinctive words for
artist1 = 'R.E.M.'
artist2 = 'Arcade Fire'

# Filter merged_df_artist to these two artists
dtm1 = merged_df_artist[merged_df_artist['artist_x']==artist1]
dtm2 = merged_df_artist[merged_df_artist['artist_x']==artist2]

# Display distinctive words
print("Most distinctive words for " + str(artist1))
print(dtm1.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()
print("Most distinctive words for " + str(artist2))
print(dtm2.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()

Most distinctive words for R.E.M.
reliably        0.579442
staid           0.550549
every           0.530261
isn             0.523994
unfussy         0.513744
crucially       0.459618
committed       0.434459
convincing      0.434459
fast            0.424265
collapse        0.421777
habit           0.410508
accelerate      0.410508
stun            0.391646
forming         0.391646
dec             0.376505
noncommittal    0.368986
beautiful       0.358367
mostly          0.352703
stutter         0.352486
stipe           0.352032
dtype: float64

Most distinctive words for Arcade Fire
disc           0.459815
reflektor      0.431429
jumping        0.423503
patterns       0.409032
features       0.408639
bitterness     0.408519
shorter        0.397541
radiates       0.389749
affection      0.389749
suburbs        0.377718
beguiling      0.374164
detox          0.373836
components     0.364664
divergence     0.363223
redeem         0.356659
paced          0.352743
letter         0.350524
div