# Dictionary Methods

# Part 0: Basic dictionary methods

## 0.1 Pre-processing

First, read in our Music Reviews corpus as a Pandas dataframe.

In [1]:
#import the necessary packages
import pandas as pd
import nltk
from nltk import word_tokenize
import string

#read the Music Reviews corpus into a Pandas dataframe
df = pd.read_csv("../day-2/data/BDHSI2016_music_reviews.csv", encoding='utf-8', sep = '\t')

#view the dataframe
df

Unnamed: 0,album,artist,genre,release_date,critic,score,body
0,Don't Panic,All Time Low,Pop/Rock,2012-10-09 00:00:00,Kerrang!,74.0,While For Baltimore proves they can still writ...
1,Fear and Saturday Night,Ryan Bingham,Country,2015-01-20 00:00:00,Uncut,70.0,There's nothing fake about the purgatorial nar...
2,The Way I'm Livin',Lee Ann Womack,Country,2014-09-23 00:00:00,Q Magazine,84.0,All life's disastrous lows are here on a caree...
3,Doris,Earl Sweatshirt,Rap,2013-08-20 00:00:00,Pitchfork,82.0,"With Doris, Odd Future’s Odysseus is finally b..."
4,Giraffe,Echoboy,Rock,2003-02-25 00:00:00,AllMusic,71.0,Though Giraffe is definitely Echoboy's most im...
...,...,...,...,...,...,...,...
4996,Outer South,Conor Oberst And The Mystic Valley Band,Indie,2009-05-05 00:00:00,Slant Magazine,67.0,The result is an album that's unfortunately ba...
4997,On An Island,David Gilmour,Rock,2006-03-07 00:00:00,E! Online,67.0,"In the end, Island makes Dave sound like he's ..."
4998,Movement,Gossip,Indie,2003-05-06 00:00:00,Uncut,81.0,Beth Ditto's remarkable gospel holler and ferv...
4999,Locked Down,Dr. John,Pop/Rock,2012-04-03 00:00:00,PopMatters,86.0,"Dr. John is Dr. John. He's a star, and is on f..."


The next step is to create a new column in our dataset that contains tokenized words with all the pre-processing steps.

In [2]:
#first create a new column called "body_tokens" and transform to lowercase by applying the string function str.lower()
df['body'] = df['body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
df['body_tokens'] = df['body'].str.lower()

In [3]:
#tokenize
df['body_tokens'] = df['body_tokens'].apply(nltk.word_tokenize)

In [4]:
punctuations = list(string.punctuation)

#remove punctuation. Let's talk about that lambda x.
df['body_tokens'] = df['body_tokens'].apply(lambda x: [word for word in x if word not in punctuations])

In [5]:
df['token_count'] = df['body_tokens'].apply(lambda x: len(x))

## 0.2. Creating dictionary counts

In [6]:
pos_sent = open("../day-2/data/positive_words.txt", encoding='utf-8').read()
neg_sent = open("../day-2/data/negative_words.txt", encoding='utf-8').read()

In [7]:
#remember the split function? We'll split on the newline character (\n) to create a list
positive_words=pos_sent.split('\n')
negative_words=neg_sent.split('\n')

In [8]:
#count number of words in each list
print(len(positive_words))
print(len(negative_words))

2231
3906


### Challenge
1. Create a column with the number of positive words, and another with the proportion of positive words
2. Create a column with the number of negative words, and another with the proportion of negative words
3. Print the average proportion of negative and positive words by genre
4. Compare this to the average score by genre

In [9]:
df['pos_num'] = df['body_tokens'].apply(lambda x: len([word for word in x if word in positive_words]))
df['neg_num'] = df['body_tokens'].apply(lambda x: len([word for word in x if word in negative_words]))

df['pos_prop'] = df['pos_num']/df['token_count']
df['neg_prop'] = df['neg_num']/df['token_count']
df.drop('release_date', axis=1, inplace=True)
df

Unnamed: 0,album,artist,genre,critic,score,body,body_tokens,token_count,pos_num,neg_num,pos_prop,neg_prop
0,Don't Panic,All Time Low,Pop/Rock,Kerrang!,74.0,While For Baltimore proves they can still writ...,"[while, for, baltimore, proves, they, can, sti...",38,1,0,0.026316,0.000000
1,Fear and Saturday Night,Ryan Bingham,Country,Uncut,70.0,There's nothing fake about the purgatorial nar...,"[there, 's, nothing, fake, about, the, purgato...",28,0,3,0.000000,0.107143
2,The Way I'm Livin',Lee Ann Womack,Country,Q Magazine,84.0,All life's disastrous lows are here on a caree...,"[all, life, 's, disastrous, lows, are, here, o...",13,0,1,0.000000,0.076923
3,Doris,Earl Sweatshirt,Rap,Pitchfork,82.0,"With Doris, Odd Future’s Odysseus is finally b...","[with, doris, odd, future, ’, s, odysseus, is,...",18,0,1,0.000000,0.055556
4,Giraffe,Echoboy,Rock,AllMusic,71.0,Though Giraffe is definitely Echoboy's most im...,"[though, giraffe, is, definitely, echoboy, 's,...",51,2,4,0.039216,0.078431
...,...,...,...,...,...,...,...,...,...,...,...,...
4996,Outer South,Conor Oberst And The Mystic Valley Band,Indie,Slant Magazine,67.0,The result is an album that's unfortunately ba...,"[the, result, is, an, album, that, 's, unfortu...",27,0,3,0.000000,0.111111
4997,On An Island,David Gilmour,Rock,E! Online,67.0,"In the end, Island makes Dave sound like he's ...","[in, the, end, island, makes, dave, sound, lik...",17,3,0,0.176471,0.000000
4998,Movement,Gossip,Indie,Uncut,81.0,Beth Ditto's remarkable gospel holler and ferv...,"[beth, ditto, 's, remarkable, gospel, holler, ...",25,2,0,0.080000,0.000000
4999,Locked Down,Dr. John,Pop/Rock,PopMatters,86.0,"Dr. John is Dr. John. He's a star, and is on f...","[dr., john, is, dr., john, he, 's, a, star, an...",18,1,0,0.055556,0.000000


In [10]:
grouped = df.groupby('genre')
grouped['pos_prop'].mean().sort_values(ascending=False)

genre
Folk                      0.096497
Jazz                      0.087290
Indie                     0.085293
Alternative/Indie Rock    0.080572
Rock                      0.080200
Electronic                0.078628
Dance                     0.078059
Pop/Rock                  0.077627
R&B;                      0.074498
Country                   0.072140
Rap                       0.070954
Pop                       0.069679
Name: pos_prop, dtype: float64

In [11]:
grouped['score'].mean().sort_values(ascending=False)

genre
Jazz                      77.631579
Folk                      75.900000
Indie                     74.400897
Country                   74.071429
Alternative/Indie Rock    73.928571
Electronic                73.140351
Pop/Rock                  73.033782
R&B;                      72.366071
Rap                       72.173554
Rock                      70.754292
Dance                     70.146341
Pop                       64.608054
Name: score, dtype: float64

## 0.3. Sentiment analysis using scikit-learn

In [12]:
#import the function CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer()

#create our document term matrix as a pandas dataframe
dtm_df = pd.DataFrame(countvec.fit_transform(df.body).toarray(), columns=countvec.get_feature_names(), index = df.index)

In [13]:
#create a columns variable that is a list of all column names
columns = list(dtm_df)

In [14]:
#create a new variable that contains only column names that are in our postive words list
pos_columns = [word for word in columns if word in positive_words]

In [15]:
#create a dtm from our dtm_df that keeps only positive sentiment columns
dtm_pos = dtm_df[pos_columns]

In [16]:
#count the number of positive words for each document
dtm_pos['pos_count'] = dtm_pos.sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_pos['pos_count'] = dtm_pos.sum(axis=1)


### Challenge
1. Do the same for negative words.  
2. Calculate the proportion of negative and positive words for each document.

In [17]:
neg_columns = [word for word in columns if word in negative_words]
dtm_neg = dtm_df[neg_columns]

dtm_neg['neg_count'] = dtm_neg.sum(axis=1)
dtm_neg['neg_count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_neg['neg_count'] = dtm_neg.sum(axis=1)


0       0
1       3
2       1
3       1
4       4
       ..
4996    3
4997    0
4998    0
4999    0
5000    0
Name: neg_count, Length: 5001, dtype: int64

In [18]:
dtm_pos['pos_proportion'] = dtm_pos['pos_count']/dtm_df.sum(axis=1)
print(dtm_pos['pos_proportion'])
df['pos_prop']

0       0.030303
1       0.000000
2       0.000000
3       0.000000
4       0.046512
          ...   
4996    0.000000
4997    0.187500
4998    0.095238
4999    0.062500
5000    0.178571
Name: pos_proportion, Length: 5001, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dtm_pos['pos_proportion'] = dtm_pos['pos_count']/dtm_df.sum(axis=1)


0       0.026316
1       0.000000
2       0.000000
3       0.000000
4       0.039216
          ...   
4996    0.000000
4997    0.176471
4998    0.080000
4999    0.055556
5000    0.178571
Name: pos_prop, Length: 5001, dtype: float64

# Part 1: Weighting dictionaries

## 1.1 Read concreteness score dictionary

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

con_score = pd.read_csv('../day-2/data/Concreteness_ratings_Brysbaert_et_al.csv')

We can see the most concrete and most abstract words by sorting on `Conc.M`.

In [20]:
con_score[['Word','Conc.M']].sort_values(by='Conc.M',ascending=False)

Unnamed: 0,Word,Conc.M
2547,bat,5.00
10689,eagle,5.00
30740,shawl,5.00
36046,umbrella,5.00
2526,basket,5.00
...,...,...
39703,would,1.12
32378,spirituality,1.07
941,although,1.07
10905,eh,1.04


In [21]:
con_score[['Word','Conc.M']].sort_values(by='Conc.M',ascending=True)

Unnamed: 0,Word,Conc.M
10905,eh,1.04
11618,essentialness,1.04
32378,spirituality,1.07
941,although,1.07
39703,would,1.12
...,...,...
25452,pick-up truck,5.00
6160,comb,5.00
6476,computer mouse,5.00
7132,cookie,5.00


## 1.2. Merging a DTM with a weighted dictionary

In [22]:
text_list = []
#open and read the novels, save them as variables
austen_string = open('../day-2/data/Austen_PrideAndPrejudice.txt', encoding='utf-8').read()
alcott_string = open('../day-2/data/Alcott_GarlandForGirls.txt', encoding='utf-8').read()

#append each novel to the list
text_list.append(austen_string)
text_list.append(alcott_string)

countvec = CountVectorizer(stop_words="english")

novels_df = pd.DataFrame(countvec.fit_transform(text_list).toarray(), columns=countvec.get_feature_names())

Next, we'll take a subset of the DTM, keeping only the intersection between the words in our corpus and the word in the dictionary.

In [23]:
columns=list(novels_df)
columns_con = [word for word in columns if word in list(con_score['Word'])]

In [24]:
novels_df_con = novels_df[columns_con]

Next, transpose the matrix, rename the column, and merge with the dictionary dataframe.

In [25]:
df = novels_df_con.transpose()

In [26]:
df.rename(columns={0: 'Austen', 1: 'Alcott'}, inplace=True)

In [27]:
#Rename the index 'Word', and reset the index, so the words become a column in our dataframe and we get a new index.
df.index.names = ['Word']
df.reset_index(inplace=True)

In [28]:
#merge with our dictionary dataframe, called 'con_score'
df = df.merge(con_score, on = 'Word')

## 1.3. Weighting term frequencies by the concreteness score

Now we can weight the term frquency cells by the concreteness score, by multiplying the frequency count column by the concreteness score column.

In [29]:
df['austen_con_score'] = df['Austen'] * df['Conc.M']

In [30]:
df['alcott_con_score'] = df['Alcott'] * df['Conc.M']

### Challenge

Calculate and print the average concreteness score for each text. Careful! Think through this before you implement it. You want the average score, normalized over all the words in the text. 

In [31]:
#we'll devide the sum of the concreteness score by the total word count for each novel
print("Mean Concreteness for Austen's 'Pride and Prejudice'")
print(df['austen_con_score'].sum()/df['Austen'].sum())
print()
print("Mean Concreteness for Alcott's 'A Garland for Girls'")
print(df['alcott_con_score'].sum()/df['Alcott'].sum())

Mean Concreteness for Austen's 'Pride and Prejudice'
2.783289058278108

Mean Concreteness for Alcott's 'A Garland for Girls'
3.1534507874015745


## 1.4. Assessing the difference

So there is a difference, but what does it mean? What is the magnitude of the difference?

We can look at the difference between the two means as a percent difference based on the scale range. We can calculate this using simple math.

In [32]:
#first find the difference between the means by substracting one from the other
3.1534507874-2.78328905828

0.37016172912000034

In [33]:
#Find the range of concreteness scores
print(df['Conc.M'].min())
print(df['Conc.M'].max())

1.17
5.0


In [34]:
#The scale range
df['Conc.M'].max() - df['Conc.M'].min()

3.83

In [35]:
#Calculate the difference of means as a percent of this range
(0.37/3.83)* 100

9.660574412532636

### Challenge
Print the most concrete and abstract terms in Austen and in Alcott.  
*Hint:* You can't simply sort on the column `austen_con_score` and so on. Why not? What are your next steps?

In [36]:
#Create a new dataframe that keeps only words that have a non-zero value in Alcott
df_alcott = df[df['Alcott']>0]
#Sort on 'Conc.M' and pring in descending order for most concrete words
df_alcott[['Word', 'Conc.M', 'Alcott']].sort_values(by=['Conc.M', 'Alcott'], ascending = False)

Unnamed: 0,Word,Conc.M,Alcott
2692,house,5.00,65
6033,water,5.00,32
470,bed,5.00,25
590,boots,5.00,17
2139,fish,5.00,17
...,...,...,...
1891,especially,1.28,12
5094,somewhat,1.28,5
4705,sanctimonious,1.28,1
2671,hope,1.25,40


In [37]:
#Create a new dataframe that keeps only words that have a non-zero value in Austen
df_austen = df[df['Austen']>0]
df_austen[['Word', 'Conc.M', 'Austen']].sort_values(by=['Conc.M', 'Austen'], ascending = False)

Unnamed: 0,Word,Conc.M,Austen
2692,house,5.00,108
413,ball,5.00,36
5198,stairs,5.00,24
470,bed,5.00,6
921,clock,5.00,6
...,...,...,...
2671,hope,1.25,121
23,absurdity,1.25,1
109,advantageously,1.24,2
2873,infinitely,1.22,4


# Bonus: Weighting words with TF-IDF<a id='tfidf'></a>

In [38]:
# Let's use our Music Reviews corpus for this. Read into Pandas DataFrame:
df = pd.read_csv("../day-2/data/BDHSI2016_music_reviews.csv", encoding='utf-8', sep = '\t')

# Clean out numbers:
df['body'] = df['body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

In [39]:
#import the function TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer()

#create the dtm, but with cells weigthed by the tf-idf score.
tfidf_df = pd.DataFrame(tfidfvec.fit_transform(df['body']).toarray(), columns=tfidfvec.get_feature_names())

#view results
tfidf_df

Unnamed: 0,aa,aaaa,aahs,aaliyah,aaron,ab,abandon,abandoned,abandoning,abc,...,zone,zones,zoo,zooey,zoomer,zu,zydeco,álbum,être,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Identifying Distinctive Words

In [40]:
#creat dataset with document index and genre
df_genre = df['genre'].to_frame()

In [41]:
#merge this into the dtm_tfidf_df
merged_df = df_genre.join(tfidf_df, how = 'right', lsuffix='_x')

In [42]:
#pull out the reviews for three genres, Rap, Alternative/Indie Rock, and Jazz
dtm_rap = merged_df[merged_df['genre_x']=="Rap"]
dtm_indie = merged_df[merged_df['genre_x']=="Alternative/Indie Rock"]
dtm_jazz = merged_df[merged_df['genre_x']=="Jazz"]

#print the words with the highest tf-idf scores for each genre
print("Rap Words")
print(dtm_rap.max)

Rap Words
<bound method DataFrame.max of      genre_x   aa  aaaa  aahs  aaliyah  aaron   ab  abandon  abandoned  \
3        Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
18       Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
24       Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
33       Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
42       Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
...      ...  ...   ...   ...      ...    ...  ...      ...        ...   
4958     Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
4960     Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
4964     Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
4991     Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   
4995     Rap  0.0   0.0   0.0      0.0    0.0  0.0      0.0        0.0   

      abandoning  ...  zone  zones  zoo  zooey  zoomer   zu  zydeco  á

### Challenge

Compare the distinctive words for two artists in the data.

Note: the artists should have a number of reviews, so check your frequency counts to identify artists.

HINT: Copy and paste the above code and modify it as needed.

In [43]:
df_artist = df['artist'].to_frame()
merged_df_artist = df_artist.join(tfidf_df, how = 'right', lsuffix='_x')

#view result

dtm1 = merged_df_artist[merged_df_artist['artist_x']=="R.E.M."]
dtm2 = merged_df_artist[merged_df_artist['artist_x']=="Arcade Fire"]
print("REM")
print(dtm1.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()
print("Arcade Fire")
print(dtm2.max(numeric_only=True).sort_values(ascending=False)[0:20])
print()

REM
reliably        0.579442
staid           0.550549
every           0.530261
isn             0.523994
unfussy         0.513744
crucially       0.459618
committed       0.434459
convincing      0.434459
fast            0.424265
collapse        0.421777
habit           0.410508
accelerate      0.410508
stun            0.391646
forming         0.391646
dec             0.376505
noncommittal    0.368986
beautiful       0.358367
mostly          0.352703
stutter         0.352486
stipe           0.352032
dtype: float64

Arcade Fire
disc           0.459815
reflektor      0.431429
jumping        0.423503
patterns       0.409032
features       0.408639
bitterness     0.408519
shorter        0.397541
radiates       0.389749
affection      0.389749
suburbs        0.377718
beguiling      0.374164
detox          0.373836
components     0.364664
divergence     0.363223
redeem         0.356659
paced          0.352743
letter         0.350524
divergent      0.345035
double         0.336293
proposition 