In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim import models
from gensim.models import Word2Vec

In [10]:
# Load the per-decade lyric tables; the first CSV column is used as the index.
df1960s, df1970s, df1980s, df1990s, df2000s, df2010s = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'df1960s.csv',
        'df1970s.csv',
        'df1980s.csv',
        'df1990s.csv',
        'df2000s.csv',
        'df2010s.csv',
    )
]

In [11]:
def data_cleaning(decade):
    """Tokenize the 'Lyrics' column of one decade's DataFrame.

    Parameters
    ----------
    decade : pandas.DataFrame
        Table with a 'Lyrics' column; each row is treated as one song.

    Returns
    -------
    list of list
        One token list per row, in row order. Rows whose lyrics are
        missing (NaN/None) yield an empty list, so the result stays
        aligned with the DataFrame rows.
    """
    songs = []
    # Iterate the column directly: only 'Lyrics' is needed, so there is no
    # reason to materialise a full row Series per song via iterrows().
    for lyrics in decade['Lyrics']:
        if pd.isna(lyrics):
            # str(NaN) is the literal text 'nan'; tokenizing it would inject
            # a bogus 'nan' word into the corpus for every missing lyric.
            songs.append([])
        else:
            songs.append(word_tokenize(str(lyrics)))
    return songs

In [12]:
# Tokenize every decade's lyrics; each corpus is a list of token lists.
(
    corpus1960s,
    corpus1970s,
    corpus1980s,
    corpus1990s,
    corpus2000s,
    corpus2010s,
) = map(
    data_cleaning,
    (df1960s, df1970s, df1980s, df1990s, df2000s, df2010s),
)

In [13]:
# Sanity check: data_cleaning returns one token list per DataFrame row,
# so this is the number of rows loaded for the 1960s.
len(corpus1960s)

472

In [14]:
# Train one skip-gram (sg=1) Word2Vec model per decade, all with the same
# hyperparameters so the resulting embeddings are configured identically.
model1960, model1970, model1980, model1990, model2000, model2010 = (
    Word2Vec(sentences=decade_corpus, window=4, min_count=5, workers=4, sg=1)
    for decade_corpus in (
        corpus1960s,
        corpus1970s,
        corpus1980s,
        corpus1990s,
        corpus2000s,
        corpus2010s,
    )
)

In [15]:
def corpusToSet(corpus):
    """Return the set of distinct words found across all sentences of `corpus`.

    `corpus` is an iterable of sentences, each sentence being an iterable of
    (hashable) words. An empty corpus gives an empty set.
    """
    return {token for tokens in corpus for token in tokens}

In [16]:
# Collect the distinct tokens seen in each decade's corpus.
(
    wordBank60s,
    wordBank70s,
    wordBank80s,
    wordBank90s,
    wordBank00s,
    wordBank10s,
) = map(
    corpusToSet,
    (corpus1960s, corpus1970s, corpus1980s, corpus1990s, corpus2000s, corpus2010s),
)

In [17]:
def getDistances(model, bank, key):
    """Rank the words of `bank` by their embedding distance from `key`.

    Parameters
    ----------
    model : object exposing a gensim-style `wv` attribute
        `model.wv.key_to_index` is used for vocabulary membership and
        `model.wv.distance(word, key)` for the distance itself.
    bank : iterable of str
        Candidate words. Words absent from the model's vocabulary are
        silently skipped.
    key : str
        Reference word. It is passed straight to `model.wv.distance`, so
        whatever that call raises for an unknown word propagates
        (presumably KeyError in gensim -- confirm against the installed
        version).

    Returns
    -------
    dict
        Maps each in-vocabulary bank word to its distance from `key`,
        with insertion order going from smallest to largest distance.
    """
    # key_to_index is a dict, so each membership test is a hash lookup.
    # Testing against the index_to_key list instead scans the whole
    # vocabulary for every bank word (quadratic in vocabulary size).
    vocabulary = model.wv.key_to_index
    distances = {
        word: model.wv.distance(word, key)
        for word in bank
        if word in vocabulary
    }
    # dicts preserve insertion order, so building from the sorted pairs
    # yields a mapping ordered closest-first.
    return dict(sorted(distances.items(), key=lambda item: item[1]))
    

In [28]:
# Rank every in-vocabulary 2010s word by its distance from 'man'
# (getDistances returns the mapping sorted closest-first).
getDistances(model2010, wordBank10s, 'man')

{'man': 0.0,
 'yes': 0.1573338508605957,
 'ahead': 0.16288751363754272,
 'everybody': 0.23179316520690918,
 'pipe': 0.23212170600891113,
 'sugar': 0.2513575553894043,
 'well': 0.2518840432167053,
 'dawg': 0.2713460326194763,
 'brand': 0.2761487364768982,
 'tell': 0.2861102223396301,
 'insane': 0.28628993034362793,
 'band': 0.2876027822494507,
 'downtown': 0.2887535095214844,
 'hella': 0.2939339280128479,
 'mile': 0.2978677749633789,
 'bro': 0.30940520763397217,
 'course': 0.3113510012626648,
 'type': 0.31144946813583374,
 'aha': 0.31524187326431274,
 'poppin': 0.31870031356811523,
 'satan': 0.32040196657180786,
 'saying': 0.32352185249328613,
 'wantin': 0.32426518201828003,
 'dog': 0.3253817558288574,
 'brain': 0.3259628415107727,
 'plus': 0.3265266418457031,
 'buy': 0.32683300971984863,
 'dick': 0.3284595012664795,
 'talking': 0.3297070860862732,
 'front': 0.3299659490585327,
 'double': 0.33027714490890503,
 'pump': 0.33096522092819214,
 'thanks': 0.3326462507247925,
 'lipstick': 0.33