# Network of Words

In [1]:
from nltk import word_tokenize,sent_tokenize,Text,PorterStemmer,WordNetLemmatizer,pos_tag
import pandas as pd
import numpy as np
import networkx as nx
import math
import matplotlib.pyplot as plt
import spacy
from spellchecker import SpellChecker
import collections as clt
import time
from community import best_partition
import pickle
import math
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import Parallel, delayed

In [2]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub('&lt;/?.*?&gt;',' &lt;&gt; ', text)
    text = re.sub('(\\d|\\W)+',' ',text)
    return text

In [3]:
df = pd.read_csv('reviews_full_v7.csv')
df = df[df['Score'] >= 75]
plt.rcParams["figure.figsize"] = (10,7) #defining size of plots
sp = spacy.load('en_core_web_sm')
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,User,Date,Year,Month,Release,Band,Score,Link,Album_link,Review_title,Genre,Text,Sentiment,Band_Genre,Combined_Genre
0,SpookyApparition,July 11 2002,2002,7,Chaosphere,Meshuggah,82,https://www.metal-archives.com/reviews/Meshugg...,https://www.metal-archives.com/albums/Meshugga...,Technical metal at it's finest.,technical thrash metal|math metal|progressive ...,I originally bought Chaosphere in late 2000......,0.9728,groove metal|thrash metal,technical thrash metal|math metal|progressive ...
1,SpookyApparition,July 14 2002,2002,7,Necroticism_-_Descanting_the_Insalubrious,Carcass,88,https://www.metal-archives.com/reviews/Carcass...,https://www.metal-archives.com/albums/Carcass/...,Carcass at their finest.,grindcore|melodic death metal,A midpoint between Carcass' early grind albums...,0.8479,grindcore|death metal,grindcore|melodic death metal
2,MarkRyan,July 15 2002,2002,7,Angels_Fall_First,Nightwish,90,https://www.metal-archives.com/reviews/Nightwi...,https://www.metal-archives.com/albums/Nightwis...,On par with their other stuff...,symphonic power metal,"I'm not hard-core into metal at all, really, b...",0.9951,symphonic metal|power metal,symphonic power metal
3,Fear,July 16 2002,2002,7,Dark_Genesis,Iced_Earth,87,https://www.metal-archives.com/reviews/Iced_Ea...,https://www.metal-archives.com/albums/Iced_Ear...,Brilliance in Repackaging,heavy metal|us power metal,"This is not a new album, but in fact, five alb...",0.9976,thrash metal|power metal,heavy metal|us power metal
4,Demented666,July 18 2002,2002,7,Extension_of_the_Wish,Andromeda,100,https://www.metal-archives.com/reviews/Androme...,https://www.metal-archives.com/albums/Andromed...,A smashing debut.,progressive metal,Andromeda's Extension of the Wish- clearly one...,0.8619,progressive metal,progressive metal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64636,TheStormIRide,June 30 2017,2017,6,Flying_Above_Ancient_Ruins,Krolok,85,https://www.metal-archives.com/reviews/Krolok/...,https://www.metal-archives.com/albums/Krolok/F...,Feverish Nightmares of Yore,atmospheric black metal,"Formed in 2011 by HV, Krolok is an atmospheric...",0.5875,black metal,atmospheric black metal
64637,6CORPSE6GRINDER6,June 30 2017,2017,6,Chemical_Assault,Violator,85,https://www.metal-archives.com/reviews/Violato...,https://www.metal-archives.com/albums/Violator...,"Take your life back, in the pit!",thrash metal,I don't have any issues with the retro-thrash ...,0.9976,thrash metal,thrash metal
64638,6CORPSE6GRINDER6,June 30 2017,2017,6,Violent_Mosh,Violator,80,https://www.metal-archives.com/reviews/Violato...,https://www.metal-archives.com/albums/Violator...,Officers are dirtier than the criminals they c...,thrash metal,This band was so important for the thrash meta...,0.9911,thrash metal,thrash metal
64639,Wacke,June 30 2017,2017,6,Resurrection,Chimaira,89,https://www.metal-archives.com/reviews/Chimair...,https://www.metal-archives.com/albums/Chimaira...,What doesn't kill us makes us stronger.,hard rock,Chimaira is arguably the most overlooked and/o...,0.9947,metalcore|groove metal,hard rock


In [4]:
preprocess_text(df['Text'][0])

'i originally bought chaosphere in late and hated it i could rarely sit through the entire album and regretted paying for it all of the songs sounded the same without any variation but as time has passed i ve grown to appreciate it more and more the technicality is truly astounding outshining by and large the entire metal genre thordendahl and haake are among the top performers at their respective instruments guitars drums and the remaining members are more than competent the bass playing is more evident than on most metal albums and along with the guitars it creates one huge crunching rhythm while haake often blasts away in a different time signature on his kit kidman s vocal lines tie in with the music better than just about any album i can think of although i can t describe it as well as i would like he stresses syllables on words on certain drum hits and lots of small things like that which make for a very technical vocal performance something not often seen in extreme metal and la

In [5]:
def isNaN(x):
    return x != x

In [6]:
docs = []
for i,row in df.iterrows():
    if i % 10000 == 0:
        print(i)
    if isNaN(row['Review_title']):
        docs.append(preprocess_text(row['Text']))
    else:
        docs.append(preprocess_text(row['Review_title'] + ' ' + row['Text']))

0
10000
20000
30000
40000
50000
60000


In [7]:
all_stopwords = sp.Defaults.stop_words
cv = CountVectorizer(max_df = 0.85, stop_words = all_stopwords)
word_count_vector = cv.fit_transform(docs)



In [8]:
print(list(cv.vocabulary_.keys())[:30])

['technical', 'finest', 'originally', 'bought', 'chaosphere', 'late', 'hated', 'rarely', 'sit', 'entire', 'album', 'regretted', 'paying', 'songs', 'sounded', 'variation', 'time', 'passed', 've', 'grown', 'appreciate', 'technicality', 'truly', 'astounding', 'outshining', 'large', 'genre', 'thordendahl', 'haake', 'performers']


In [9]:
tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [10]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
def get_keywords(doc):
    feature_names=cv.get_feature_names()
    tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
    sorted_coo = sort_coo(tf_idf_vector.tocoo())[:10]
    keywords = []
    for idx, score in sorted_coo:
        keywords.append(feature_names[idx])
    return keywords

In [11]:
a = time.time()
B = nx.Graph()
vocab = set()
spell = SpellChecker()
for i,row in df.iterrows():
    if i%1000 == 0:
        print(i,time.time() - a)
    title = row['Review_title']
    text = row['Text']
    album = row['Release'] + 'A'
    if isNaN(title):
        doc = text
    else:
        doc = title + ' ' + text
    keywords = get_keywords(doc)
    for word in keywords:
        if word.isalpha() and len(word) > 2 and len(word) < 15 and len(spell.unknown([word])) == 0:
            vocab.add(word)
            if not B.has_edge(album, word):
                B.add_edge(album, word, weight = 1)
            else:
                B[album][word]['weight'] += 1
time.time() - a

0 0.32381558418273926
1000 224.23269605636597
2000 436.9054379463196
3000 652.1290891170502
4000 868.1121191978455
5000 1090.0146985054016
6000 1296.1722781658173
7000 1487.9478433132172
8000 1676.2917518615723
9000 1852.3872928619385
10000 2029.688688993454
11000 2188.811962366104
12000 2343.478875875473
13000 2511.763131380081
14000 2710.3171384334564
15000 2920.6765189170837
16000 3142.1389985084534
17000 3420.685910463333
18000 3662.0740954875946
19000 3906.6475105285645
20000 4149.9390432834625
21000 4397.129055976868
22000 4647.365299701691
23000 4881.181287765503
24000 5126.745674133301
25000 5358.352063179016
26000 5662.329636096954
27000 5933.30647277832
28000 6180.371684551239
29000 6419.242359876633
30000 6659.76935839653
31000 6899.639287471771
32000 7144.588644504547
33000 7403.347316741943
34000 7644.694055557251
35000 7862.3085470199585
36000 8072.180876016617
37000 8228.364884376526
38000 8384.738859415054
39000 8540.455598592758
40000 8696.998180627823
41000 8852.91483

13476.839465379715

In [12]:
genre_group = pickle.load(open('genre_group.p','rb'))

In [13]:
G = nx.Graph() #eliminates lone words
for word in vocab:
    for album in B.adj[word]:
        for word1 in B.adj[album]:
            if word < word1: #order is important to avoid repeats
                if not G.has_edge(word,word1):
                    G.add_edge(word, word1, weight = 0)
                G[word][word1]['weight'] += 1

In [14]:
for u,v in G.edges():
    if G[u][v]['weight'] <= 1:
        G.remove_edge(u,v)
len([(u,v) for u,v in G.edges()])
G.remove_nodes_from(list(nx.isolates(G)))

In [18]:
nx.write_gpickle(B,'bipartite_wordsv2.p')
nx.write_gpickle(G,'network_wordsv2.p')

In [16]:
len(G.edges())

401713

In [17]:
B.number_of_nodes(), G.number_of_nodes()

(74423, 20898)

In [19]:
partitions = best_partition(G, randomize = True)

In [29]:
clt.Counter(partitions.values())

Counter({0: 2886,
         1: 4256,
         2: 4415,
         3: 2609,
         4: 3832,
         5: 2083,
         6: 534,
         7: 182,
         8: 4,
         9: 3,
         10: 2,
         11: 17,
         12: 2,
         13: 2,
         14: 2,
         15: 2,
         16: 2,
         17: 2,
         18: 2,
         19: 2,
         20: 2,
         21: 2,
         22: 3,
         23: 2,
         24: 3,
         25: 2,
         26: 2,
         27: 4,
         28: 2,
         29: 2,
         30: 2,
         31: 2,
         32: 2,
         33: 2,
         34: 2,
         35: 2,
         36: 2,
         37: 2,
         38: 2,
         39: 2,
         40: 2,
         41: 3,
         42: 2,
         43: 2,
         44: 2,
         45: 2,
         46: 2})

In [30]:
for node in partitions:
    if partitions[node] == 7:
        print(node)

melodic
gates
dual
stratosphere
started
seed
tranquillity
setup
unsullied
remain
retaining
sync
stewart
thirteenth
tiring
strain
symmetry
enemy
gardens
eucharist
gothenburg
furnished
breakers
wrongs
curtains
thump
melo
casualties
flames
believes
bids
scar
yin
bridges
race
movement
ethos
tomas
unfold
trends
cools
stigmata
stains
nilsson
scalp
ishtar
terminal
yang
projector
tranquility
richest
dawn
damage
monochromatic
formulae
peers
laine
eyebrows
doomsday
gloomiest
elektra
wages
lindberg
zebra
prophesies
rainer
bjorn
quintet
haven
prophesy
tropes
slain
knives
trampled
infusions
svensson
gallery
nbc
fluke
atg
alf
anders
debuts
larsson
haters
emulation
pastures
litanies
simmons
btg
subterranean
exposures
ushered
treasonous
hood
leszek
interpretable
occur
triton
indifferently
burning
overtures
fredrik
rearrange
superlatives
treblinka
symmetric
institution
heidi
discussed
merges
gargoyles
jesters
jester
discreet
vomits
earwigs
hypnotize
ensnare
faustus
judging
enfeebled
rigel
fornication
d