## Importing the necessary modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import nltk
import string
import re
from nltk.corpus import stopwords
import networkx as nx
from stemming.porter2 import stem
import csv

## Preprocessing the SNAP metadata

In [2]:
SNAP = open('D:/Bajaj/amazon-meta.txt/amazon-meta.txt', 'r', encoding='utf-8', errors='ignore')

In [3]:
SNAP

<_io.TextIOWrapper name='D:/Bajaj/amazon-meta.txt/amazon-meta.txt' mode='r' encoding='utf-8'>

In [4]:
for i in range(20):
    line = SNAP.readline()
    print(line)

Id:   1

ASIN: 0827229534

  title: Patterns of Preaching: A Sermon Sampler

  group: Book

  salesrank: 396585

  similar: 5  0804215715  156101074X  0687023955  0687074231  082721619X

  categories: 2

   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Preaching[12368]

   |Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Christianity[12290]|Clergy[12360]|Sermons[12370]

  reviews: total: 2  downloaded: 2  avg rating: 5

    2000-7-28  cutomer: A2JW67OY8U6HHK  rating: 5  votes:  10  helpful:   9

    2003-12-14  cutomer: A2VE83MZF98ITY  rating: 5  votes:   6  helpful:   5



Id:   2

ASIN: 0738700797

  title: Candlemas: Feast of Flames

  group: Book

  salesrank: 168596

  similar: 5  0738700827  1567184960  1567182836  0738700525  0738700940

  categories: 2



In [5]:
SNAP_dict = {}
(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

In [6]:
for line in SNAP:
    line = line.strip()

    if(line.startswith("Id")):
        Id = line[3:].strip()
        
    elif(line.startswith("ASIN")):
        ASIN = line[5:].strip()
        
    elif(line.startswith("title")):
        Title = line[6:].strip()
        Title = ' '.join(Title.split())
        
    elif(line.startswith("group")):
        Group = line[6:].strip()
        
    elif(line.startswith("salesrank")):
        SalesRank = line[10:].strip()
        
    elif(line.startswith("similar")):
        Itemset = line.split()
        Copurchased = ' '.join([character for character in Itemset[2:]])
        
    elif(line.startswith("categories")):
        Itemset = line.split()
        Categories = ' '.join((SNAP.readline()).lower() for i in range(int(Itemset[1].strip())))
        Categories = re.compile('[%s]' % re.escape(string.digits+string.punctuation)).sub(' ', Categories)
        Categories = ' '.join(set(Categories.split())-set(stopwords.words("english")))        
        Categories = ' '.join(stem(word) for word in Categories.split())
        
    elif(line.startswith("reviews")):
        ls = line.split()
        TotalReviews = ls[2].strip()
        AvgRating = ls[7].strip()
        

    elif (line==""):
        try:
            MetaData = {}
            if (ASIN != ""):
                SNAP_dict[ASIN]=MetaData
            MetaData['Id'] = Id            
            MetaData['Title'] = Title
            MetaData['Categories'] = ' '.join(set(Categories.split()))
            MetaData['Group'] = Group
            MetaData['Copurchased'] = Copurchased
            MetaData['SalesRank'] = int(SalesRank)
            MetaData['TotalReviews'] = int(TotalReviews)
            MetaData['AvgRating'] = float(AvgRating)
            MetaData['DegreeCentrality'] = DegreeCentrality
            MetaData['ClusteringCoeff'] = ClusteringCoeff
        except NameError:
            continue
        (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)
SNAP.close()

In [7]:
SNAP_dict

{'0486287785': {'Id': '3',
  'Title': 'World War II Allied Fighter Planes Trading Cards',
  'Categories': 'garden subject home hobbi general book craft',
  'Group': 'Book',
  'Copurchased': '',
  'SalesRank': 1270652,
  'TotalReviews': 1,
  'AvgRating': 5.0,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 '0842328327': {'Id': '4',
  'Title': 'Life Application Bible Commentary: 1 and 2 Timothy and Titus',
  'Categories': 'life text discipleship refer book commentari new spiritu studi general bibl histori religion guid christian applic testament live subject sacr translat',
  'Group': 'Book',
  'Copurchased': '0842328130 0830818138 0842330313 0842328610 0842328572',
  'SalesRank': 631289,
  'TotalReviews': 1,
  'AvgRating': 4.0,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 '1577943082': {'Id': '5',
  'Title': 'Prayers That Avail Much for Business: Executive',
  'Categories': 'religion subject live christian devot book worship prayerbook spiritu busi',
  'Group': 'Book',
  'C

In [None]:
with open('metadata_preprocessed.csv', 'w') as f:  
    w = csv.DictWriter(f, SNAP_dict.keys())
    w.writeheader()
    w.writerow(SNAP_dict)

In [9]:
df = pd.read_csv('encoded-metadata_preprocessed.csv')

In [10]:
df_new = df.T

In [11]:
df_new

Unnamed: 0,0
0486287785,"{'Id': '3', 'Title': 'World War II Allied Figh..."
0842328327,"{'Id': '4', 'Title': 'Life Application Bible C..."
1577943082,"{'Id': '5', 'Title': 'Prayers That Avail Much ..."
0486220125,"{'Id': '6', 'Title': 'How the Other Half Lives..."
B00000AU3R,"{'Id': '7', 'Title': 'Batik', 'Categories': 'm..."
...,...
B000059TOC,"{'Id': '548547', 'Title': 'The Drifter', 'Cate..."
B00006JBIX,"{'Id': '548548', 'Title': 'The House Of Moreco..."
0879736836,"{'Id': '548549', 'Title': 'Catholic Bioethics ..."
B00008DDST,"{'Id': '548550', 'Title': '1, 2, 3 Soleils: Ta..."


In [12]:
df_new.columns = ['Item']

In [13]:
df_new

Unnamed: 0,Item
0486287785,"{'Id': '3', 'Title': 'World War II Allied Figh..."
0842328327,"{'Id': '4', 'Title': 'Life Application Bible C..."
1577943082,"{'Id': '5', 'Title': 'Prayers That Avail Much ..."
0486220125,"{'Id': '6', 'Title': 'How the Other Half Lives..."
B00000AU3R,"{'Id': '7', 'Title': 'Batik', 'Categories': 'm..."
...,...
B000059TOC,"{'Id': '548547', 'Title': 'The Drifter', 'Cate..."
B00006JBIX,"{'Id': '548548', 'Title': 'The House Of Moreco..."
0879736836,"{'Id': '548549', 'Title': 'Catholic Bioethics ..."
B00008DDST,"{'Id': '548550', 'Title': '1, 2, 3 Soleils: Ta..."


In [14]:
df_new.to_csv('final_preprocessed_metadata.csv')

## Splitting metadata based on its group

### Books

In [30]:
Books = {}
for ASIN,item in SNAP_dict.items():
    if (item['Group']=='Book'):
        Books[ASIN] = SNAP_dict[ASIN]

In [31]:
for ASIN,item in Books.items(): 
    Books[ASIN]['Copurchased'] = ' '.join([itemset for itemset in item['Copurchased'].split() if itemset in Books.keys()])

In [32]:
Books

{'0486287785': {'Id': '3',
  'Title': 'World War II Allied Fighter Planes Trading Cards',
  'Categories': 'garden subject home hobbi general book craft',
  'Group': 'Book',
  'Copurchased': '',
  'SalesRank': 1270652,
  'TotalReviews': 1,
  'AvgRating': 5.0,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 '0842328327': {'Id': '4',
  'Title': 'Life Application Bible Commentary: 1 and 2 Timothy and Titus',
  'Categories': 'life text discipleship refer book commentari new spiritu studi general bibl histori religion guid christian applic testament live subject sacr translat',
  'Group': 'Book',
  'Copurchased': '0842328130 0842330313 0842328610 0842328572',
  'SalesRank': 631289,
  'TotalReviews': 1,
  'AvgRating': 4.0,
  'DegreeCentrality': 6,
  'ClusteringCoeff': 0.79},
 '1577943082': {'Id': '5',
  'Title': 'Prayers That Avail Much for Business: Executive',
  'Categories': 'religion subject live christian devot book worship prayerbook spiritu busi',
  'Group': 'Book',
  'Copurchased

In [33]:
len(Books)

393559

### DVDs

In [34]:
DVDs = {}
for ASIN,item in SNAP_dict.items():
    if (item['Group']=='DVD'):
        DVDs[ASIN] = SNAP_dict[ASIN]

In [35]:
for ASIN,item in DVDs.items(): 
    DVDs[ASIN]['Copurchased'] = ' '.join([itemset for itemset in item['Copurchased'].split() if itemset in DVDs.keys()])

In [36]:
DVDs

{'0790747324': {'Id': '21',
  'Title': 'The Time Machine',
  'Categories': 'whit studio amazon lloyd adventur b fantasi intern time c yvett theme actor helmor video outlet georg taylor director fi pal f general tom kingdom scienc monster sebastian l specialti genr home travel today deal unit mutant warner alan actress young sci featur store futurist rod costum bissel p com h cabot countri fiction drama hous paul dvds special mimieux classic categori free dvd dori art titl',
  'Group': 'DVD',
  'Copurchased': 'B00007JMD8 6305350221 B00004RF9B B00005JKFR B00005NG6A',
  'SalesRank': 795,
  'TotalReviews': 140,
  'AvgRating': 4.5,
  'DegreeCentrality': 20,
  'ClusteringCoeff': 0.47},
 'B000056PNC': {'Id': '37',
  'Title': 'Mark Messier - Leader, Champion & Legend',
  'Categories': 'sport general special biographi featur dvd hockey genr titl',
  'Group': 'DVD',
  'Copurchased': '',
  'SalesRank': 46018,
  'TotalReviews': 7,
  'AvgRating': 3.5,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0

In [37]:
len(DVDs)

19828

### Music CDs

In [38]:
Music_CDs = {}
for ASIN,item in SNAP_dict.items():
    if (item['Group']=='Music'):
        Music_CDs[ASIN] = SNAP_dict[ASIN]

In [39]:
for ASIN,item in Music_CDs.items(): 
    Music_CDs[ASIN]['Copurchased'] = ' '.join([itemset for itemset in item['Copurchased'].split() if itemset in Music_CDs.keys()])

In [40]:
Music_CDs

{'B00000AU3R': {'Id': '7',
  'Title': 'Batik',
  'Categories': 'postbebop import music general jazz modern style store specialti',
  'Group': 'Music',
  'Copurchased': 'B0000261KX B00006AM8D B000059OB9',
  'SalesRank': 5392,
  'TotalReviews': 3,
  'AvgRating': 4.5,
  'DegreeCentrality': 3,
  'ClusteringCoeff': 0.58},
 'B00004W1W1': {'Id': '14',
  'Title': 'Later...',
  'Categories': 'general music jazz indi style store specialti',
  'Group': 'Music',
  'Copurchased': '',
  'SalesRank': 390624,
  'TotalReviews': 0,
  'AvgRating': 0.0,
  'DegreeCentrality': 0,
  'ClusteringCoeff': 0.0},
 'B000007R0T': {'Id': '18',
  'Title': 'Sol to Soul',
  'Categories': 'jazz music general style',
  'Group': 'Music',
  'Copurchased': 'B000059QC1 B00000JQIE B000002L7F',
  'SalesRank': 109301,
  'TotalReviews': 15,
  'AvgRating': 5.0,
  'DegreeCentrality': 3,
  'ClusteringCoeff': 0.0},
 'B00005NTSV': {'Id': '22',
  'Title': 'Come What May',
  'Categories': 'broadway vocalist vocal music general jazz indi

In [41]:
len(Music_CDs)

103144

### Videos

In [42]:
Videos = {}
for ASIN,item in SNAP_dict.items():
    if (item['Group']=='Video'):
        Videos[ASIN] = SNAP_dict[ASIN]

In [43]:
for ASIN,item in Videos.items(): 
    Videos[ASIN]['Copurchased'] = ' '.join([itemset for itemset in item['Copurchased'].split() if itemset in Videos.keys()])

In [44]:
Videos

{'6303360041': {'Id': '59',
  'Title': 'None But the Lonely Heart',
  'Categories': 'life son b famili wyatt c theme barri duryea actor bohnen shayn georg general f boulton dan jane barrymor psycholog vhs genr soderl konstantin matthew walter mother actress grant g coulouri w roman fitzgerald drama classic ethel cari',
  'Group': 'Video',
  'Copurchased': '630290899X',
  'SalesRank': 13524,
  'TotalReviews': 5,
  'AvgRating': 4.0,
  'DegreeCentrality': 9,
  'ClusteringCoeff': 0.56},
 'B0000060T5': {'Id': '71',
  'Title': 'Jonny Quest - Bandit in Adventures Best Friend',
  'Categories': 'quest year charact cartoon general famili kid anim network seri jonni vhs genr',
  'Group': 'Video',
  'Copurchased': '6303953344 630395331X 6303953328 6303953301',
  'SalesRank': 21571,
  'TotalReviews': 2,
  'AvgRating': 5.0,
  'DegreeCentrality': 4,
  'ClusteringCoeff': 1.0},
 '6304286961': {'Id': '81',
  'Title': 'The Doors',
  'Categories': 'idol burkley amazon billi kathleen oliv b denni k kyle al

In [45]:
len(Videos)

26131

## Creating CSV files for each group 

In [43]:
with open('books.csv', 'w') as f:  
    w = csv.DictWriter(f, Books.keys())
    w.writeheader()
    w.writerow(Books)

In [44]:
with open('DVD.csv', 'w') as f:  
    w = csv.DictWriter(f, DVDs.keys())
    w.writeheader()
    w.writerow(DVDs)

In [45]:
with open('Music.csv', 'w') as f:  
    w = csv.DictWriter(f, Music_CDs.keys())
    w.writeheader()
    w.writerow(Music_CDs)

In [46]:
with open('Videos.csv', 'w') as f:  
    w = csv.DictWriter(f, Videos.keys())
    w.writeheader()
    w.writerow(Videos)

In [48]:
df_books = pd.read_csv('encoded-books.csv')

In [50]:
df_books = df_books.T
df_books.columns = ['item']
df_books

Unnamed: 0,item
0486287785,"{'Id': '3', 'Title': 'World War II Allied Figh..."
0842328327,"{'Id': '4', 'Title': 'Life Application Bible C..."
1577943082,"{'Id': '5', 'Title': 'Prayers That Avail Much ..."
0486220125,"{'Id': '6', 'Title': 'How the Other Half Lives..."
0231118597,"{'Id': '8', 'Title': 'Losing Matt Shepard', 'C..."
...,...
9700507734,"{'Id': '548541', 'Title': 'Para alcanzar el or..."
9627762644,"{'Id': '548542', 'Title': 'Starting a Hedge Fu..."
0970020503,"{'Id': '548543', 'Title': 'Facts Every Injured..."
1930519206,"{'Id': '548546', 'Title': 'Adobe Photoshop 6 V..."


In [51]:
df_DVD = pd.read_csv('encoded-DVD.csv')

In [52]:
df_DVD = df_DVD.T
df_DVD.columns = ['item']
df_DVD

Unnamed: 0,item
0790747324,"{'Id': '21', 'Title': 'The Time Machine', 'Cat..."
B000056PNC,"{'Id': '37', 'Title': 'Mark Messier - Leader, ..."
B000056PNB,"{'Id': '42', 'Title': 'Pixote', 'Categories': ..."
B000056PNE,"{'Id': '44', 'Title': ""The NBA's 100 Greatest ..."
B00000IC80,"{'Id': '149', 'Title': ""Pot O' Gold/Made for E..."
...,...
B00007BG8E,"{'Id': '548459', 'Title': 'Blood Work (Full Sc..."
B00007G1UZ,"{'Id': '548460', 'Title': ""Sesame Street - Let..."
B000059TOC,"{'Id': '548547', 'Title': 'The Drifter', 'Cate..."
B00006JBIX,"{'Id': '548548', 'Title': 'The House Of Moreco..."


In [53]:
df_Music = pd.read_csv('encoded-Music.csv')

In [54]:
df_Music = df_Music.T
df_Music.columns = ['item']
df_Music

Unnamed: 0,item
B00000AU3R,"{'Id': '7', 'Title': 'Batik', 'Categories': 'm..."
B00004W1W1,"{'Id': '14', 'Title': 'Later...', 'Categories'..."
B000007R0T,"{'Id': '18', 'Title': 'Sol to Soul', 'Categori..."
B00005NTSV,"{'Id': '22', 'Title': 'Come What May', 'Catego..."
B000002O8D,"{'Id': '34', 'Title': 'Southern By the Grace o..."
...,...
B00005AUTO,"{'Id': '548509', 'Title': 'Door Door', 'Catego..."
B00004XT2L,"{'Id': '548536', 'Title': 'Improvisations - Ja..."
B000065AHM,"{'Id': '548544', 'Title': 'Lucky Man', 'Catego..."
B0000508ZN,"{'Id': '548545', 'Title': 'I Need Your Loving'..."


In [55]:
df_Videos = pd.read_csv('encoded-Videos.csv')

In [56]:
df_Videos = df_Videos.T
df_Videos.columns = ['item']
df_Videos

Unnamed: 0,item
6303360041,"{'Id': '59', 'Title': 'None But the Lonely Hea..."
B0000060T5,"{'Id': '71', 'Title': 'Jonny Quest - Bandit in..."
6304286961,"{'Id': '81', 'Title': 'The Doors', 'Categories..."
B000063W82,"{'Id': '84', 'Title': 'The Best of Schoolhouse..."
B0000060TP,"{'Id': '124', 'Title': ""Dink the Little Dinosa..."
...,...
6300147738,"{'Id': '544010', 'Title': 'They Came From Beyo..."
6303315275,"{'Id': '548495', 'Title': 'Rockers', 'Categori..."
6302011019,"{'Id': '548499', 'Title': 'Beyond the Forest',..."
6300186016,"{'Id': '548506', 'Title': 'She Done Him Wrong'..."


In [57]:
df_books.to_csv('books_final.csv')
df_DVD.to_csv('DVD_final.csv')
df_Music.to_csv('Music_final.csv')
df_Videos.to_csv('Videos_final.csv')

## Creating Graphs for all four groups using NetworkX
weights of the edges will be the similarity between the nodes

In [46]:
Book_Graph = nx.Graph()

In [47]:
for ASIN,metadata in Books.items():
    Book_Graph.add_node(ASIN)
    for item in metadata['Copurchased'].split():
        Book_Graph.add_node(item.strip())
        similarity = 0        
        n1 = set((Books[ASIN]['Categories']).split())
        n2 = set((Books[item]['Categories']).split())
        if (len(n1 | n2)) > 0:
            similarity = round(len(n1 & n2)/len(n1 | n2),2)
        Book_Graph.add_edge(ASIN, item.strip(), weight=similarity)

In [48]:
Book_Graph

<networkx.classes.graph.Graph at 0x2051c88d888>

In [49]:
DVD_Graph = nx.Graph()

In [50]:
for ASIN,metadata in DVDs.items():
    DVD_Graph.add_node(ASIN)
    for item in metadata['Copurchased'].split():
        DVD_Graph.add_node(item.strip())
        similarity = 0        
        n1 = set((DVDs[ASIN]['Categories']).split())
        n2 = set((DVDs[item]['Categories']).split())
        if (len(n1 | n2)) > 0:
            similarity = round(len(n1 & n2)/len(n1 | n2),2)
        DVD_Graph.add_edge(ASIN, item.strip(), weight=similarity)

In [51]:
Music_Graph = nx.Graph()

In [52]:
for ASIN,metadata in Music_CDs.items():
    Music_Graph.add_node(ASIN)
    for item in metadata['Copurchased'].split():
        Music_Graph.add_node(item.strip())
        similarity = 0        
        n1 = set((Music_CDs[ASIN]['Categories']).split())
        n2 = set((Music_CDs[item]['Categories']).split())
        if (len(n1 | n2)) > 0:
            similarity = round(len(n1 & n2)/len(n1 | n2),2)
        Music_Graph.add_edge(ASIN, item.strip(), weight=similarity)

In [53]:
Videos_Graph = nx.Graph()

In [54]:
for ASIN,metadata in Videos.items():
    Videos_Graph.add_node(ASIN)
    for item in metadata['Copurchased'].split():
        Videos_Graph.add_node(item.strip())
        similarity = 0        
        n1 = set((Videos[ASIN]['Categories']).split())
        n2 = set((Videos[item]['Categories']).split())
        if (len(n1 | n2)) > 0:
            similarity = round(len(n1 & n2)/len(n1 | n2),2)
        Videos_Graph.add_edge(ASIN, item.strip(), weight=similarity)

## Extracting features from the graphs

In [55]:
degree_books = nx.degree(Book_Graph)
for ASIN in nx.nodes(Book_Graph):
    metadata = Books[ASIN]
    metadata['DegreeCentrality'] = int(degree_books[ASIN])
    coeff = nx.ego_graph(Book_Graph, ASIN, radius=1)
    metadata['ClusteringCoeff'] = round(nx.average_clustering(coeff),2)
    Books[ASIN] = metadata

In [56]:
degree_DVD = nx.degree(DVD_Graph)
for ASIN in nx.nodes(DVD_Graph):
    metadata = DVDs[ASIN]
    metadata['DegreeCentrality'] = int(degree_DVD[ASIN])
    coeff = nx.ego_graph(DVD_Graph, ASIN, radius=1)
    metadata['ClusteringCoeff'] = round(nx.average_clustering(coeff),2)
    DVDs[ASIN] = metadata

In [57]:
degree_Music = nx.degree(Music_Graph)
for ASIN in nx.nodes(Music_Graph):
    metadata = Music_CDs[ASIN]
    metadata['DegreeCentrality'] = int(degree_Music[ASIN])
    coeff = nx.ego_graph(Music_Graph, ASIN, radius=1)
    metadata['ClusteringCoeff'] = round(nx.average_clustering(coeff),2)
    Music_CDs[ASIN] = metadata

In [58]:
degree_Videos = nx.degree(Videos_Graph)
for ASIN in nx.nodes(Videos_Graph):
    metadata = Videos[ASIN]
    metadata['DegreeCentrality'] = int(degree_Videos[ASIN])
    coeff = nx.ego_graph(Videos_Graph, ASIN, radius=1)
    metadata['ClusteringCoeff'] = round(nx.average_clustering(coeff),2)
    Videos[ASIN] = metadata

## Creating .txt file containing the metadata information and .edgelist containing the edges and their weights

### Books

In [91]:
Books_file = open('./Books.txt', 'w', encoding='utf-8', errors='ignore')
Books_file.write("Id\t" + "ASIN\t" + "Title\t" + 
        "Categories\t" + "Group\t" 
        "SalesRank\t" + "TotalReviews\t" + "AvgRating\t"
        "DegreeCentrality\t" + "ClusteringCoeff\n")
for ASIN,metadata in Books.items():
    Books_file.write(metadata['Id'] + "\t" + \
            ASIN + "\t" + \
            metadata['Title'] + "\t" + \
            metadata['Categories'] + "\t" + \
            metadata['Group'] + "\t" + \
            str(metadata['SalesRank']) + "\t" + \
            str(metadata['TotalReviews']) + "\t" + \
            str(metadata['AvgRating']) + "\t" + \
            str(metadata['DegreeCentrality']) + "\t" + \
            str(metadata['ClusteringCoeff']) + "\n")
Books_file.close()

In [92]:
Books_edgelist=open("Books.edgelist",'wb')
nx.write_weighted_edgelist(Book_Graph, Books_edgelist)
Books_edgelist.close()

### DVDs

In [93]:
DVD_file = open('./DVD.txt', 'w', encoding='utf-8', errors='ignore')
DVD_file.write("Id\t" + "ASIN\t" + "Title\t" + 
        "Categories\t" + "Group\t" 
        "SalesRank\t" + "TotalReviews\t" + "AvgRating\t"
        "DegreeCentrality\t" + "ClusteringCoeff\n")
for ASIN,metadata in DVDs.items():
    DVD_file.write(metadata['Id'] + "\t" + \
            ASIN + "\t" + \
            metadata['Title'] + "\t" + \
            metadata['Categories'] + "\t" + \
            metadata['Group'] + "\t" + \
            str(metadata['SalesRank']) + "\t" + \
            str(metadata['TotalReviews']) + "\t" + \
            str(metadata['AvgRating']) + "\t" + \
            str(metadata['DegreeCentrality']) + "\t" + \
            str(metadata['ClusteringCoeff']) + "\n")
DVD_file.close()

In [94]:
DVD_edgelist=open("DVD.edgelist",'wb')
nx.write_weighted_edgelist(DVD_Graph, DVD_edgelist)
DVD_edgelist.close()

### Music CDs

In [95]:
Music_file = open('./Music.txt', 'w', encoding='utf-8', errors='ignore')
Music_file.write("Id\t" + "ASIN\t" + "Title\t" + 
        "Categories\t" + "Group\t" 
        "SalesRank\t" + "TotalReviews\t" + "AvgRating\t"
        "DegreeCentrality\t" + "ClusteringCoeff\n")
for ASIN,metadata in Music_CDs.items():
    Music_file.write(metadata['Id'] + "\t" + \
            ASIN + "\t" + \
            metadata['Title'] + "\t" + \
            metadata['Categories'] + "\t" + \
            metadata['Group'] + "\t" + \
            str(metadata['SalesRank']) + "\t" + \
            str(metadata['TotalReviews']) + "\t" + \
            str(metadata['AvgRating']) + "\t" + \
            str(metadata['DegreeCentrality']) + "\t" + \
            str(metadata['ClusteringCoeff']) + "\n")
Music_file.close()

In [96]:
Music_edgelist=open("Music.edgelist",'wb')
nx.write_weighted_edgelist(Music_Graph, Music_edgelist)
Music_edgelist.close()

### Videos

In [97]:
Videos_file = open('./Videos.txt', 'w', encoding='utf-8', errors='ignore')
Videos_file.write("Id\t" + "ASIN\t" + "Title\t" + 
        "Categories\t" + "Group\t" 
        "SalesRank\t" + "TotalReviews\t" + "AvgRating\t"
        "DegreeCentrality\t" + "ClusteringCoeff\n")
for ASIN,metadata in Videos.items():
    Videos_file.write(metadata['Id'] + "\t" + \
            ASIN + "\t" + \
            metadata['Title'] + "\t" + \
            metadata['Categories'] + "\t" + \
            metadata['Group'] + "\t" + \
            str(metadata['SalesRank']) + "\t" + \
            str(metadata['TotalReviews']) + "\t" + \
            str(metadata['AvgRating']) + "\t" + \
            str(metadata['DegreeCentrality']) + "\t" + \
            str(metadata['ClusteringCoeff']) + "\n")
Videos_file.close()

In [98]:
Videos_edgelist=open("Videos.edgelist",'wb')
nx.write_weighted_edgelist(Videos_Graph, Videos_edgelist)
Videos_edgelist.close()

## Creating a merged dataset of userdata and metadata

In [18]:
items = list(SNAP_dict.items())

In [19]:
ls = []
for i, j in enumerate(items):
    colnames = list(j[1])
    colnames.append("ASIN")
    vals = list(j[1].values())
    vals.append(j[0])
    ls.append(vals)
df = pd.DataFrame(ls, columns=colnames)

In [20]:
df

Unnamed: 0,Id,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating,DegreeCentrality,ClusteringCoeff,ASIN
0,3,World War II Allied Fighter Planes Trading Cards,garden subject home hobbi general book craft,Book,,1270652,1,5.0,0,0.0,0486287785
1,4,Life Application Bible Commentary: 1 and 2 Tim...,life text discipleship refer book commentari n...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
2,5,Prayers That Avail Much for Business: Executive,religion subject live christian devot book wor...,Book,157794349X 0892749504 1577941829 0892749563 15...,455160,0,0.0,0,0.0,1577943082
3,6,How the Other Half Lives: Studies Among the Te...,book urban essay general america scienc camera...,Book,0486401960 0452283612 0486229076 0714840343 03...,188784,17,4.0,0,0.0,0486220125
4,7,Batik,postbebop import music general jazz modern sty...,Music,B00002616C B0000261KX B00006AM8D B000059OB9 B0...,5392,3,4.5,0,0.0,B00000AU3R
...,...,...,...,...,...,...,...,...,...,...,...
548544,548547,The Drifter,horror jo amazon kim b ann actor outlet willet...,DVD,630366704X B0002ERXB8 B0001932ZU B0001VTPUE B0...,0,1,5.0,0,0.0,B000059TOC
548545,548548,The House Of Morecock,independ comedi h distribut general hous anim ...,DVD,B0002HOE6C B0002I84JO B00004WZQN B00069CQ8E B0...,0,8,3.0,0,0.0,B00006JBIX
548546,548549,Catholic Bioethics and the Gift of Human Life,sociolog religion social nonfict subject philo...,Book,1931709920 188187110X 081890643X 1580510469 08...,0,1,4.0,0,0.0,0879736836
548547,548550,"1, 2, 3 Soleils: Taha, Khaled, Faudel",store amazon com today deal music general spec...,DVD,B00012FWNC B0002UNQQI B00069FKLO B0000CNTHZ B0...,0,3,5.0,0,0.0,B00008DDST


In [21]:
df.to_csv('Final_Metadata.csv')

In [66]:
df_userdata = pd.read_csv('D:/Bajaj/amazon0601.txt/Amazon0601.txt',sep="\t")

In [67]:
df_userdata.columns = ['Id','target']

In [68]:
df_userdata

Unnamed: 0,Id,target
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
3387383,403392,121379
3387384,403392,190663
3387385,403393,318438
3387386,403393,326962


In [69]:
df_source = df_userdata[['Id']]
df_source

Unnamed: 0,Id
0,0
1,0
2,0
3,0
4,0
...,...
3387383,403392
3387384,403392
3387385,403393
3387386,403393


In [70]:
df_destination = df_userdata[['target']]
df_destination

Unnamed: 0,target
0,1
1,2
2,3
3,4
4,5
...,...
3387383,121379
3387384,190663
3387385,318438
3387386,326962


In [71]:
df['Id']=df['Id'].astype(int)
df_source['Id']=df_source['Id'].astype(int)

In [72]:
df_new = pd.merge(df,df_source,on='Id')

In [73]:
df_new

Unnamed: 0,Id,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating,DegreeCentrality,ClusteringCoeff,ASIN
0,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
1,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
2,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
3,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
4,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
...,...,...,...,...,...,...,...,...,...,...,...
3387353,403392,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969,12,4.5,0,0.0,6305498768
3387354,403392,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969,12,4.5,0,0.0,6305498768
3387355,403393,Louis L'Amour Collection,book genr literatur tape literari general west...,Book,0553714597 0553803573 0739313681 0553280902 05...,781804,1,4.0,0,0.0,1565117360
3387356,403393,Louis L'Amour Collection,book genr literatur tape literari general west...,Book,0553714597 0553803573 0739313681 0553280902 05...,781804,1,4.0,0,0.0,1565117360


In [74]:
df_destination.rename(columns = {'target':'Id'}, inplace = True)

In [75]:
df_new2 = pd.merge(df,df_destination,on='Id')

In [76]:
df_new2

Unnamed: 0,Id,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating,DegreeCentrality,ClusteringCoeff,ASIN
0,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
1,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
2,3,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
3,4,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
4,4,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
...,...,...,...,...,...,...,...,...,...,...,...
3387373,403390,Alaska RV Adventure: The Last Great Road Trip,travel com unit state interest special genr ae...,Video,B00006307C,16629,0,0.0,0,0.0,0972031723
3387374,403390,Alaska RV Adventure: The Last Great Road Trip,travel com unit state interest special genr ae...,Video,B00006307C,16629,0,0.0,0,0.0,0972031723
3387375,403391,When Someone Has a Very Serious Illness: Child...,health book scienc children nonfict work natur...,Book,0962050202 0962050237 0870293214 0962050229 09...,152602,0,0.0,0,0.0,0962050245
3387376,403392,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969,12,4.5,0,0.0,6305498768


In [77]:
df_new2.rename(columns = {'Id':'target'}, inplace = True)

In [78]:
df_new.rename(columns = {'Id':'source'}, inplace = True)

In [79]:
df_new = df_new.dropna()
df_new2 = df_new2.dropna()

In [80]:
df_merged = pd.concat([df_new,df_new2],axis=1)

In [81]:
df_merged

Unnamed: 0,source,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating,DegreeCentrality,ClusteringCoeff,...,Title.1,Categories.1,Group.1,Copurchased.1,SalesRank.1,TotalReviews.1,AvgRating.1,DegreeCentrality.1,ClusteringCoeff.1,ASIN
0,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
1,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
2,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
3,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
4,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387373,,,,,,,,,,,...,Alaska RV Adventure: The Last Great Road Trip,travel com unit state interest special genr ae...,Video,B00006307C,16629,0,0.0,0,0.0,0972031723
3387374,,,,,,,,,,,...,Alaska RV Adventure: The Last Great Road Trip,travel com unit state interest special genr ae...,Video,B00006307C,16629,0,0.0,0,0.0,0972031723
3387375,,,,,,,,,,,...,When Someone Has a Very Serious Illness: Child...,health book scienc children nonfict work natur...,Book,0962050202 0962050237 0870293214 0962050229 09...,152602,0,0.0,0,0.0,0962050245
3387376,,,,,,,,,,,...,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969,12,4.5,0,0.0,6305498768


In [82]:
df_merged = df_merged.dropna()

In [83]:
df_merged

Unnamed: 0,source,Title,Categories,Group,Copurchased,SalesRank,TotalReviews,AvgRating,DegreeCentrality,ClusteringCoeff,...,Title.1,Categories.1,Group.1,Copurchased.1,SalesRank.1,TotalReviews.1,AvgRating.1,DegreeCentrality.1,ClusteringCoeff.1,ASIN
0,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
1,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
2,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652,1,5.0,0,0.0,0486287785
3,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
4,3.0,World War II Allied Fighter Planes Trading Cards,book home hobbi general subject garden craft,Book,,1270652.0,1.0,5.0,0.0,0.0,...,Life Application Bible Commentary: 1 and 2 Tim...,discipleship studi translat general subject bo...,Book,0842328130 0830818138 0842330313 0842328610 08...,631289,1,4.0,0,0.0,0842328327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387353,403392.0,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969.0,12.0,4.5,0.0,0.0,...,Lie Algebras in Particle Physics (Frontiers in...,book scienc technic state mathemat group abstr...,Book,0521831431 0750306068 0201503972 0691010196 05...,373938,8,4.0,0,0.0,0738202339
3387354,403392.0,Tune,genr comedi anim general vhs,Video,B0000687D2 B0000A1HR8 B00008ZZ8O B00065GX5U B0...,3969.0,12.0,4.5,0.0,0.0,...,Lie Algebras in Particle Physics (Frontiers in...,book scienc technic state mathemat group abstr...,Book,0521831431 0750306068 0201503972 0691010196 05...,373938,8,4.0,0,0.0,0738202339
3387355,403393.0,Louis L'Amour Collection,book genr literatur tape literari general west...,Book,0553714597 0553803573 0739313681 0553280902 05...,781804.0,1.0,4.0,0.0,0.0,...,CD-ROM TestPrep t/a Pearson: AN INTRODUCTION T...,refer languag book general communic subject word,Book,007282574X,1749811,4,3.0,0,0.0,0072428201
3387356,403393.0,Louis L'Amour Collection,book genr literatur tape literari general west...,Book,0553714597 0553803573 0739313681 0553280902 05...,781804.0,1.0,4.0,0.0,0.0,...,CD-ROM TestPrep t/a Pearson: AN INTRODUCTION T...,refer languag book general communic subject word,Book,007282574X,1749811,4,3.0,0,0.0,0072428201
