# GDMA Project
Author: Julian Schelb (1069967)

In [1]:
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from neo4j import GraphDatabase

### Connection to the database instance

In [2]:
# Connection settings are read from the environment so that no secret is stored
# in the notebook. Each setting falls back to the local default used by this
# project; the password has no default and is prompted for when NEO4J_PASSWORD
# is not set (set the variable for non-interactive runs).
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
database_name = os.environ.get("NEO4J_DATABASE", "cddb")

# One session is shared by every query cell below.
session = driver.session(database=database_name)

### Task 3: SQL to Cypher
Translate the following SQL queries to Cypher:

**Query 1:**


List all unique artists who published a CD in the year 2000.

``` sql
SELECT DISTINCT artist
FROM artists
NATURAL JOIN artist2album
NATURAL JOIN cds
WHERE ayear = 2000
```

In [3]:
# Query 1: distinct artist names reachable via CONTAINS from CD nodes whose
# ayear property equals 2000.
query = """
MATCH (c:CD)-[r:CONTAINS]->(ar:Artist)
WHERE c.ayear = 2000
RETURN DISTINCT ar.artist
"""

# Convert every returned record into a dict; pandas derives the column names
# from the dict keys.
dtf_data = pd.DataFrame(list(map(dict, session.run(query))))
dtf_data

Unnamed: 0,ar.artist
0,the frequency benders
1,boyz ii men
2,adriana calcanhoto
3,syl johnson
4,cheo feliciano
...,...
5619,garry harrison
5620,catuaba com amendoim
5621,jimmy powells
5622,ofra haza


***

**Query 2:**


Calculate the maximum and the average publication year of all CDs published between 1900 and 2012.

``` sql
SELECT MAX(ayear), AVG(ayear)
FROM cddb.cds
WHERE ayear BETWEEN 1900 AND 2012
```

In [4]:
# Query 2: maximum and average ayear over CD nodes whose ayear lies in the
# inclusive range 1900..2012 (the counterpart of SQL BETWEEN).
# The aggregation yields a single row, so no LIMIT clause is needed; the SQL
# statement being translated has none either.
query = """
MATCH (c:CD)
WHERE 1900 <= c.ayear <= 2012
RETURN max(c.ayear), avg(c.ayear)
"""

# One dict per returned record; the dict keys become the DataFrame columns.
dtf_data = pd.DataFrame([dict(record) for record in session.run(query)])
dtf_data

Unnamed: 0,max(c.ayear),avg(c.ayear)
0,2011,1994.247219


***

**Query 3:**


List all genres together with the number of published CDs per genre, keeping only genres where that count is 1000 or more. Order descending by CD count.

``` sql
SELECT genre , COUNT(cdid) AS cds
FROM cds NATURAL JOIN genres g
GROUP BY g.genreid , genre
HAVING COUNT(cdid) >= 1000
ORDER BY cds DESC
```

In [5]:
# Query 3: number of CDs per genre, restricted to genres with at least 1000 CDs,
# ordered by that count in descending order.
# The grouping key is the Genre node itself rather than only its name. This
# mirrors the SQL `GROUP BY g.genreid, genre`: two distinct genres that happen
# to share a name are not merged into one row.
# WITH ... WHERE plays the role of SQL HAVING (filtering after aggregation).
query = """
MATCH (c:CD)-[:BELONGS_TO]->(g:Genre)
WITH g, count(c) AS cds
WHERE cds >= 1000
RETURN g.genre AS genre, cds
ORDER BY cds DESC
"""

# One dict per returned record; the dict keys become the DataFrame columns.
dtf_data = pd.DataFrame([dict(record) for record in session.run(query)])
dtf_data

Unnamed: 0,genre,cds
0,classical,34069
1,blues,13232
2,country,11951
3,jazz,8742
4,opera,4045
5,rock,3813
6,folk,3238
7,pop,2897
8,new age,1838
9,r&b,1270


***

**Query 4:**


List artists and albums which are associated with a song containing the phrase "moonlight shadow".

``` sql
SELECT artist, album
FROM cddb.artists
NATURAL JOIN cddb.artist2album
NATURAL JOIN cddb.albums
NATURAL JOIN cddb.cds
NATURAL JOIN cddb.cdtracks
NATURAL JOIN cddb.songs
WHERE song LIKE '%moonlight shadow%'
```

In [6]:
# Query 4: artist/album pairs of every CD that contains a song whose title
# includes the phrase "moonlight shadow".
# The substring test uses the CONTAINS string operator instead of a regular
# expression: it is the direct counterpart of SQL LIKE '%...%', and unlike the
# regex '.' it is not affected by line terminators inside the title.
# No DISTINCT is applied, matching the SQL statement, so a pair appears once
# per matching song.
query = """
MATCH (c:CD)-[:CONTAINS]->(ar:Artist)
MATCH (c)-[:CONTAINS]->(ab:Album)
MATCH (c)-[:CONTAINS]->(s:Song)
WHERE s.song CONTAINS 'moonlight shadow'
RETURN ar.artist AS artist, ab.album AS album
"""

# One dict per returned record; the dict keys become the DataFrame columns.
dtf_data = pd.DataFrame([dict(record) for record in session.run(query)])
dtf_data

Unnamed: 0,artist,album
0,groove coverage,best of
1,groove coverage,best of
2,groove coverage,best of
3,aselin debison,moonlight shadow
4,the shadows,life storyof
5,koto,koto
6,koto,plays synthesizer world hits
7,mike oldfield,crises
8,dana winner,unforgettable
9,mike oldfield,art in heaven (live in berlin);
