In [1]:
%reload_ext jupyter-rdfify

### Analyzing amount of movies per year

In [2]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?year (COUNT(DISTINCT ?movie) as ?amountMovies)
WHERE {
    ?movie a lmdb:Film ;
            dc:date ?date .
    BIND(SUBSTR(?date, 1, 4) AS ?year) .
}
GROUP BY ?year
ORDER BY ?year

## The knowledge graph contains movies that were released between 1888 and 2013
## The results show some obvious faulty data in the knowledge graph:
## 1 movie from 1004
## 1 movie from 2075

year,amountMovies
,13174^^http://www.w3.org/2001/XMLSchema#integer
/gui,2^^http://www.w3.org/2001/XMLSchema#integer
1004,1^^http://www.w3.org/2001/XMLSchema#integer
1888,2^^http://www.w3.org/2001/XMLSchema#integer
1889,1^^http://www.w3.org/2001/XMLSchema#integer
1891,2^^http://www.w3.org/2001/XMLSchema#integer
1892,4^^http://www.w3.org/2001/XMLSchema#integer
1893,2^^http://www.w3.org/2001/XMLSchema#integer
1894,9^^http://www.w3.org/2001/XMLSchema#integer
1895,15^^http://www.w3.org/2001/XMLSchema#integer


In [3]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?movie ?title ?year
WHERE {
    ?movie a lmdb:Film ;
             dc:title ?title ;
             dc:date ?date .
    BIND(SUBSTR(?date, 1, 4) AS ?year) .
    FILTER(?year in ("1004", "2012", "2013", "2075"))
}
ORDER BY ?year

## Movies in 2012 partly don't contain true information
## Compare Deadpool which was releaed 2016, not 2012
## Possible explanation:
## Data was added to the knowledge graph before 2012 (probably 2010)
## Movies that have a later release year were announced movies at that time

movie,title,year
<https://triplydb.com/Triply/linkedmdb/id/film/34426>,The Best Thief in the World,1004
<https://triplydb.com/Triply/linkedmdb/id/film/45021>,Crood Awakening,2012
<https://triplydb.com/Triply/linkedmdb/id/film/53953>,Newt,2012
<https://triplydb.com/Triply/linkedmdb/id/film/71763>,The Hobbit 2,2012
<https://triplydb.com/Triply/linkedmdb/id/film/73431>,Madagascar 3,2012
<https://triplydb.com/Triply/linkedmdb/id/film/97044>,Dr. Seuss' The Lorax,2012
<https://triplydb.com/Triply/linkedmdb/id/film/53954>,King of the Elves,2012
<https://triplydb.com/Triply/linkedmdb/id/film/86971>,The Guardians,2012
<https://triplydb.com/Triply/linkedmdb/id/film/97038>,Hotel Transylvania,2012
<https://triplydb.com/Triply/linkedmdb/id/film/81402>,The Avengers,2012


### Looking for duplicates in data (Same movie title, same release year but different ids)

In [4]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?title ?movie1 ?movie2 ?date1 ?date2 ?year1 ?year2
WHERE {
    ?movie1 dc:title ?title ;
            dc:date ?date1 .
    ?movie2 dc:title ?title ;
            dc:date ?date2 .
  BIND(SUBSTR(?date1, 1, 4) AS ?year1) .
  BIND(SUBSTR(?date2, 1, 4) AS ?year2) .
  FILTER(?movie1 < ?movie2 && ?title != "" && ?year1 != "" && ?year2 != "" && ?year1 = ?year2)
}
LIMIT 200

## Duplicates can be found anywhere. It is difficult to distinguish them based on the filmid
## A larger list (10.000) of results can be obtained by applying this query without limiting on 
## https://triplydb.com/Triply/linkedmdb/sparql/linkedmdb

title,movie1,movie2,date1,date2,year1,year2
A Doll's House,<https://triplydb.com/Triply/linkedmdb/id/film/10051>,<https://triplydb.com/Triply/linkedmdb/id/film/15874>,1973,1973-05-22,1973,1973
Badri,<https://triplydb.com/Triply/linkedmdb/id/film/10513>,<https://triplydb.com/Triply/linkedmdb/id/film/48536>,2000-04-20,2000-04-20,2000,2000
Badri,<https://triplydb.com/Triply/linkedmdb/id/film/10513>,<https://triplydb.com/Triply/linkedmdb/id/film/26289>,2000-04-20,2000-04-20,2000,2000
Anna Christie,<https://triplydb.com/Triply/linkedmdb/id/film/11412>,<https://triplydb.com/Triply/linkedmdb/id/film/38795>,1931,1931,1931,1931
Anna Christie,<https://triplydb.com/Triply/linkedmdb/id/film/11412>,<https://triplydb.com/Triply/linkedmdb/id/film/25727>,1931,1931,1931,1931
1984,<https://triplydb.com/Triply/linkedmdb/id/film/1523>,<https://triplydb.com/Triply/linkedmdb/id/film/38932>,1984-10-10,1984-01-22,1984,1984
Alice's Restaurant,<https://triplydb.com/Triply/linkedmdb/id/film/17923>,<https://triplydb.com/Triply/linkedmdb/id/film/1022>,1969-08-20,1969-08-20,1969,1969
Johnny Angel,<https://triplydb.com/Triply/linkedmdb/id/film/30017>,<https://triplydb.com/Triply/linkedmdb/id/film/45198>,1945-12-27,1945-12-27,1945,1945
Alibi,<https://triplydb.com/Triply/linkedmdb/id/film/38260>,<https://triplydb.com/Triply/linkedmdb/id/film/25519>,1929-04-20,"1929-04-19,1929-04-20",1929,1929
Anna Christie,<https://triplydb.com/Triply/linkedmdb/id/film/38795>,<https://triplydb.com/Triply/linkedmdb/id/film/25727>,1931,1931,1931,1931


### Looking for amount of duplicates per year

In [5]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?year1 (COUNT(DISTINCT ?movie1) as ?amountMovies)
WHERE {
    ?movie1 dc:title ?title ;
            dc:date ?date1 .
    ?movie2 dc:title ?title ;
            dc:date ?date2 .
  BIND(SUBSTR(?date1, 1, 4) AS ?year1) .
  BIND(SUBSTR(?date2, 1, 4) AS ?year2) .
  FILTER(?movie1 < ?movie2 && ?title != "" && ?year1 != "" && ?year2 != "" && ?year1 = ?year2)
}
GROUP BY ?year1
ORDER BY ?year1

year1,amountMovies
1888,1^^http://www.w3.org/2001/XMLSchema#integer
1891,1^^http://www.w3.org/2001/XMLSchema#integer
1892,1^^http://www.w3.org/2001/XMLSchema#integer
1893,1^^http://www.w3.org/2001/XMLSchema#integer
1894,1^^http://www.w3.org/2001/XMLSchema#integer
1897,1^^http://www.w3.org/2001/XMLSchema#integer
1898,3^^http://www.w3.org/2001/XMLSchema#integer
1899,1^^http://www.w3.org/2001/XMLSchema#integer
1900,1^^http://www.w3.org/2001/XMLSchema#integer
1901,1^^http://www.w3.org/2001/XMLSchema#integer


### Example Query for finding a movie

In [6]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?movie ?date (GROUP_CONCAT(?directorName; SEPARATOR=", ") AS ?directors)
WHERE {
    ?director a lmdb:Director ;
                lmdb:director_name ?directorName .
    ?movie a lmdb:Film ;
             lmdb:director ?director ;
             dc:title "Alice in Wonderland" ;
             dc:date ?date .
}

## The same director is occurring multiple times due to the fact 
## that there are duplicates of directors in the knowledge graph

movie,date,directors
<https://triplydb.com/Triply/linkedmdb/id/film/4368>,1999,"Nick Willing, Michael Winterbottom, Nick Willing"
<https://triplydb.com/Triply/linkedmdb/id/film/4369>,1903,"Cecil Hepworth, Cecil Hepworth"
<https://triplydb.com/Triply/linkedmdb/id/film/548>,1985,"Harry Harris, Harry Harris, Harry Harris"
<https://triplydb.com/Triply/linkedmdb/id/film/3476>,1976-12-10,Bud Townsend
<https://triplydb.com/Triply/linkedmdb/id/film/65505>,2010-03-05,Tim Burton
<https://triplydb.com/Triply/linkedmdb/id/film/115>,"1933,1933-12-22",Norman Z. McLeod
<https://triplydb.com/Triply/linkedmdb/id/film/1388>,1951-07-28,"Wilfred Jackson, Wilfred Jackson, Clyde Geronimi, Hamilton Luske, Clyde Geronimi"


In [7]:
%%rdf sparql --endpoint https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lmdb: <https://triplydb.com/Triply/linkedmdb/vocab/>
PREFIX dc: <http://purl.org/dc/terms/>

SELECT distinct ?movie ?date (GROUP_CONCAT(DISTINCT ?directorName; SEPARATOR=", ") AS ?directors)
WHERE {
    ?director a lmdb:Director ;
                lmdb:director_name ?directorName .
    ?movie a lmdb:Film ;
             lmdb:director ?director ;
             dc:title "Alice in Wonderland" ;
             dc:date ?date .
}

## Problem solved by only considering distinct director names

movie,date,directors
<https://triplydb.com/Triply/linkedmdb/id/film/4368>,1999,"Michael Winterbottom, Nick Willing"
<https://triplydb.com/Triply/linkedmdb/id/film/4369>,1903,Cecil Hepworth
<https://triplydb.com/Triply/linkedmdb/id/film/548>,1985,Harry Harris
<https://triplydb.com/Triply/linkedmdb/id/film/3476>,1976-12-10,Bud Townsend
<https://triplydb.com/Triply/linkedmdb/id/film/65505>,2010-03-05,Tim Burton
<https://triplydb.com/Triply/linkedmdb/id/film/115>,"1933,1933-12-22",Norman Z. McLeod
<https://triplydb.com/Triply/linkedmdb/id/film/1388>,1951-07-28,"Clyde Geronimi, Hamilton Luske, Wilfred Jackson"


### Function to get a list of movies based on a given title

In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON

LMDB_PREFIX = "<https://triplydb.com/Triply/linkedmdb/vocab/>"
SPARQL_ENDPOINT = "https://api.triplydb.com/datasets/Triply/linkedmdb/services/linkedmdb/sparql"

def get_movie_id(movie_title):
    query_str = f"""
    PREFIX lmdb: {LMDB_PREFIX}

    SELECT distinct ?movie ?year (GROUP_CONCAT(DISTINCT ?directorName; SEPARATOR=", ") AS ?directors)
    WHERE {{
      ?director a lmdb:Director .
      ?director lmdb:director_name ?directorName .
      ?movie a lmdb:Film .
      ?movie <http://purl.org/dc/terms/title> "{movie_title}" .
      ?movie lmdb:director ?director .
      ?movie <http://purl.org/dc/terms/date> ?date .
      BIND(SUBSTR(?date, 1, 4) AS ?year) .
    }}
    """
    
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query_str)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    movie_ids = []
    for result in results["results"]["bindings"]:
        movie_id = result["movie"]["value"].split('/')[-1]
        movie_director = result["directors"]["value"]
        movie_year = result["year"]["value"]
        movie_ids.append({"movie_id": movie_id, "movie_title": movie_title, "movie_director": movie_director, "movie_year": movie_year})

    return movie_ids

def print_movies(movie_list):
    print("Found movies")
    for i in range(len(movie_list)):
        print(f"{i} - {movie_list[i]['movie_title']} ({movie_list[i]['movie_year']}, {movie_list[i]['movie_director']})")

In [9]:
print_movies(get_movie_id("Titanic"))

Found movies
0 - Titanic (1943, Herbert Selpin, Werner Klingler)
1 - Titanic (1953, Jean Negulesco)
2 - Titanic (1997, James Cameron)
3 - Titanic (1996, Robert Lieberman)


In [10]:
print_movies(get_movie_id("Apocalypse Now"))

## Problem: Duplicates
## Possible options:
## 1.) Show all alternatives
## 2.) Show the option with the smallest/largest movie id

Found movies
0 - Apocalypse Now (1979, Francis Ford Coppola)
1 - Apocalypse Now (1979, Francis Ford Coppola)
