# BDCC project - data set overview

**[Big Data and Cloud Computing](https://www.dcc.fc.up.pt/~edrdo/aulas/bdcc), Project 1**


## Spark setup

In [0]:

def setupSpark():
  # Spark needs to run with Java 8 ... 
  !pip install -q findspark
  !apt-get install openjdk-8-jdk-headless > /dev/null
  !echo 2 | update-alternatives --config java > /dev/null
  # !java -version
  import os, findspark
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
  # !echo JAVA_HOME=$JAVA_HOME
  !pip install -q pyspark
  findspark.init(spark_home='/usr/local/lib/python3.6/dist-packages/pyspark')
  !pyspark --version

# Install Java/PySpark first so the pyspark imports below can succeed.
setupSpark()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Local-mode session (master 'local[*]'); getOrCreate reuses an existing one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

[K     |████████████████████████████████| 217.8MB 54kB/s 
[K     |████████████████████████████████| 204kB 50.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.5
      /_/
                        
Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_242
Branch HEAD
Compiled by user centos on 2020-02-02T19:38:06Z
Revision cee4ecbb16917fa85f02c635925e2687400aa56b
Url https://gitbox.apache.org/repos/asf/spark.git
Type --help for more information.


In [0]:
# Mount Google Drive under /content/drive; the first run asks for an
# authorization code interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Export the path of the key file stored on the mounted Drive through
# GOOGLE_APPLICATION_CREDENTIALS, then echo it back as a sanity check.
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/drive/My Drive/bdcc-colab.json' 
!echo $GOOGLE_APPLICATION_CREDENTIALS

/content/drive/My Drive/bdcc-colab.json


In [0]:
from google.cloud import storage

# Print the name of every bucket the storage client can list for its project.
storage_client = storage.Client()
buckets = storage_client.list_buckets()
print('-- List of buckets in project "' + storage_client.project + '"')

for bucket in buckets:
  print(bucket.name)

-- List of buckets in project "bdcc20-p1"
bdcc20-movie_data


In [0]:
# To enable the GPU access Edit > Notebook settings and set the Hardware accelerator to GPU.

%tensorflow_version 2.x 
import tensorflow as tf

print("GPU device: " + tf.test.gpu_device_name())

from tensorflow.python.client import device_lib

tf_devices = device_lib.list_local_devices()

for x in tf_devices:
  print('------')
  print(x)


GPU device: 
------
name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15402149842273220958

------
name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12902270202234098410
physical_device_desc: "device: XLA_CPU device"



## Parameters

In [0]:

# Notebook parameters, editable through the Colab form fields below.
# PROJECT_ID is passed to google_colab_authenticate(); BUCKET and DATASET
# build the gs:// source path for the dataset copy, and DATASET is also the
# local directory that readDataSet() loads the Parquet files from.
#@markdown ---
PROJECT_ID = 'bdcc20-p1'  #@param {type: "string"}
BUCKET = 'bdcc20-movie_data/bdcc1920_project_datasets' #@param {type: "string"}
DATASET = "tiny1" #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"]
#@markdown ---


## Authenticate to GCP

In [0]:
# The authentication method 
def google_colab_authenticate(projectId, keyFile=None, debug=True):  
    import os
    from google.colab import auth
    if keyFile == None:
      keyFile='/content/bdcc-colab.json'
    if os.access(keyFile,os.R_OK):
      if debug:
        print('Using key file "%s"' % keyFile)
      os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '%s' % keyFile
      os.environ['GCP_PROJECT'] = projectId 
      os.environ['GCP_ACCOUNT'] = 'bdcc-colab@' + projectId + '.iam.gserviceaccount.com'
      !gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" --project="$GCP_PROJECT"
    else:
      if debug:
        print('No key file given. You may be redirected to the verification code procedure.')
      auth.authenticate_user()
      !gcloud config set project $projectId
    !gcloud info | grep -e Account -e Project

# Copy the key file from Google Drive, if it exists there, to
# /content/bdcc-colab.json: a path without spaces (spaces in the path
# usually create problems).
!test -f "/content/drive/My Drive/bdcc-colab.json" && cp "/content/drive/My Drive/bdcc-colab.json" /content/bdcc-colab.json

# No keyFile argument: the function falls back to /content/bdcc-colab.json.
google_colab_authenticate(PROJECT_ID)

Using key file "/content/bdcc-colab.json"
Activated service account credentials for: [bdcc-cloud@bdcc20-p1.iam.gserviceaccount.com]
Account: [bdcc-cloud@bdcc20-p1.iam.gserviceaccount.com]
Project: [bdcc20-p1]


## Transfer dataset files if necessary

In [0]:
# List the working directory, copy the dataset folder from the bucket only
# when no local directory named $DATASET exists yet, then report its disk usage.
!ls
!test -d $DATASET || gsutil -m cp -r gs://"$BUCKET"/"$DATASET" .
!du --human $DATASET

bdcc-colab.json  drive	sample_data
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/actors.parquet/._SUCCESS.crc...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/actors.parquet/part-00004-f6d6a44d-0a01-458c-af85-94c613c72cb8-c000.snappy.parquet...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/genres.parquet/._SUCCESS.crc...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/genres.parquet/.part-00000-49cf2b70-c7dc-4f9a-96ea-02fc0fec4eae-c000.snappy.parquet.crc...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/actors.parquet/.part-00000-f6d6a44d-0a01-458c-af85-94c613c72cb8-c000.snappy.parquet.crc...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/genres.parquet/.part-00005-49cf2b70-c7dc-4f9a-96ea-02fc0fec4eae-c000.snappy.parquet.crc...
Copying gs://bdcc20-movie_data/bdcc1920_project_datasets/tiny1/actors.parquet/.part-00004-f6d6a44d-0a01-458c-af85-94c613c72cb8-c000.snappy.parquet.crc...
Copying gs

## Load data from Parquet files


In [0]:
def readParquet(file, debug=False):
    """Load one Parquet path through the notebook-wide ``spark`` session.

    file  -- path handed to ``spark.read.parquet``.
    debug -- when true, print a '==> Reading <file>' trace line first.

    Returns whatever ``spark.read.parquet(file)`` returns.
    """
    if debug:
        print('==> Reading ' + file)
    return spark.read.parquet(file)
 
def readDataSet(debug=False):
  """(Re)load the five dataset tables into module-level globals.

  Reads ``<DATASET>/<table>.parquet`` via ``readParquet`` for movies, actors,
  genres, ratings and tags, in that order, and binds each result to the
  global of the same name. ``debug`` is forwarded to ``readParquet``.
  """
  global movies, actors, genres, ratings, tags

  def load(suffix):
    # DATASET is the module-level dataset directory name.
    return readParquet(DATASET + suffix, debug)

  movies = load('/movies.parquet')
  actors = load('/actors.parquet')
  genres = load('/genres.parquet')
  ratings = load('/ratings.parquet')
  tags = load('/tags.parquet')

# Load all five tables into globals, printing each path as it is read.
readDataSet(debug=True)

==> Reading tiny1/movies.parquet
==> Reading tiny1/actors.parquet
==> Reading tiny1/genres.parquet
==> Reading tiny1/ratings.parquet
==> Reading tiny1/tags.parquet


## View summary information

In [0]:
def summary(df, name, distAttr,byMovie=False):
  """Print an overview of ``df``: aggregate counts, schema and first 10 rows.

  df       -- DataFrame to summarise; the query counts its ``movieId`` column,
              so that column must exist.
  name     -- label shown in the report; also the name of the temporary view
              registered for the duration of the call.
  distAttr -- column whose distinct values are counted in ``unique``.
  byMovie  -- currently unused; kept so existing callers keep working.

  Reads the globals ``spark`` and ``MOVIE_COUNT``; ``MOVIE_COUNT`` must be
  assigned before the first call. ``name`` and ``distAttr`` are interpolated
  into the SQL text unescaped, so pass trusted identifiers only.
  """
  # Build the query first so a missing MOVIE_COUNT fails before any view exists.
  query = """
      SELECT '%s' AS df, 
             COUNT(*) AS count, 
             COUNT(DISTINCT %s) AS unique,
             COUNT(DISTINCT movieId) AS unique_movies, 
             %d - COUNT(DISTINCT movieId) AS movies_without_data, 
             COUNT(*) / %d AS avg_by_movie
      FROM %s
      """ % (name, distAttr, MOVIE_COUNT, MOVIE_COUNT, name)
  df.createOrReplaceTempView(name)
  try:
    sdf = spark.sql(query)
    print("== %s ==" % name)
    sdf.show()
    df.printSchema()
    df.limit(10).show()
  finally:
    # The view is a session-local temp view, so it must be removed with
    # dropTempView; dropGlobalTempView only looks at global temp views and
    # left this one registered after every call.
    spark.catalog.dropTempView(name)

### Movies

In [0]:
# summary() reads MOVIE_COUNT as a global, so it has to be assigned before
# the first summary() call.
MOVIE_COUNT = movies.count()
summary(movies, 'movies', 'movieId')

== movies ==
+------+-----+------+-------------+-------------------+------------+
|    df|count|unique|unique_movies|movies_without_data|avg_by_movie|
+------+-----+------+-------------+-------------------+------------+
|movies|   10|    10|           10|                  0|         1.0|
+------+-----+------+-------------+-------------------+------------+

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- imdbId: integer (nullable = true)

+-------+--------------------+----+------+
|movieId|               title|year|imdbId|
+-------+--------------------+----+------+
|      1|           Toy Story|1995|114709|
|      2|             Jumanji|1995|113497|
|      3|    Grumpier Old Men|1995|113228|
|      4|   Waiting to Exhale|1995|114885|
|      5|Father of the Bri...|1995|113041|
|      6|                Heat|1995|113277|
|      7|             Sabrina|1995|114319|
|      8|        Tom and Huck|1995|112302|
|      9|

### Actors

In [0]:
# 'unique' counts distinct actor names.
# NOTE(review): the label/view name is 'actor' (singular) while every other
# summary uses the plural table name -- confirm whether 'actors' was intended.
summary(actors, 'actor', 'name')

== actor ==
+-----+-----+------+-------------+-------------------+------------+
|   df|count|unique|unique_movies|movies_without_data|avg_by_movie|
+-----+-----+------+-------------+-------------------+------------+
|actor|   40|    40|           10|                  0|         4.0|
+-----+-----+------+-------------+-------------------+------------+

root
 |-- movieId: integer (nullable = true)
 |-- name: string (nullable = true)

+-------+--------------+
|movieId|          name|
+-------+--------------+
|      1|     Tim Allen|
|      1|   Don Rickles|
|      1|    Jim Varney|
|      1|     Tom Hanks|
|      2| Kirsten Dunst|
|      2|Robin Williams|
|      2| Jonathan Hyde|
|      2|   Bonnie Hunt|
|      3|   Jack Lemmon|
|      3|Walter Matthau|
+-------+--------------+



In [0]:
# Genres: 'unique' counts distinct values of the genre column.
summary(genres, 'genres', 'genre')

== genres ==
+------+-----+------+-------------+-------------------+------------+
|    df|count|unique|unique_movies|movies_without_data|avg_by_movie|
+------+-----+------+-------------+-------------------+------------+
|genres|   25|    10|           10|                  0|         2.5|
+------+-----+------+-------------+-------------------+------------+

root
 |-- movieId: integer (nullable = true)
 |-- genre: string (nullable = true)

+-------+---------+
|movieId|    genre|
+-------+---------+
|      1|Adventure|
|      1|Animation|
|      1| Children|
|      1|   Comedy|
|      1|  Fantasy|
|      2|Adventure|
|      2| Children|
|      2|  Fantasy|
|      3|   Comedy|
|      3|  Romance|
+-------+---------+



In [0]:
# Ratings: 'unique' counts distinct values of the rating column, not distinct users.
summary(ratings, 'ratings', 'rating')

== ratings ==
+-------+-----+------+-------------+-------------------+------------+
|     df|count|unique|unique_movies|movies_without_data|avg_by_movie|
+-------+-----+------+-------------+-------------------+------------+
|ratings|  745|    10|           10|                  0|        74.5|
+-------+-----+------+-------------+-------------------+------------+

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)

+-------+------+------+
|movieId|userId|rating|
+-------+------+------+
|      1|     1|   4.0|
|      3|     1|   4.0|
|      6|     1|   4.0|
|      1|     5|   4.0|
|      2|     6|   4.0|
|      3|     6|   5.0|
|      4|     6|   3.0|
|      5|     6|   5.0|
|      6|     6|   4.0|
|      7|     6|   4.0|
+-------+------+------+



In [0]:
# Tags: 'unique' counts distinct values of the tag column.
summary(tags, 'tags', 'tag')

== tags ==
+----+-----+------+-------------+-------------------+------------+
|  df|count|unique|unique_movies|movies_without_data|avg_by_movie|
+----+-----+------+-------------+-------------------+------------+
|tags|   12|    10|            5|                  5|         1.2|
+----+-----+------+-------------+-------------------+------------+

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- tag: string (nullable = true)

+-------+------+----------------+
|movieId|userId|             tag|
+-------+------+----------------+
|      2|    62|         fantasy|
|      2|    62|magic board game|
|      2|    62|  Robin Williams|
|      3|   289|           moldy|
|      3|   289|             old|
|      1|   336|           pixar|
|      1|   474|           pixar|
|      2|   474|            game|
|      5|   474|       pregnancy|
|      5|   474|          remake|
+-------+------+----------------+

