# Defining useful functions

In [None]:
import pandas as pd

def mountFileSystem(containerName, storageAccountName):
  """Mount an ADLS Gen2 container at /mnt/adls/<containerName> and return that path.

  Args:
    containerName: Name of the container (file system) to mount; it is also
      used as the last segment of the mount point.
    storageAccountName: Name of the storage account that hosts the container.

  Returns:
    The mount point string, "/mnt/adls/" + containerName.

  Raises:
    Exception: Re-raises whatever dbutils.fs.mount raises when the mount
      point does not exist yet and mounting fails (for example bad
      credentials), instead of reporting it as "already exists".
  """
  # NOTE: the <...> placeholders must be filled in before running. Prefer
  # reading the client secret from a secret scope over pasting it here.
  configs = {"fs.azure.account.auth.type": "OAuth",
       "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
       "fs.azure.account.oauth2.client.id": "<Service Principal Application ID>",
       "fs.azure.account.oauth2.client.secret": "<Service Principal Application Secret (or password)>",
       "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/<Service Principal Tenant ID>/oauth2/token",
       "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

  mountPoint = "/mnt/adls/" + containerName

  # Detect an existing mount explicitly rather than inferring it from any
  # exception, so genuine failures are not mislabelled as "already exists".
  if any(mount.mountPoint == mountPoint for mount in dbutils.fs.mounts()):
    print("The mount " + mountPoint + " already exists.")
    return mountPoint

  try:
    dbutils.fs.mount(
      source = "abfss://" + containerName + "@" + storageAccountName + ".dfs.core.windows.net",
      mount_point = mountPoint,
      extra_configs = configs
    )
  except Exception as error:
    # Surface the real cause; returning an unmounted path would only move
    # the failure to the first read.
    print("Failed to mount " + mountPoint + ": " + str(error))
    raise

  print(mountPoint + " mounted successfully")
  return mountPoint

def saveAsParquet(dataFrame, filePath):
  """Convert dataFrame to a Spark DataFrame and write it to filePath as parquet.

  Existing output at filePath is overwritten. A confirmation line is printed
  once the write has returned.

  Args:
    dataFrame: Data accepted by sqlContext.createDataFrame (the callers in
      this notebook are expected to pass pandas DataFrames).
    filePath: Destination path of the parquet output.
  """
  writer = sqlContext.createDataFrame(dataFrame).write
  writer.mode('overwrite').parquet(filePath)

  print(filePath + " saved successfully")

# Mount the "southridge" container once; every later cell builds its paths
# from mountPoint. Replace the storage account placeholder before running.
container = "southridge"
storage_account = "<storage account name>"
mountPoint = mountFileSystem(container, storage_account)
print(mountPoint)

# Loading Southridge Customers

In [None]:
# Locations of the two Southridge customer extracts (cloud sales and cloud streaming).
sr_sales_customers_parquet = mountPoint + "/raw/cloudsales/Customers.parquet"
sr_streaming_customers_parquet = mountPoint + "/raw/cloudstreaming/Customers.parquet"

# Read each extract with Spark and convert it straight to pandas.
sr_sales_customers = sqlContext.read.parquet(sr_sales_customers_parquet).toPandas()
sr_streaming_customers = sqlContext.read.parquet(sr_streaming_customers_parquet).toPandas()

# Stack both extracts into one frame.
sr_customers_frame = [sr_sales_customers, sr_streaming_customers]
sr_customers = pd.concat(sr_customers_frame)

# errors='coerce' turns unparseable dates into NaT instead of raising.
for date_column in ('CreatedDate', 'UpdatedDate'):
  sr_customers[date_column] = pd.to_datetime(sr_customers[date_column], errors='coerce')

sr_customers['PhoneNumber'] = sr_customers['PhoneNumber'].astype(str)

display(sr_customers)

# Loading VanArsdel Customers

In [None]:
# VanArsdel ships customers and their addresses together in one multi-line JSON file.
va_customers_filePath = mountPoint + "/raw/vanarsdel/Customers.json"

# NOTE(review): header/inferschema look like CSV reader options and are
# presumably ignored by the JSON reader - confirm before removing them.
va_customers_raw = spark.read.option("multiLine", "true").options(header='true', inferschema='true').json(va_customers_filePath)
va_customers_pd = va_customers_raw.toPandas()

# Take an explicit copy of the customer columns: assigning columns onto a
# bare selection of va_customers_pd is chained assignment and triggers
# pandas' SettingWithCopyWarning.
va_customers = va_customers_pd[['CustomerID', 'LastName', 'FirstName', 'PhoneNumber', 'CreatedDate', 'UpdatedDate']].copy()

# errors='coerce' turns unparseable dates into NaT instead of raising.
va_customers['CreatedDate'] = pd.to_datetime(va_customers['CreatedDate'], errors='coerce')
va_customers['UpdatedDate'] = pd.to_datetime(va_customers['UpdatedDate'], errors='coerce')

va_customers['PhoneNumber'] = va_customers['PhoneNumber'].astype(str)

display(va_customers)

# Loading FourthCoffee Customers

In [None]:
# pandas reads through the local /dbfs view of the mount, hence the "/dbfs" prefix.
fc_customers_filePath = "/dbfs" + mountPoint + "/raw/fourthcoffee/Customers.csv"

fc_customers_pd = pd.read_csv(fc_customers_filePath)

# Take an explicit copy of the customer columns: assigning columns onto a
# bare selection of fc_customers_pd is chained assignment and triggers
# pandas' SettingWithCopyWarning.
fc_customers = fc_customers_pd[['CustomerID', 'LastName', 'FirstName', 'PhoneNumber', 'CreatedDate', 'UpdatedDate']].copy()

# errors='coerce' turns unparseable dates into NaT instead of raising.
fc_customers['CreatedDate'] = pd.to_datetime(fc_customers['CreatedDate'], errors='coerce')
fc_customers['UpdatedDate'] = pd.to_datetime(fc_customers['UpdatedDate'], errors='coerce')
fc_customers['PhoneNumber'] = fc_customers['PhoneNumber'].astype(str)

display(fc_customers)

# Loading Southridge Addresses

In [None]:
# Locations of the two Southridge address extracts (cloud sales and cloud streaming).
sr_sales_addresses_parquet = mountPoint + "/raw/cloudsales/Addresses.parquet"
sr_streaming_addresses_parquet = mountPoint + "/raw/cloudstreaming/Addresses.parquet"

# Read each extract with Spark and convert it straight to pandas.
sr_sales_addresses = sqlContext.read.parquet(sr_sales_addresses_parquet).toPandas()
sr_streaming_addresses = sqlContext.read.parquet(sr_streaming_addresses_parquet).toPandas()

# Stack both extracts into one frame.
sr_addresses_frame = [sr_sales_addresses, sr_streaming_addresses]
sr_addresses = pd.concat(sr_addresses_frame)

# errors='coerce' turns unparseable dates into NaT instead of raising.
for date_column in ('CreatedDate', 'UpdatedDate'):
  sr_addresses[date_column] = pd.to_datetime(sr_addresses[date_column], errors='coerce')

# Store these two columns as strings.
for text_column in ('AddressLine2', 'ZipCode'):
  sr_addresses[text_column] = sr_addresses[text_column].astype(str)

display(sr_addresses)

# Loading VanArsdel Addresses

In [None]:
# VanArsdel addresses come from the same JSON file as its customers; the file
# is read again here so this cell can run on its own.
va_customers_filePath = mountPoint + "/raw/vanarsdel/Customers.json"

# NOTE(review): header/inferschema look like CSV reader options and are
# presumably ignored by the JSON reader - confirm before removing them.
va_customers_raw = spark.read.option("multiLine", "true").options(header='true', inferschema='true').json(va_customers_filePath)
va_customers_pd = va_customers_raw.toPandas()

# Take an explicit copy of the address columns: assigning columns onto a
# bare selection of va_customers_pd is chained assignment and triggers
# pandas' SettingWithCopyWarning.
va_addresses = va_customers_pd[['CustomerID', 'AddressLine1', 'AddressLine2', 'City', 'State', 'ZipCode', 'CreatedDate', 'UpdatedDate']].copy()

# errors='coerce' turns unparseable dates into NaT instead of raising.
va_addresses['CreatedDate'] = pd.to_datetime(va_addresses['CreatedDate'], errors='coerce')
va_addresses['UpdatedDate'] = pd.to_datetime(va_addresses['UpdatedDate'], errors='coerce')

va_addresses['AddressLine2'] = va_addresses['AddressLine2'].astype(str)
va_addresses['ZipCode'] = va_addresses['ZipCode'].astype(str)

# This source has no address id; add a leading 'AddressID' column filled with
# the string 'None' so the frame has the same columns as the other sources.
va_addresses.insert(0, 'AddressID', 'None')

display(va_addresses)

# Loading FourthCoffee Addresses

In [None]:
# FourthCoffee addresses come from the same CSV file as its customers; the
# file is read again here so this cell can run on its own.
fc_customers_filePath = "/dbfs" + mountPoint + "/raw/fourthcoffee/Customers.csv"

fc_customers_pd = pd.read_csv(fc_customers_filePath)

# Take an explicit copy of the address columns: assigning columns (and the
# .loc write below) onto a bare selection of fc_customers_pd is chained
# assignment and triggers pandas' SettingWithCopyWarning.
fc_addresses = fc_customers_pd[['CustomerID', 'AddressLine1', 'AddressLine2', 'City', 'State', 'ZipCode', 'CreatedDate', 'UpdatedDate']].copy()

# errors='coerce' turns unparseable dates into NaT instead of raising.
fc_addresses['CreatedDate'] = pd.to_datetime(fc_addresses['CreatedDate'], errors='coerce')
fc_addresses['UpdatedDate'] = pd.to_datetime(fc_addresses['UpdatedDate'], errors='coerce')
fc_addresses['AddressLine2'] = fc_addresses['AddressLine2'].astype(str)
fc_addresses['ZipCode'] = fc_addresses['ZipCode'].astype(str)

# astype(str) renders missing values as the text 'nan'; replace it with the
# 'None' marker used for absent values elsewhere in this notebook.
fc_addresses.loc[fc_addresses.AddressLine2 == 'nan', 'AddressLine2'] = 'None'

# This source has no address id; add a leading 'AddressID' column filled with
# the string 'None'.
fc_addresses.insert(0, 'AddressID', 'None')

display(fc_addresses)

# Loading Southridge Movies and Actors

In [None]:
import numpy as np

# The Southridge movie catalog is a multi-line JSON file in which each movie
# carries its own list of actors.
sr_movies_filepath = mountPoint + "/raw/moviescatalog/movies.json"

sr_movies_raw = (
  spark.read
  .option("multiLine", "true")
  .options(header='true', inferschema='true')
  .json(sr_movies_filepath)
)
sr_movies_pd = sr_movies_raw.toPandas()

sr_movies_pd = sr_movies_pd[['actors', 'availabilityDate', 'genre', 'id', 'rating', 'releaseYear', 'runtime', 'streamingAvailabilityDate', 'tier', 'title']]

# Split into an (id, actors) frame and a frame with the movie attributes.
movieactors = sr_movies_pd[['id', 'actors']]
movies = sr_movies_pd[['id', 'title', 'genre', 'availabilityDate', 'rating', 'releaseYear', 'runtime', 'streamingAvailabilityDate', 'tier']]

# Flatten the per-movie actor lists into one row per (movie id, actor):
# repeat every id once per actor and pair it with the concatenated actors.
actorslist = movieactors.actors.values.tolist()
actorcountbymovie = list(map(len, actorslist))
explodedmovieids = np.repeat(movieactors.id, actorcountbymovie)

movieactors = pd.DataFrame(
  np.column_stack((explodedmovieids, np.concatenate(actorslist))),
  columns=movieactors.columns,
)

# Re-attach the movie attributes to every (movie id, actor) row.
sr_all_moviesactorsactors = pd.merge(movies, movieactors, on='id')

# Rename the source columns to the shared column names.
sr_all_moviesactorsactors = sr_all_moviesactorsactors.rename(
  index=str,
  columns={
    'id': 'MovieID',
    'title': 'MovieTitle',
    'genre': 'Genre',
    'availabilityDate': 'AvailabilityDate',
    'rating': 'Rating',
    'releaseYear': 'ReleaseYear',
    'runtime': 'RuntimeMin',
    'streamingAvailabilityDate': 'StreamingAvailabilityDate',
    'tier': 'Tier',
    'actors': 'ActorName',
  },
)

# Columns this source does not provide are filled with the string 'None'.
for placeholder_column in ('ActorID', 'MovieActorID', 'ActorGender', 'ReleaseDate'):
  sr_all_moviesactorsactors[placeholder_column] = 'None'

for text_column in ('ReleaseYear', 'Tier', 'RuntimeMin'):
  sr_all_moviesactorsactors[text_column] = sr_all_moviesactorsactors[text_column].astype(str)

# Put the columns into the shared order.
sr_all_moviesactorsactors = sr_all_moviesactorsactors[['MovieID', 'MovieTitle', 'Genre', 'ReleaseDate', 'AvailabilityDate', 'StreamingAvailabilityDate', 'ReleaseYear', 'Tier', 'Rating', 'RuntimeMin', 'MovieActorID', 'ActorID', 'ActorName', 'ActorGender']]

# Loading VanArsdel Movies and Actors

In [None]:
# VanArsdel keeps movies, actors and the movie-to-actor links in separate JSON files.
va_movies_filepath = mountPoint + "/raw/vanarsdel/Movies.json"
va_actors_filepath = mountPoint + "/raw/vanarsdel/Actors.json"
va_movieactors_filepath = mountPoint + "/raw/vanarsdel/MovieActors.json"

va_movies_raw = (
  spark.read
  .option("multiLine", "true")
  .options(header='true', inferschema='true')
  .json(va_movies_filepath)
)
va_actors_raw = (
  spark.read
  .option("multiLine", "true")
  .options(header='true', inferschema='true')
  .json(va_actors_filepath)
)
va_movieactors_raw = (
  spark.read
  .option("multiLine", "true")
  .options(header='true', inferschema='true')
  .json(va_movieactors_filepath)
)

va_movies_pd = va_movies_raw.toPandas()
va_actors_pd = va_actors_raw.toPandas()
va_movieactors_pd = va_movieactors_raw.toPandas()

# Join the link rows to their movie, then to their actor.
va_all_movies = pd.merge(va_movieactors_pd, va_movies_pd, on='MovieID')
va_all_movies = pd.merge(va_all_movies, va_actors_pd, on='ActorID')

# Rename the source columns to the shared column names.
va_all_movies = va_all_movies.rename(
  index=str,
  columns={
    'Category': 'Genre',
    'RunTimeMin': 'RuntimeMin',
    'Gender': 'ActorGender',
  },
)

# Columns this source does not provide are filled with the string 'None'.
for placeholder_column in ('AvailabilityDate', 'StreamingAvailabilityDate', 'ReleaseYear', 'Tier'):
  va_all_movies[placeholder_column] = 'None'

va_all_movies['RuntimeMin'] = va_all_movies['RuntimeMin'].astype(str)

# Put the columns into the shared order.
va_all_movies = va_all_movies[['MovieID', 'MovieTitle', 'Genre', 'ReleaseDate', 'AvailabilityDate', 'StreamingAvailabilityDate', 'ReleaseYear', 'Tier', 'Rating', 'RuntimeMin', 'MovieActorID', 'ActorID', 'ActorName', 'ActorGender']]

va_all_movies.dtypes

# Loading FourthCoffee Movies and Actors

In [None]:
# FourthCoffee keeps movies, actors and the movie-to-actor links in separate
# CSV files, read by pandas through the local /dbfs view of the mount.
fc_movies_filepath = "/dbfs/" + mountPoint + "/raw/fourthcoffee/Movies.csv"
fc_actors_filepath = "/dbfs/" + mountPoint + "/raw/fourthcoffee/Actors.csv"
fc_movieactors_filepath = "/dbfs/" + mountPoint + "/raw/fourthcoffee/MovieActors.csv"

fc_movies_pd = pd.read_csv(fc_movies_filepath)
fc_actors_pd = pd.read_csv(fc_actors_filepath)
fc_movieactors_pd = pd.read_csv(fc_movieactors_filepath)

# Join the link rows to their movie, then to their actor.
fc_all_moviesactors = fc_movieactors_pd.merge(fc_movies_pd, on='MovieID')
fc_all_moviesactors = fc_all_moviesactors.merge(fc_actors_pd, on='ActorID')

# Rename the source columns to the shared column names.
fc_all_moviesactors = fc_all_moviesactors.rename(
  index=str,
  columns={
    'Category': 'Genre',
    'RunTimeMin': 'RuntimeMin',
    'Gender': 'ActorGender',
  },
)

# Columns this source does not provide are filled with the string 'None'.
for placeholder_column in ('AvailabilityDate', 'StreamingAvailabilityDate', 'ReleaseYear', 'Tier'):
  fc_all_moviesactors[placeholder_column] = 'None'

fc_all_moviesactors['RuntimeMin'] = fc_all_moviesactors['RuntimeMin'].astype(str)

# Put the columns into the shared order.
fc_all_moviesactors = fc_all_moviesactors[['MovieID', 'MovieTitle', 'Genre', 'ReleaseDate', 'AvailabilityDate', 'StreamingAvailabilityDate', 'ReleaseYear', 'Tier', 'Rating', 'RuntimeMin', 'MovieActorID', 'ActorID', 'ActorName', 'ActorGender']]

display(fc_all_moviesactors)

# Bringing all Customers together

In [None]:
# Tag each per-source customer frame with the system it came from, then stack
# the three frames into one.
customers_frame = [sr_customers, va_customers, fc_customers]
for frame, source_system in zip(customers_frame, ('southridge', 'vanarsdel', 'fourthcoffee')):
  frame['SourceSystem'] = source_system

all_customers = pd.concat(customers_frame)

display(all_customers)

# Bringing all Addresses together

In [None]:
# Tag each per-source address frame with the system it came from, then stack
# the three frames into one.
addresses_frame = [sr_addresses, va_addresses, fc_addresses]
for frame, source_system in zip(addresses_frame, ('southridge', 'vanarsdel', 'fourthcoffee')):
  frame['SourceSystem'] = source_system

all_addresses = pd.concat(addresses_frame)

display(all_addresses)

# Bringing all Movies and Actors together

In [None]:
# Tag each per-source movies/actors frame with the system it came from, then
# stack the three frames into one.
# The loading cells build the Southridge frame as sr_all_moviesactorsactors
# and the VanArsdel frame as va_all_movies; use those names here, because
# sr_all_moviesactors / va_all_moviesactors are never defined and would raise
# NameError.
sr_all_moviesactorsactors['SourceSystem'] = 'southridge'
va_all_movies['SourceSystem'] = 'vanarsdel'
fc_all_moviesactors['SourceSystem'] = 'fourthcoffee'

moviesactors_frame = [sr_all_moviesactorsactors, va_all_movies, fc_all_moviesactors]

all_moviesactors = pd.concat(moviesactors_frame)

display(all_moviesactors)

# Saving the data as parquet to the data lake

In [None]:
# Template cell: it is not runnable as written. Replace both <...> placeholders
# before executing - the one in the path names the output file, and the one in
# the call must name a combined frame (presumably all_customers, all_addresses
# or all_moviesactors from the cells above). Run once per business object.
parquet_path = mountPoint + '/parquet/<Business Object name>.parquet'

# saveAsParquet converts the frame with Spark and overwrites existing output.
saveAsParquet(all_<business object name>, parquet_path)