## user2 can access dataset owned by user1 via AssumeRole Ceph STS Feature (without credentials sharing)

In [None]:
import boto3


## User2 has to add the ARN value of the role created by user1
Role_Arn = 'arn:aws:iam::odh:role/S3Access2'
endpoint_url = '10.0.111.122:8080'

# STS client authenticated with user2's own (long-lived) keys.
# NOTE: these are demo credentials; do not commit real secrets to a notebook.
user2_sts = boto3.client(
    'sts',
    aws_access_key_id='user2',
    aws_secret_access_key='user2pass',
    endpoint_url="http://" + endpoint_url,
    region_name='',
)

# Ask STS for temporary credentials bound to the role created by user1.
response = user2_sts.assume_role(
    RoleArn=Role_Arn,
    RoleSessionName='user2',
    DurationSeconds=3600,
)

# Read the temporary credentials once and reuse them everywhere below.
# The session token is exposed as well, because temporary credentials are
# only valid when the token accompanies the access/secret key pair.
credentials = response['Credentials']
aws_access_key_id = credentials['AccessKeyId']
aws_secret_access_key = credentials['SecretAccessKey']
aws_session_token = credentials['SessionToken']

# S3 client that acts with the assumed role instead of user2's own identity.
s3client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
    endpoint_url="http://" + endpoint_url,
    region_name='',
)

bucket_name = 'ufo-dataset'
#s3bucket = s3client.create_bucket(Bucket=bucket_name)
#print(s3client.list_buckets())

print("Getting dataset from user1 "+ bucket_name +" bucket, without sharing credentials \n")
# A single list_objects call returns at most one page of keys, and an empty
# bucket has no 'Contents' entry at all. Paginating and defaulting to an empty
# list handles both cases.
paginator = s3client.get_paginator('list_objects')
for page in paginator.paginate(Bucket=bucket_name):
    for key in page.get('Contents', []):
        print("File Found : "+key['Key'])
    
#print("\n Creating a new bucket from odh2user1 account")
#try:
#    s3bucket = s3client.create_bucket(Bucket='odh2user1-bucket')
#except ClientError as e:
#    print("\n Bucket already exists \n")

# Retrieve the list of existing buckets
#response = s3client.list_buckets()

# Output the bucket names
#print('Existing buckets:')
#for bucket in response['Buckets']:
#    print(f'  {bucket["Name"]}')

## Validating if user2 can access dataset

In [None]:
from pyspark.sql import SparkSession, SQLContext
import os
import socket

# Create a Spark session against the standalone cluster named by the
# SPARK_CLUSTER environment variable (a KeyError here means it is not set).
spark_cluster_url = f"spark://{os.environ['SPARK_CLUSTER']}:7077"
spark = SparkSession.builder.master(spark_cluster_url).getOrCreate()

# Test the Spark connection: run a tiny job and report which hosts executed it.
# The result is printed explicitly because a notebook only echoes the last
# expression of a cell.
worker_hosts = (
    spark.range(5, numPartitions=5)
    .rdd.map(lambda x: socket.gethostname())
    .distinct()
    .collect()
)
print("Spark job ran on hosts:", worker_hosts)

# Configure the S3A connector with the temporary credentials from AssumeRole.
# `response`, `aws_access_key_id` and `aws_secret_access_key` come from the
# STS cell above. Temporary credentials consist of three parts, so the session
# token and the temporary-credentials provider must be set as well.
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.endpoint", endpoint_url)
hadoopConf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider",
)
hadoopConf.set("fs.s3a.access.key", aws_access_key_id)
hadoopConf.set("fs.s3a.secret.key", aws_secret_access_key)
hadoopConf.set("fs.s3a.session.token", response['Credentials']['SessionToken'])
# NOTE(review): object stores reached through a bare IP:port endpoint usually
# need path-style access ("true"); left unchanged here - confirm against the
# gateway configuration.
hadoopConf.set("fs.s3a.path.style.access", "false")
hadoopConf.set("fs.s3a.connection.ssl.enabled", "false") # false if not https

# Read the CSV straight from the bucket owned by user1.
data = spark.read.csv('s3a://ufo-dataset/UFO_dataset_kaggle.csv', sep=",", header=True)

# Pull the dataset to the driver as a pandas DataFrame and preview it.
df = data.toPandas()
df.head()


## Dataset Analysis Preparation

In [None]:
import sys
!{sys.executable} -m pip install pyspark
!{sys.executable} -m pip install boto
!{sys.executable} -m pip install plotly
!{sys.executable} -m pip install plotly --upgrade
!{sys.executable} -m pip install chart_studio

# Dataset Analysis Test

In [None]:
## Print the schema (column names and types) of the Spark DataFrame `data`
## loaded from the CSV file.
data.printSchema()

# Question 1: What are the top 5 countries that reported UFO sightings?

In [None]:
import os

import chart_studio
import chart_studio.plotly as py
import plotly.graph_objects as go
from pyspark.sql.functions import desc

# Count sightings per country in Spark and keep only the five countries with
# the most reports, so the chart actually answers the "top 5" question.
plot1 = (
    data.groupBy("country")
    .count()
    .orderBy(desc("count"))
    .limit(5)
    .toPandas()
)

# Chart Studio credentials are read from the environment so that no API key
# is stored in the notebook. The username keeps its previous value as default.
chart_studio_username = os.environ.get('CHART_STUDIO_USERNAME', 'karasing')
chart_studio_api_key = os.environ.get('CHART_STUDIO_API_KEY')
if not chart_studio_api_key:
    raise RuntimeError(
        "Set the CHART_STUDIO_API_KEY environment variable before plotting"
    )
chart_studio.tools.set_credentials_file(
    username=chart_studio_username,
    api_key=chart_studio_api_key,
)

data1 = [go.Bar(
    x=plot1['country'],
    y=plot1['count'],
    width=0.8,
)]
# A dedicated filename keeps this chart from being overwritten by the other
# plots uploaded from this notebook.
py.iplot(data1, filename='ufo-top-countries')

# Question 2: Which are the top 20 cities that reported UFO sightings?

In [None]:
from pyspark.sql.functions import desc

# Count sightings per city, rank by count inside Spark and bring only the top
# 20 rows to the driver, rather than collecting every city and sorting in
# pandas.
plot2 = (
    data.groupBy("city")
    .count()
    .orderBy(desc("count"))
    .limit(20)
    .toPandas()
)
data2 = [go.Bar(
    x=plot2['city'],
    y=plot2['count'],
)]
# A dedicated filename keeps this chart from being overwritten by the other
# plots uploaded from this notebook.
py.iplot(data2, filename='ufo-top-cities')

# Question 3: What does a UFO look like?

In [None]:
from pyspark.sql.functions import desc

# Count sightings per reported shape, rank by count inside Spark and bring
# only the 20 most common shapes to the driver.
plot3 = (
    data.groupBy("shape")
    .count()
    .orderBy(desc("count"))
    .limit(20)
    .toPandas()
)
data3 = [go.Bar(
    x=plot3['shape'],
    y=plot3['count'],
)]
# A dedicated filename keeps this chart from being overwritten by the other
# plots uploaded from this notebook.
py.iplot(data3, filename='ufo-shapes')

# Question 4: Which are the top 10 cities reporting the UFO shape as "light"?

In [None]:
from pyspark.sql.functions import desc

import pandas as pd
# Keep only "light" sightings, count them per city, and bring just the ten
# highest counts to the driver. Filtering first avoids aggregating the other
# shapes, and data["shape"] selects the column explicitly instead of relying
# on attribute lookup. The "shape" column stays in the grouping so plot4 keeps
# the same columns as before.
plot4 = (
    data.filter(data["shape"] == 'light')
    .groupBy("city", "shape")
    .count()
    .orderBy(desc("count"))
    .limit(10)
    .toPandas()
)
data4 = [go.Bar(
    x=plot4['city'],
    y=plot4['count'],
)]
# A dedicated filename keeps this chart from being overwritten by the other
# plots uploaded from this notebook.
py.iplot(data4, filename='ufo-light-top-cities')