# Other Data Types
In this lecture we will show how to handle different data formats in Python. Pandas will be a huge help during this lecture and the exercises that follow. We will cover the following topics:
- Excel
- JSON
- AWS S3 

## Excel

In [None]:
import pandas as pd

We can find the metadata [here](https://github.com/jurajkapasny/intro-to-python-programming/blob/master/data/metadata.xlsx).

In [None]:
# relative path to the Excel workbook used throughout this section
excel_file = 'data/metadata.xlsx'
# read the workbook into a DataFrame; no sheet is specified here, so pandas uses its default sheet
metadata = pd.read_excel(excel_file)

In [None]:
# preview the first rows of the loaded sheet
metadata.head()

In [None]:
# we can specify which sheet we want by position: sheet_name=0 selects the first sheet
# (positions are zero-based)
metadata_sheet1 = pd.read_excel(excel_file, sheet_name=0)
metadata_sheet1.head()

In [None]:
# if we want the second sheet, we pass its zero-based position: sheet_name=1
metadata_sheet2 = pd.read_excel(excel_file, sheet_name=1)
metadata_sheet2.head()

In [None]:
# load a sheet by its name instead of its position
# NOTE(review): the variable is called `movies_sheet3` although it holds the
# 'TAG_MAPPING' sheet of the metadata workbook — consider a more fitting name
movies_sheet3 = pd.read_excel(excel_file, sheet_name='TAG_MAPPING')
movies_sheet3.head()

In [None]:
# create an ExcelFile object for the workbook; the following cells use it
# to list the sheet names and to parse individual sheets
xlsx = pd.ExcelFile(excel_file)

In [None]:
# show the names of all sheets available in the workbook
xlsx.sheet_names

In [None]:
# load all sheets into a dictionary: each key is a sheet name and each value is
# the DataFrame parsed from that sheet
data = {sheet_name: xlsx.parse(sheet_name) for sheet_name in xlsx.sheet_names}

In [None]:
# show the dict keys (one key per loaded sheet)
data.keys()

In [None]:
# get the 'LOCATIONS' sheet from the dictionary
# (this displays the whole DataFrame; append .head() for a short preview)
data['LOCATIONS']

In [None]:
# NOTE(review): `movies` is not defined anywhere in the visible notebook — it must be
# loaded (a DataFrame with 'Title' and 'Gross Earnings' columns) before this cell can run
# sort the rows by gross earnings, highest first
sorted_by_gross = movies.sort_values(['Gross Earnings'], ascending=False)
# show only the title and earnings columns of the sorted frame
sorted_by_gross[['Title','Gross Earnings']]

In [None]:
# NOTE(review): relies on `movies`, which is not defined in the visible notebook
# Keep only the grouping columns and the earnings, aggregate 'Gross Earnings' per
# (Country, Language) pair using pivot_table's default aggregation (presumably the
# mean — confirm), then unstack so one index level becomes columns and fill the
# combinations that have no data with 0.
earnings_by_co_lang = (movies[['Country', 'Language', 'Gross Earnings']]
                       .pivot_table(index=['Country', 'Language'])
                       .unstack()
                       .fillna(0))

# preview the resulting table
earnings_by_co_lang.head()

In [None]:
# Export back to Excel; index=False leaves the DataFrame index out of the file
# NOTE(review): relies on `movies`, which is not defined in the visible notebook
movies.to_excel('data/all_movies.xlsx', index=False)

## JSON

In [None]:
# generate JSON
import json

# Build a plain Python dictionary of employee records, stored column-wise:
# each key is a column name and each value is a list with one entry per employee.
# All values are kept as strings.
data = {
    "Sub_ID": ["1", "2", "3", "4", "5", "6", "7", "8"],
    "Name": [
        "Erik", "Daniel", "Michael", "Sven",
        "Gary", "Carol", "Lisa", "Elisabeth",
    ],
    "Salary": [
        "723.3", "515.2", "621", "731",
        "844.15", "558", "642.8", "732.5",
    ],
    "StartDate": [
        "1/1/2011", "7/23/2013", "12/15/2011", "6/11/2013",
        "3/27/2011", "5/21/2012", "7/30/2013", "6/17/2014",
    ],
    "Department": [
        "IT", "Manegement", "IT", "HR",
        "Finance", "IT", "Manegement", "IT",
    ],
    "Sex": ["M", "M", "M", "M", "M", "F", "F", "F"],
}

print(data)

In [None]:
# confirm that `data` is a plain Python dict
print(type(data))

In [None]:
# saving from python to JSON file
import json

# Serialize `data` to JSON and write it to data/data.json
# (mode 'w' overwrites the file if it already exists)
with open('data/data.json', 'w') as outfile:
    json.dump(data, outfile)

In [None]:
# Read the JSON file back and parse it into Python objects
with open('data/data.json') as json_file:
    data = json.load(json_file)

print(data)

#### Pandas

In [None]:
import pandas as pd

# Read the JSON file directly into a DataFrame with pandas
df = pd.read_json('data/data.json')
df

In [None]:
# now we can store the DataFrame as excel/csv; index=False leaves the index out of the file
df.to_csv("data/data.csv", index=False)

#### Nested json

In [None]:
# define nested, JSON-like data (a Python list of dicts, not a JSON string):
# each state has a nested 'info' dict and a 'counties' list of dicts
data = [{"state": "Florida", 
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [{"name": "Dade", "population": 12345},
                     {"name": "Broward", "population": 40000},
                     {"name": "Palm Beach", "population": 60000}]},
       {"state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "counties": [{"name": "Summit", "population": 1234},
                     {"name": "Cuyahoga", "population": 1337}]}]

In [None]:
# flatten the records into a table, one row per state; nested dicts such as 'info'
# are expanded into dotted column names, while the 'counties' lists stay in one column
pd.json_normalize(data)

In [None]:
# flatten the nested 'counties' lists: one row per county, with the fields listed
# in `meta` (taken from the parent state record) carried along as extra columns
pd.json_normalize(
    data=data,
    record_path='counties',
    meta=['state', 'shortname', ['info', 'governor']],
)

# Stretch

### AWS S3

In [None]:
## Custom helper class for working with data stored in S3

import boto3
import json
import io


class S3:
    """
    Collection of static helpers wrapping common S3 operations of the boto3 client.

    Every public helper accepts an optional ``credentials`` dict whose items are
    passed as keyword arguments to ``boto3.client`` (e.g. ``aws_access_key_id``,
    ``aws_secret_access_key``). When it is omitted, the client is created without
    explicit credentials.
    """

    @staticmethod
    def get_client(credentials=None):
        """
        Returns S3 client

        Params:
            credentials (dict or None): keyword arguments for boto3.client

        Returns:
            boto3 S3 client
        """
        if credentials:
            return boto3.client('s3', **credentials)
        return boto3.client('s3')

    @staticmethod
    def _raise_for_status(response, expected_status=200):
        """
        Raises Exception(response) when the HTTP status code of a boto3 response
        differs from the expected one
        """
        if response['ResponseMetadata']['HTTPStatusCode'] != expected_status:
            raise Exception(response)

    @staticmethod
    def create_bucket(bucket_name, location='eu-west-1', credentials=None):
        """
        Creates bucket

        Params:
            bucket_name (str): name of bucket
            location (str): S3 region
            credentials (dict or None): keyword arguments for boto3.client

        Raises:
            Exception: if the response status code is not 200
        """
        client = S3.get_client(credentials)

        request = {'Bucket': bucket_name}
        # 'us-east-1' is the S3 default region and is rejected when sent as an
        # explicit LocationConstraint, so the configuration is only added for
        # other regions.
        if location and location != 'us-east-1':
            request['CreateBucketConfiguration'] = {'LocationConstraint': location}

        response = client.create_bucket(**request)
        S3._raise_for_status(response)

        print(f'Bucket "{bucket_name}" created!')

    @staticmethod
    def delete_bucket(bucket_name, credentials=None):
        """
        Delete empty (!!!) bucket

        Params:
            bucket_name (str): name of bucket
            credentials (dict or None): keyword arguments for boto3.client

        Raises:
            Exception: if the response status code is not 204
        """
        client = S3.get_client(credentials)
        response = client.delete_bucket(Bucket=bucket_name)
        S3._raise_for_status(response, expected_status=204)

        print(f'Bucket "{bucket_name}" deleted!')

    @staticmethod
    def create_bucket_if_not_exists(bucket_name, credentials=None, location='eu-west-1'):
        """
        If bucket with bucket_name does not exist => creates new one

        Params:
            bucket_name (str): name of bucket
            credentials (dict or None): keyword arguments for boto3.client
            location (str): S3 region used when the bucket has to be created
        """
        # credentials must be forwarded to both calls, otherwise the lookup and
        # the creation would run against a different (default) account
        available_buckets = S3.get_buckets(credentials=credentials)
        if bucket_name not in available_buckets:
            S3.create_bucket(bucket_name, location=location, credentials=credentials)

    @staticmethod
    def get_buckets(with_creation_date=False, credentials=None):
        """
        Returns all available buckets names

        Params:
            with_creation_date (bool): if True => return also creation date of buckets
            credentials (dict or None): keyword arguments for boto3.client

        Returns:
            list with names or list with dictionaries containing buckets info

        Raises:
            Exception: if the response status code is not 200
        """
        client = S3.get_client(credentials)
        response = client.list_buckets()
        S3._raise_for_status(response)

        if with_creation_date:
            return response['Buckets']

        return [bucket['Name'] for bucket in response['Buckets']]

    @staticmethod
    def create_json_in_bucket_if_not_exists(bucket_name, file_name, initial_json=None, credentials=None):
        """
        Creates json in bucket if json does not exist

        Params:
            bucket_name (str): name of bucket where to store file
            file_name (str): path to file
            initial_json (None or dumped json): json to store, defaults to an empty object
            credentials (dict or None): keyword arguments for boto3.client
        """
        # prefix search may return other keys that merely start with file_name,
        # hence the exact membership test below
        filenames = S3.get_all_objects_from_bucket(bucket_name=bucket_name,
                                                   prefix=file_name,
                                                   only_keys=True,
                                                   credentials=credentials)
        if file_name not in filenames:
            if initial_json is None:
                initial_json = json.dumps({})

            S3.store_file_in_bucket(bucket_name=bucket_name,
                                    file_name=file_name,
                                    file=initial_json,
                                    credentials=credentials)

    @staticmethod
    def store_file_in_bucket(bucket_name, file_name, file, credentials=None):
        """
        Stores file in bucket

        Params:
            bucket_name (str): name of bucket where to store file
            file_name (str): path to file
            file (dumped json or binary): file to store
            credentials (dict or None): keyword arguments for boto3.client

        Raises:
            Exception: if the response status code is not 200
        """
        client = S3.get_client(credentials)
        response = client.put_object(Bucket=bucket_name,
                                     Key=file_name,
                                     Body=file)
        S3._raise_for_status(response)

        print(f'"{file_name}" succcesfully stored in "{bucket_name}" bucket!')

    @staticmethod
    def get_file_from_bucket(bucket_name, file_name, as_json=False, credentials=None):
        """
        Get file from bucket

        Params:
            bucket_name (str): name of bucket where the file is stored
            file_name (str): path to file
            as_json (bool): if True => convert response body to json
            credentials (dict or None): keyword arguments for boto3.client

        Returns:
            parsed json or io.BytesIO object with the file content

        Raises:
            Exception: if the response status code is not 200
        """
        client = S3.get_client(credentials)
        response = client.get_object(Bucket=bucket_name,
                                     Key=file_name)
        S3._raise_for_status(response)

        body = response['Body'].read()

        if as_json:
            return json.loads(body)

        return io.BytesIO(body)

    @staticmethod
    def get_all_objects_from_bucket(bucket_name, prefix='', only_keys=True, credentials=None):
        """
        Get all objects from bucket

        Params:
            bucket_name (str): name of bucket to search in
            prefix (str): file filter
            only_keys (bool): if True => returns only filenames
            credentials (dict or None): keyword arguments for boto3.client

        Returns:
            list with filenames or list with dictionaries containing files info

        Raises:
            Exception: if any response status code is not 200
        """
        client = S3.get_client(credentials)

        kwargs = {
            'Bucket': bucket_name,
            'Prefix': prefix,
        }

        data = []
        # keep requesting pages until the response carries no continuation token
        while True:
            response = client.list_objects_v2(**kwargs)
            S3._raise_for_status(response)

            contents = response.get('Contents', [])
            if only_keys:
                data += [c.get('Key') for c in contents]
            else:
                data += contents

            if 'NextContinuationToken' not in response:
                break
            kwargs['ContinuationToken'] = response['NextContinuationToken']

        return data

In [None]:
# setup credentials
# Real keys should never be typed into (and saved with) the notebook, so they are
# read from the environment variables AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
# The placeholder texts are only used when the variables are not set.
import os

AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', 'your aws acces key id here')
AWS_SECRET_ACCES_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', 'your aws secret acces key here')

# credentials as dictionary; the keys match the keyword arguments of boto3.client
credentials = {
    'aws_access_key_id': AWS_ACCESS_KEY_ID,
    'aws_secret_access_key': AWS_SECRET_ACCES_KEY
}

In [None]:
# list the names of all available buckets
S3.get_buckets(credentials=credentials, with_creation_date=False)

In [None]:
# list all available buckets with creation dates
# (returns the bucket info dictionaries instead of just the names)
S3.get_buckets(credentials=credentials, with_creation_date=True)

In [None]:
import pandas as pd

# transform buckets info into a pandas DataFrame (one row per bucket info dictionary)
df_buckets = pd.DataFrame(S3.get_buckets(credentials=credentials, with_creation_date=True))
df_buckets

In [None]:
# create bucket (no location is passed, so the method's default 'eu-west-1' is used)
S3.create_bucket(credentials=credentials, bucket_name = 'spgdc-python-training')

In [None]:
# delete bucket
#S3.delete_bucket(credentials=credentials, bucket_name = 'spgdc-python-training')

In [None]:
# create test data: a list with one small record (dict) per person
test_data = [{'name': person} for person in ('Magnus', 'Matus')]

In [None]:
# store test_data in bucket: the list is serialized to a JSON string first
# and uploaded under the key 'test.json'
S3.store_file_in_bucket(
    credentials = credentials,
    bucket_name='spgdc-python-training',
    file_name='test.json',
    file = json.dumps(test_data)
)

In [None]:
# search for objects in bucket: the empty prefix applies no filter,
# and only_keys=True returns just the object keys (filenames)
S3.get_all_objects_from_bucket(
    credentials = credentials,
    bucket_name='spgdc-python-training',
    prefix ='',
    only_keys=True
)

In [None]:
# store multiple files in s3: test_0.json ... test_11.json
for i in range(12):
    # payload of each file: its own filename and its index
    data = {
        'filename': f'test_{i}.json', 
        'index': i
    }
    
    # upload the payload as a JSON string under the key test_<i>.json
    S3.store_file_in_bucket(
        credentials = credentials,
        bucket_name='spgdc-python-training',
        file_name=f'test_{i}.json',
        file = json.dumps(data)
    )

In [None]:
# search for objects in bucket whose key starts with 'test_1'
# (of the files stored above this should match test_1.json, test_10.json and test_11.json)
S3.get_all_objects_from_bucket(
    credentials = credentials,
    bucket_name='spgdc-python-training',
    prefix ='test_1',
    only_keys=True
)

In [None]:
# load file from bucket; as_json=True parses the body, so `file` holds the
# decoded JSON content rather than a file object
file = S3.get_file_from_bucket(
        credentials=credentials,
        bucket_name='spgdc-python-training', 
        file_name='test_0.json', 
        as_json=True
        )
file

In [None]:
# list comprehension (advanced python): download test_0.json ... test_11.json,
# parse each one as JSON and build a DataFrame with one row per file
pd.DataFrame([
    S3.get_file_from_bucket(
        credentials=credentials,
        bucket_name='spgdc-python-training',
        file_name=f'test_{i}.json',
        as_json=True,
    )
    for i in range(12)
])
    