In [26]:
!pip install --user boto3
!pip install --user pyarrow
!pip install --user fastparquet



In [27]:
import boto3
import io
import pandas as pd
import os

In [None]:
# Connection settings for the source MinIO server (S3-compatible API).
# NOTE(review): credentials are hard-coded; consider moving them to
# environment variables before sharing this notebook.
minio_credentials = {
    'region': 'us-east-1',
    'endpoint': 'http://10.183.16.169:9000',
    'use_ssl': False,
    'url_style': 'path',
    'access_key': 'readonly',
    'secret_key': 'readonly'
}

# Create an S3 client. The declared 'url_style' is now actually passed on as
# the S3 addressing style instead of being silently ignored.
s3 = boto3.client(
    's3',
    region_name=minio_credentials['region'],
    endpoint_url=minio_credentials['endpoint'],
    aws_access_key_id=minio_credentials['access_key'],
    aws_secret_access_key=minio_credentials['secret_key'],
    use_ssl=minio_credentials['use_ssl'],
    config=boto3.session.Config(
        signature_version='s3v4',
        s3={'addressing_style': minio_credentials['url_style']},
    ),
)

# Directory that receives both the raw downloads and the parquet copies
save_directory = './data'
os.makedirs(save_directory, exist_ok=True)

# List all buckets
response = s3.list_buckets()

# A single list call returns at most 1000 keys, so page through every listing.
paginator = s3.get_paginator('list_objects_v2')

# Print each bucket name, then mirror its objects locally
print("list of buckets")
for bucket in response['Buckets']:
    print(bucket['Name'])

    for page in paginator.paginate(Bucket=bucket['Name']):
        # 'Contents' is absent when a bucket (or page) holds no objects.
        for obj in page.get('Contents', []):
            local_name = obj['Key'].split('/')[-1]
            if not local_name:
                # Keys ending in '/' are folder placeholders with no file name.
                continue

            csv_path = os.path.join(save_directory, local_name)
            parquet_path = csv_path + '.parquet'

            # Check for an existing parquet copy *before* announcing a download,
            # so skipped objects are not reported as being downloaded.
            if os.path.isfile(parquet_path):
                print(obj['Key'] + ' already exists, skipping...')
                continue

            print('Downloading ' + obj['Key'])
            data = io.BytesIO()
            s3.download_fileobj(bucket['Name'], obj['Key'], data)

            # Save the raw bytes (expected to be a CSV file) in the ./data directory
            with open(csv_path, 'wb') as csv_file:
                csv_file.write(data.getvalue())

            print(obj['Key'] + ' downloaded and saved as CSV in the ./data directory')

            # Decode the downloaded bytes so pandas can parse them as CSV text
            data = io.StringIO(data.getvalue().decode('utf-8'))

            print(obj['Key'] + ' downloaded, now turning it into a parquet file')

            df = pd.read_csv(data)
            df.to_parquet(parquet_path, index=False)

In [11]:
!pip install --user boto3
!pip install --user pyarrow
!pip install --user fastparquet
!pip install --user minio



In [10]:
import os
from minio import Minio
from minio.error import S3Error

In [12]:
def upload_parquet_files_to_minio(folder_path, bucket_name, endpoint, access_key, secret_key, secure=False):
    """Upload every ``.parquet`` file found directly in ``folder_path`` to a MinIO bucket.

    The bucket is created first if it does not exist. Each file is stored
    under its own file name as the object name, and a confirmation line is
    printed per uploaded file.

    Args:
        folder_path: Local directory to scan (not recursive).
        bucket_name: Target bucket on the MinIO server.
        endpoint: Host and port of the MinIO server, e.g. ``"localhost:8000"``.
        access_key: MinIO access key.
        secret_key: MinIO secret key.
        secure: Use HTTPS when True. Defaults to False, matching the previous
            hard-coded behaviour.

    Errors reported by the server (``S3Error``) are printed instead of being
    raised. Other exceptions (e.g. connection failures or a missing local
    folder) propagate to the caller.
    """
    try:
        # Initialize the Minio client
        minio_client = Minio(endpoint,
                             access_key=access_key,
                             secret_key=secret_key,
                             secure=secure)

        # Check if the bucket exists, create it if not
        if not minio_client.bucket_exists(bucket_name):
            minio_client.make_bucket(bucket_name)

        # Sorted so that upload order (and the printed log) is deterministic
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".parquet"):
                continue

            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                # A directory that merely ends in ".parquet" cannot be uploaded
                continue

            object_name = filename

            # Upload the parquet file
            minio_client.fput_object(bucket_name, object_name, file_path)

            print(f"File '{object_name}' uploaded successfully to bucket '{bucket_name}' on Minio server.")

    except S3Error as e:
        # The handler previously caught ReferenceError, which the MinIO client
        # never raises, so this message could not be reached.
        print(f"Error: {e}" + "Too bad!")

# Local folder holding the parquet files produced by the download step
parquet_folder_path = "./data"

# Target Minio server: where to connect, how to authenticate, which bucket
# NOTE(review): credentials are hard-coded in the notebook.
minio_endpoint = "localhost:8000"
minio_bucket_name = "parquets"
minio_access_key = "admin"
minio_secret_key = "password"

# Push every parquet file in the folder to the bucket
upload_parquet_files_to_minio(
    folder_path=parquet_folder_path,
    bucket_name=minio_bucket_name,
    endpoint=minio_endpoint,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
)

File 'april_2023.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'august_2022.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'dataset.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'december_2022.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'healthcare_dataset.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'heart_attack_prediction_dataset.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'january_2023.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'july_2022.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'june_2023.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'march_2023.csv.parquet' uploaded successfully to bucket 'parquets' on Minio server.
File 'may_2023.csv.parquet' uploaded successfully to bucket 'parquets'

In [13]:
!pip install --user mlxtend



In [14]:
!pip install --user pandas



In [15]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 
import minio
import io

In [16]:
# Initialize the Minio client
minio_client = Minio("localhost:8000",
                     access_key="admin",
                     secret_key="password",
                     secure=False)  # Change to True if using HTTPS

# Specify the bucket and file information
bucket_name = "parquets"
file_name = "Online_Retail.csv.parquet"

# Fetch the object and read its bytes. The response holds a pooled HTTP
# connection, so it is closed and released even if the read fails.
obj = minio_client.get_object(bucket_name, file_name)
try:
    parquet_content = obj.read()
finally:
    obj.close()
    obj.release_conn()

# Wrap the raw bytes in a file-like object so pandas can parse them
parquet_stream = io.BytesIO(parquet_content)

# Read the Parquet data using pandas
df = pd.read_parquet(parquet_stream)

# Display the loaded DataFrame
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [17]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [19]:
# Distinct values of the Country column, in order of first appearance
df['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [20]:
# Cleaning the Data
#
# Built as one non-mutating chain: the frame loaded above is not modified
# in place, and `df` is rebound only once, to the fully cleaned result.
df = (
    df
    # Dropping the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(
        # Stripping extra spaces in the description
        Description=lambda frame: frame['Description'].str.strip(),
        # Invoice numbers are handled as strings from here on
        InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'),
    )
    # Dropping all transactions which were done on credit
    # (invoice number contains a literal 'C')
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C', regex=False)]
)

In [21]:
# Splitting the data according to the region of transaction

def build_country_basket(transactions, country):
    """Return the invoice-by-item quantity table for one country.

    Rows are invoice numbers, columns are item descriptions, and each cell
    holds the summed Quantity of that item on that invoice (0 where the item
    does not appear on the invoice).

    Args:
        transactions: DataFrame with 'Country', 'InvoiceNo', 'Description'
            and 'Quantity' columns.
        country: Value of the 'Country' column to keep.
    """
    return (
        transactions[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        .unstack()   # one column per item description, indexed by InvoiceNo
        .fillna(0)   # items absent from an invoice count as quantity 0
    )


# Transactions done in France
basket_France = build_country_basket(df, "France")

# Transactions done in the United Kingdom
basket_UK = build_country_basket(df, "United Kingdom")

# Transactions done in Portugal
basket_Por = build_country_basket(df, "Portugal")

# Transactions done in Sweden
basket_Sweden = build_country_basket(df, "Sweden")

In [22]:
# Hot encoding the Data

# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a basket quantity to a 0/1 presence flag.

    Returns 1 when ``x`` is at least 1 and 0 otherwise. Values that previously
    fell through both checks and yielded ``None`` (numbers strictly between
    0 and 1, and NaN) now map to 0, so the result is always an int.
    """
    return 1 if x >= 1 else 0
  
# Encoding the datasets
def _encode_basket(basket):
    """Apply hot_encode to every cell of a basket table and return the result.

    Uses DataFrame.map where pandas provides it, because DataFrame.applymap is
    deprecated in recent pandas releases; falls back to applymap on older
    versions that do not have DataFrame.map.
    """
    if hasattr(pd.DataFrame, "map"):
        return basket.map(hot_encode)
    return basket.applymap(hot_encode)


basket_France = _encode_basket(basket_France)
basket_UK = _encode_basket(basket_UK)
basket_Por = _encode_basket(basket_Por)
basket_Sweden = _encode_basket(basket_Sweden)

  basket_encoded = basket_France.applymap(hot_encode)
  basket_encoded = basket_UK.applymap(hot_encode)
  basket_encoded = basket_Por.applymap(hot_encode)
  basket_encoded = basket_Sweden.applymap(hot_encode)


In [23]:
# Building the models and analyzing the results

# Frequent itemsets among the French baskets (support of at least 5%)
frq_items = apriori(basket_France, use_colnames=True, min_support=0.05)

# Association rules with lift >= 1, strongest confidence (then lift) first
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)
rules



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf,0.254144
259,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.051020,0.765306,0.051020,1.000000,1.306667,0.011974,inf,0.247312
272,"(PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...",(POSTAGE),0.053571,0.765306,0.053571,1.000000,1.306667,0.012573,inf,0.247978
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959,0.967949
300,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796,0.956294
...,...,...,...,...,...,...,...,...,...,...
37,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.051020,0.066667,1.244444,0.010022,1.014031,0.836957
27,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297,0.673913
96,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297,0.673913
227,(POSTAGE),"(LUNCH BAG WOODLAND, LUNCH BAG RED RETROSPOT)",0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297,0.673913


In [25]:
# Frequent itemsets among the Swedish baskets (support of at least 5%)
# NOTE(review): the smallest supports shown in the output are 0.055556
# (presumably 1 of 18 baskets), which is above min_support, so itemsets seen
# in a single basket may qualify - confirm the threshold is intended.
frq_items = apriori(basket_Sweden, use_colnames=True, min_support=0.05)

# Association rules with lift >= 1, strongest confidence (then lift) first
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)
rules



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
1,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
4,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
5,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
180,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY CIRCUS PARADE),0.055556,0.055556,0.055556,1.000000,18.000000,0.052469,inf,1.000000
...,...,...,...,...,...,...,...,...,...,...
25487,(POSTAGE),"(WOODEN OWLS LIGHT GARLAND, CHILDRENS CUTLERY ...",0.611111,0.055556,0.055556,0.090909,1.636364,0.021605,1.038889,1.000000
203,(POSTAGE),(CUPCAKE LACE PAPER SET 6),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
385,(POSTAGE),(MINI PLAYING CARDS DOLLY GIRL),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
483,(POSTAGE),(ROUND SNACK BOXES SET OF4 WOODLAND),0.611111,0.083333,0.055556,0.090909,1.090909,0.004630,1.008333,0.214286
