Market Basket Analysis

In [1]:
!pip3 install efficient-apriori
!pip3 install pymongo
!pip3 install pandas
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install boto3
!pip3 install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [1]:
import ast
import os
from collections import Counter
from urllib.parse import quote_plus

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from efficient_apriori import apriori
from IPython.display import display

In [4]:
# Read AWS credentials from the environment (load_dotenv() loads a .env file first).
load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")

# Bucket and object keys of the two input CSV files.
bucket_name = 'is459-t3-job-transformed-data'
google_object_key = 'input/google.csv'
nodeflair_object_key = 'input/jobs.csv'

s3 = boto3.client(
    's3',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
)


def _read_s3_csv(object_key):
    """Fetch one object from `bucket_name` and parse its body as CSV."""
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    return pd.read_csv(response['Body'])


jobs_df = _read_s3_csv(nodeflair_object_key)
google_df = _read_s3_csv(google_object_key)

In [None]:
# Drop the first column of each frame, then report the resulting shapes.
# NOTE: not idempotent - re-running this cell drops one more column each time.
jobs_df = jobs_df.iloc[:, 1:]
google_df = google_df.iloc[:, 1:]
for frame in (jobs_df, google_df):
    print(frame.shape)



In [None]:
# Stack both sources into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept, so the combined frame has duplicate labels.
# NOTE: re-running this cell appends google_df again - re-run from the load cell instead.
frames = [jobs_df, google_df]
jobs_df = pd.concat(frames, ignore_index=True)
jobs_df.shape

In [None]:
# Preview the first five rows of the combined frame.
jobs_df.head(n=5)

In [None]:
def str_to_list(string):
    """Parse the textual form of a Python list (e.g. "['a', 'b']") into a list.

    Args:
        string: Usually a string holding a Python literal. Values that are
            already a list/tuple are returned as a list, so applying this
            function twice to the same column (e.g. re-running the cell) is
            harmless. ``None`` and float NaN (a missing CSV cell) yield ``[]``.

    Returns:
        The value produced by ``ast.literal_eval`` for string input, a list
        copy for list/tuple input, or ``[]`` for missing values.

    Raises:
        ValueError, SyntaxError: if a string is not a valid Python literal.
    """
    if isinstance(string, (list, tuple)):
        return list(string)
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return []
    return ast.literal_eval(string)

# Convert every entry of the 'stacks' column with str_to_list.
jobs_df['stacks'] = jobs_df['stacks'].map(str_to_list)


In [None]:
# Display the frame (the last expression of a cell is rendered as its output).
jobs_df

In [None]:
# Build one transaction (a plain list of stack names) per row.
# Iterating the column directly avoids iterrows(), which builds a full Series
# for every row only to read a single field from it.
records = [list(stacks) for stacks in jobs_df['stacks']]

print(len(records))

print(records[0:10])

In [None]:
# Mine frequent itemsets and association rules from the transactions.
apriori_params = dict(min_support=0.01, min_confidence=0.5, max_length=15)
itemsets, rules = apriori(records, **apriori_params)

In [None]:
# Turn the mined rules into a DataFrame, one row per rule.
# The rows are collected in a list and the frame is built once:
# DataFrame.append copied the whole frame on every call (quadratic) and was
# removed in pandas 2.0.
rule_columns = ['antecedent', 'consequent', 'basket_rule', 'support', 'confidence', 'lift']
rule_records = []
for rule in rules:
    antecedent = list(rule.lhs)
    consequent = list(rule.rhs)
    rule_records.append({
        'antecedent': antecedent,
        'consequent': consequent,
        'basket_rule': str(antecedent) + "->" + str(consequent),
        'support': rule.support,
        'confidence': rule.confidence,
        'lift': rule.lift,
    })

# Passing `columns` keeps the expected schema even when no rule was found.
result_apr_df = pd.DataFrame(rule_records, columns=rule_columns)

# Highest-confidence rules first.
result_apr_df = result_apr_df.sort_values(by=['confidence'], ascending=False)
result_apr_df.head()

In [None]:
# Name the row index 'index' so it is written to CSV under that header.
result_apr_df = result_apr_df.rename_axis(index='index')

In [None]:
# Write only the first 100 rows of the rule table to results.csv
# (use result_apr_df.to_csv('results.csv') to export every rule instead).
top_rules = result_apr_df.head(100)
top_rules.to_csv('results.csv')

Load the association rules (results.csv) and stack counts into MongoDB

In [None]:
import csv
import json
import re
from pymongo import MongoClient

# Connect to MongoDB
mongo_user = os.getenv("MONGO_USER")
mongo_pwd = os.getenv("MONGO_PWD")
if not mongo_user or not mongo_pwd:
    # Fail early: a missing variable would otherwise be formatted into the URI
    # as the literal text "None".
    raise RuntimeError("MONGO_USER and MONGO_PWD must be set in the environment or .env file")
# Percent-escape the credentials so characters such as '@', ':' or '/' in a
# password cannot corrupt the connection string.
client = MongoClient(f'mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_pwd)}@35.171.48.20:27017')
db = client['IS459']
collection = db['market_basket_analysis']

# Open CSV file and read rows into a list
with open('results.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    rows = list(reader)

# Insert each rule unless a document with the same stack list already exists.
# Slicing (instead of indexing range(100)) also works when the CSV has fewer
# than 100 rows.
# NOTE(review): DictReader yields strings, so index/support/confidence/lift are
# stored as strings, as before - confirm whether consumers expect numbers.
for row in rows[:100]:
    # The CSV cells hold the repr of a Python list; literal_eval recovers the
    # items exactly, including names that contain an apostrophe.
    words_list = ast.literal_eval(row['antecedent']) + ast.literal_eval(row['consequent'])

    new_row = {'index': row['index'], 'stack': words_list, 'stack_count': len(words_list), 'support': row['support'], 'confidence': row['confidence'], 'lift': row['lift']}

    # Exact-match filter on the combined stack list.
    rule_filter = {'stack': words_list}

    # $setOnInsert only writes the fields when the upsert creates a document.
    update = {'$setOnInsert': new_row}

    # upsert=True inserts the document if no existing one matches the filter.
    result = collection.update_one(rule_filter, update, upsert=True)

    if result.upserted_id is not None:
        print(f"Document for antecedent: {row['antecedent']} and consequent: {row['consequent']} inserted.")
    else:
        print(f"Document for antecedent: {row['antecedent']} and consequent: {row['consequent']} already exists.")


In [None]:
# Show the first five rows of jobs_df.
jobs_df.head(n=5)

In [None]:
# Tally every occurrence of each stack across all rows of jobs_df['stacks'].
# Counter keeps first-seen order, so the rows come out in the same order as a
# manual dict tally would produce.
stacks_map = Counter(item for row in jobs_df['stacks'] for item in row)

# Build the frame in one step: DataFrame.append in a loop is quadratic and was
# removed in pandas 2.0. `columns` keeps the schema when there are no stacks.
stacks_df = pd.DataFrame(list(stacks_map.items()), columns=['stack', 'count'])

stacks_df

In [None]:
collection_stacks = db['stacks']

# Write one document per stack: insert it if missing, otherwise refresh its count.
# A single upsert replaces the find_one + insert_one/update_one pair, which
# needed two round trips and could race with a concurrent writer.
stack_records = stacks_df.to_dict('records')
for stack in stack_records:
    filter_condition = {"stack": stack["stack"]}
    # int() guarantees a plain Python integer is handed to the driver.
    newvalue = {"$set": {'count': int(stack['count'])}}
    result = collection_stacks.update_one(filter_condition, newvalue, upsert=True)

    # upserted_id is set only when the upsert created a new document.
    if result.upserted_id is not None:
        print("Document inserted.")
    else:
        print("Document updated.")