Sampling Data

In [5]:
import json
import os
from tqdm import tqdm

def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
    target_size_bytes = target_size_gb * 1024**3
    current_size_bytes = 0

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile):
            record = json.loads(line)
            if record.get(filter_key):
                outfile.write(json.dumps(record) + '\n')
                current_size_bytes += len(line.encode('utf-8'))

            if current_size_bytes >= target_size_bytes:
                break

    print("Finished")

sample_json('Sampled_Meta_Data.json', 'small_sample.json', 0.5)

0it [00:00, ?it/s]

54093it [00:05, 9812.04it/s] 

Finished





Pre-Processing

In [13]:
import json
import pandas as pd

# Open the JSON file
with open('Sampled_Meta_Data.json') as f:
    # Initialize an empty list to store the JSON objects
    data_list = []

    # Iterate through each line in the file
    for line in f:
        try:
            # Attempt to load each line as a JSON object and append it to the list
            data = json.loads(line)
            data_list.append(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

# Initialize an empty dictionary to store values
result_dict = {}

# Iterate through the list of dictionaries
for data in data_list:
    # Iterate through key-value pairs in each dictionary
    for key, value in data.items():
        # If the key is not in result_dict, create a list with the current value
        if key not in result_dict:
            result_dict[key] = [value]
        else:
            # If the key is already in result_dict, append the current value to the list
            result_dict[key].append(value)

# creating dataframe
data_frame = pd.DataFrame(result_dict)

# dropping rows with any missing values
data_frame = data_frame.dropna(how='any')



In [None]:
import json
import pandas as pd

# Function to process data in batches
def process_large_file(input_file, batch_size=100000):
    # Open the JSON file
    with open(input_file) as f:
        batch_count = 0
        total_records = 0
        batch_data = []

        # Iterate through each line in the file
        for line in f:
            try:
                # Attempt to load each line as a JSON object
                data = json.loads(line)
                batch_data.append(data)
                total_records += 1

                # Check if batch size is reached
                if len(batch_data) >= batch_size:
                    # Process the current batch
                    process_batch(batch_data, batch_count)
                    batch_count += 1
                    batch_data = []

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")

        # Process the remaining records in the last batch
        if batch_data:
            process_batch(batch_data, batch_count)

        print(f"Total records processed: {total_records}")

# Function to process each batch
def process_batch(batch_data, batch_count):
    # Convert batch data to DataFrame
    batch_df = pd.DataFrame(batch_data)

    # Perform data processing operations

    # dropping rows with any missing values
    batch_df = batch_df.dropna(how='any')

    # removing unnecessary column headers
    processed_batch = data_frame.drop(['title','also_view','price','feature','description', 'brand','category','date', 'fit', 'tech1', 'tech2', 'image', 'details', 'main_cat', 'similar_item', 'rank'], axis=1)

    processed_batch['asin'] = processed_batch['asin'].astype('object')
    processed_batch['also_buy'] = processed_batch['also_buy'].astype('object')   

    # Save processed batch to a new file or perform further operations
    output_file = f"output_batch_{batch_count}.json"
    processed_batch.to_json(output_file, orient='records')

    print(f"Batch {batch_count} processed and saved to {output_file}")

# Main function
if __name__ == "__main__":
    input_file = 'Sampled_Meta_Data.json'
    process_large_file(input_file)


In [11]:
# creating dataframe
data_frame = pd.DataFrame(result_dict)

# dropping rows with any missing values
data_frame = data_frame.dropna(how='any')

# Apply the function to the 'price' column
#df['price'] = df['price'].apply(extract_first_price)

# removing unnecessary column headers
df = data_frame.drop(['title','also_view','price','feature','description', 'brand','category','date', 'fit', 'tech1', 'tech2', 'image', 'details', 'main_cat', 'similar_item', 'rank'], axis=1)

# Function to extract first value from price strings
def extract_first_price(price_str):
    # Split the string by '-'
    price_parts = price_str.split(' - ')
    # Extract the first part
    first_price = price_parts[0]
    return first_price

df['asin'] = df['asin'].astype('object')
df['also_buy'] = df['also_buy'].astype('object')

# Print the modified DataFrame
df.head()

Unnamed: 0,also_buy,asin
0,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...",6342509379
1,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...",6342502315
2,[B00VBVXVPI],6342522545
3,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...",6342522898
4,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...",6342523002


In [4]:
value = df.iloc[1, 0]
print(value)

Crazy Women's Voile Crinkle Scarf Shawl


In [12]:
# saving as json file
output_file = 'outputFile.json'
df.to_json(output_file, orient='records')

print(f"Dateframe successfully written in {output_file}")

Dateframe successfully written in outputFile.json


In [5]:
# Merge columns into a list
merged_list = df.apply(lambda row: [row['asin'], row['also_buy']], axis=1)

# Flatten the nested list
flattened_list = [item for sublist in merged_list for item in (sublist if isinstance(sublist, list) else [sublist])]

flattened_list[2]

'6342502315'

In [4]:
df_new = data_frame.drop(['title','also_view','price','feature','description', 'brand','category','date', 'fit', 'tech1', 'tech2', 'image', 'details', 'main_cat', 'similar_item', 'rank'], axis=1)

df_new.head()


Unnamed: 0,also_buy,asin
0,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...",6342509379
1,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...",6342502315
2,[B00VBVXVPI],6342522545
3,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...",6342522898
4,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...",6342523002


In [7]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import pandas as pd

df_new['asin'] = df_new['asin'].astype('object')
df_new['also_buy'] = df_new['also_buy'].astype('object')

#data = df_new.iloc[:20]

# Flatten the 'also_buy' lists and handle non-iterable elements
transactions = []
for sublist in data['also_buy']:
    if isinstance(sublist, list):
        transactions.extend(sublist)
    else:
        transactions.append(sublist)

# Convert the transactions into a list of lists format
transactions = [[str(item) for item in sublist] if isinstance(sublist, list) else [str(sublist)] for sublist in transactions]

# Encode the transactions into a binary format suitable for Apriori
encoder = TransactionEncoder()
transactions_encoded = encoder.fit_transform(transactions)

# Convert the encoded transactions into a DataFrame
df1 = pd.DataFrame(transactions_encoded, columns=encoder.columns_)

# Apply the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df1, min_support=0.01, use_colnames=True)

frequent_itemsets


Unnamed: 0,support,itemsets
0,0.012407,(B000000XB8)
1,0.012407,(B001RNO30W)
2,0.012407,(B001SES07C)
3,0.012407,(B002YAGIOG)
4,0.012407,(B00318CBS2)
5,0.012407,(B003AIKE3C)
6,0.012407,(B003AIKE6E)
7,0.012407,(B004GGU9QY)
8,0.012407,(B005TS13Y0)
9,0.012407,(B007KFQ3E0)


In [1]:
import json
import pandas as pd

# Function to process data in batches
def process_large_file(input_file, batch_size=100000):
    # Open the JSON file
    with open(input_file) as f:
        batch_count = 0
        total_records = 0
        batch_data = []

        # Iterate through each line in the file
        for line in f:
            try:
                # Attempt to load each line as a JSON object
                data = json.loads(line)
                batch_data.append(data)
                total_records += 1

                # Check if batch size is reached
                if len(batch_data) >= batch_size:
                    # Process the current batch
                    process_batch(batch_data, batch_count)
                    batch_count += 1
                    batch_data = []

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")

        # Process the remaining records in the last batch
        if batch_data:
            process_batch(batch_data, batch_count)

        print(f"Total records processed: {total_records}")

# Function to process each batch
def process_batch(batch_data, batch_count):
    # Convert batch data to DataFrame
    batch_df = pd.DataFrame(batch_data)

    # Perform data processing operations

    # dropping rows with any missing values
    batch_df = batch_df.dropna(how='any')

    # removing unnecessary column headers
    processed_batch = batch_df.drop(['title','also_view','price','feature','description', 'brand','category','date', 'fit', 'tech1', 'tech2', 'image', 'details', 'main_cat', 'similar_item', 'rank'], axis=1)

    processed_batch['asin'] = processed_batch['asin'].astype('object')
    processed_batch['also_buy'] = processed_batch['also_buy'].astype('object')   

    # Save processed batch to a new file or perform further operations
    output_file = f"output_batch_{batch_count}.json"
    processed_batch.to_json(output_file, orient='records')

    print(f"Batch {batch_count} processed and saved to {output_file}")

# Main function
if __name__ == "__main__":
    input_file = 'Sampled_Meta_Data.json'
    process_large_file(input_file)


Batch 0 processed and saved to output_batch_0.json
Batch 1 processed and saved to output_batch_1.json
Batch 2 processed and saved to output_batch_2.json
Batch 3 processed and saved to output_batch_3.json
Batch 4 processed and saved to output_batch_4.json
Batch 5 processed and saved to output_batch_5.json
Batch 6 processed and saved to output_batch_6.json


KeyError: "['details'] not found in axis"