In [7]:
import os
import json
import mysql.connector

class DataStoring:
    """Helper around a MySQL connection configured from ``db_config.json``.

    On construction the instance loads the JSON config, opens a connection and
    makes sure the target database exists (creating and selecting it if needed).

    Args:
        host: Optional MySQL host. When not ``None`` it takes precedence over
            the ``host`` entry of the config file.
        user: Optional MySQL user; same precedence rule as ``host``.
        password: Optional MySQL password; same precedence rule as ``host``.
        db: Optional database name; same precedence rule as ``host``.
        code_path: Optional directory that contains ``db_config.json``.
            Defaults to the project directory this notebook was written for.
    """

    def __init__(self, host=None, user=None, password=None, db=None, code_path=None):
        # Explicit constructor arguments win over the config file. Only the
        # ones actually supplied are remembered, so the config file (and then
        # the built-in defaults) still fill in everything else.
        self._overrides = {
            key: value
            for key, value in (
                ("host", host),
                ("user", user),
                ("password", password),
                ("db", db),
            )
            if value is not None
        }
        self.host = host
        self.user = user
        self.password = password
        self.connection = None
        self.cursor = None
        self.db = db
        self.code_path = code_path or r"C:\Users\haris\OneDrive\Desktop\Guvi\Projects\Amazon_Sales_Analysis\data_processing\ipynb_files"
        self.read_conifg()
        self.get_connection()
        self.create_db()
        # self.create_tables()

    def read_conifg(self):
        """Load ``db_config.json`` and refresh host/user/password/db from it.

        Sets ``self.json_path`` and ``self.config_data`` as a side effect.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the config file is not valid JSON.
        """
        self.json_path = os.path.join(self.code_path, "db_config.json")
        # Context manager so the file handle is released deterministically.
        with open(self.json_path) as file:
            self.config_data = json.load(file)

        # getattr keeps this method usable on instances built without __init__.
        overrides = getattr(self, "_overrides", {})
        config = self.config_data
        self.host = overrides.get("host", config.get("host", "localhost"))
        self.user = overrides.get("user", config.get("user", "root"))
        # NOTE(review): a literal fallback password lives in source here;
        # prefer supplying it via the config file or the constructor.
        self.password = overrides.get("password", config.get("password", "12345678"))
        self.db = overrides.get("db", config.get("db", "amazon_sales_analysis"))

    def save_config(self):
        """Write ``self.config_data`` back to the JSON config file."""
        with open(self.json_path, 'w') as file:
            json.dump(self.config_data, file, indent=4)

    def get_connection(self):
        """(Re)open the connection and cursor using the current settings.

        The database is only passed to ``connect`` when it already exists on
        the server; otherwise a server-level connection is opened so that
        ``create_db`` can create it. Failures are reported with ``print`` and
        leave ``self.connection`` / ``self.cursor`` set to ``None``.
        """
        self.read_conifg()
        try:
            params = {
                "host": self.host,
                "user": self.user,
                "password": self.password,
            }
            if self.db and self.check_dbs():
                params["database"] = self.db
            self.connection = mysql.connector.connect(**params)
            self.cursor = self.connection.cursor()
        except Exception as e:
            # Reset so callers can reliably detect that there is no connection.
            self.connection = None
            self.cursor = None
            print(f"Couldn't connect to the MySQL server due to the error: {e}")

    def execute_query(self, query):
        """Execute ``query`` and return the cursor, reconnecting if needed.

        Raises:
            ConnectionError: If no connection could be established.
        """
        if not self.connection or not self.connection.is_connected() or not self.cursor:
            self.get_connection()
        if not self.cursor:
            raise ConnectionError("No MySQL connection is available to execute the query.")
        self.cursor.execute(query)
        return self.cursor

    def check_dbs(self):
        """Return ``True`` if ``self.db`` is listed by ``SHOW DATABASES``.

        Uses a short-lived connection that is always closed before returning.
        """
        conn = mysql.connector.connect(
            host=self.host,
            user=self.user,
            password=self.password,
        )
        try:
            cursor = conn.cursor()
            try:
                cursor.execute("SHOW DATABASES")
                databases = [row[0] for row in cursor.fetchall()]
            finally:
                cursor.close()
        finally:
            conn.close()
        return self.db in databases

    def create_db(self):
        """Create the configured database if missing, select it, and persist its name.

        Does nothing when there is no open connection. Errors are reported with
        ``print`` rather than raised.
        """
        if self.connection and self.cursor:
            try:
                # Honour the configured name; fall back to the project default.
                self.db = self.db or "amazon_sales_analysis"
                quoted_db = "`" + self.db.replace("`", "``") + "`"
                self.cursor.execute(f"CREATE DATABASE IF NOT EXISTS {quoted_db}")
                self.connection.commit()
                # The connection may have been opened without a database (when
                # it did not exist yet), so select it for later statements.
                self.cursor.execute(f"USE {quoted_db}")
                self.config_data["db"] = self.db
                self.save_config()
            except Exception as e:
                print(f"Error creating database: {e}")

    def create_tables(self, table=None):
        """Execute a CREATE TABLE statement and commit.

        Args:
            table: SQL statement to run. When omitted, ``self.table`` is used
                if the caller has set that attribute.

        Errors are reported with ``print`` rather than raised.
        """
        self.read_conifg()
        statement = table if table else getattr(self, "table", None)
        if not statement:
            print("Error creating table: no CREATE TABLE statement was provided")
            return
        try:
            self.cursor.execute(statement)
            self.connection.commit()
        except Exception as e:
            print(f"Error creating table: {e}")

    def close(self):
        """Close the cursor and the connection if they are open."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None

In [20]:
# Build and run the CREATE TABLE statement for the combined dataset.
# NOTE(review): this cell needs `data` (the combined DataFrame) to exist in the
# kernel already; in this notebook the cell that creates it appears further
# down, so a top-to-bottom run fails here — confirm the intended cell order.
data_storing = DataStoring()

# Reuse the config DataStoring has already parsed rather than re-opening the
# JSON file through a second hard-coded path and an unclosed file handle.
table_template = data_storing.config_data.get("tables")
if not table_template:
    raise KeyError("'tables' CREATE TABLE template is missing from db_config.json")

# Every DataFrame column becomes a backtick-quoted TEXT column.
columns_sql = ", ".join(f"`{col}` TEXT" for col in data.columns)
table = table_template.format(table_name="cleaned_combined_data", columns=columns_sql)
print(table)
data_storing.create_tables(table=table)

CREATE TABLE IF NOT EXISTS cleaned_combined_data (id INT AUTO_INCREMENT PRIMARY KEY,`transaction_id` TEXT, `order_date` TEXT, `customer_id` TEXT, `product_id` TEXT, `product_name` TEXT, `category` TEXT, `subcategory` TEXT, `brand` TEXT, `original_price_inr` TEXT, `discount_percent` TEXT, `discounted_price_inr` TEXT, `quantity` TEXT, `subtotal_inr` TEXT, `delivery_charges` TEXT, `final_amount_inr` TEXT, `customer_city` TEXT, `customer_state` TEXT, `customer_tier` TEXT, `customer_spending_tier` TEXT, `customer_age_group` TEXT, `payment_method` TEXT, `delivery_days` TEXT, `delivery_type` TEXT, `is_prime_member` TEXT, `is_festival_sale` TEXT, `festival_name` TEXT, `customer_rating` TEXT, `return_status` TEXT, `order_month` TEXT, `order_year` TEXT, `order_quarter` TEXT, `product_weight_kg` TEXT, `is_prime_eligible` TEXT, `product_rating` TEXT, `clean_order_date` TEXT, `clean_original_price_inr` TEXT, `clean_discount_percent` TEXT, `clean_final_amount_inr` TEXT, `clean_delivery_charges` TEXT

In [22]:
# Insert the whole DataFrame with a single executemany() call.
# NOTE(review): the recorded output of this cell is a 2055 "Lost connection"
# error; the batched insert cell below is the variant that completed.
# Connection settings come from the DataStoring instance (loaded from
# db_config.json) instead of being repeated as literals in the notebook.
connection = mysql.connector.connect(
    host=data_storing.host,
    user=data_storing.user,
    password=data_storing.password,
    database=data_storing.db,
)

cursor = connection.cursor()

placeholders = ", ".join(["%s"] * len(data.columns))
columns_sql = ", ".join([f"`{col}`" for col in data.columns])  # backtick column names
insert_query = f"""
INSERT INTO cleaned_combined_data ({columns_sql})
VALUES ({placeholders});
"""

# --- Step 4: Insert Data ---
try:
    cursor.executemany(insert_query, data.to_records(index=False).tolist())
    connection.commit()
    print(f"✅ Successfully inserted {cursor.rowcount} rows into table.")
except mysql.connector.Error as err:
    print(f"❌ Error inserting data: {err}")
    # Discard the partial transaction, but only if the link is still alive.
    if connection.is_connected():
        connection.rollback()
finally:
    # Always release the cursor and connection, even when the insert failed.
    if connection.is_connected():
        cursor.close()
    connection.close()

❌ Error inserting data: 2055: Lost connection to MySQL server at 'localhost:3306', system error: Errno 8: EOF occurred in violation of protocol (_ssl.c:2417)


In [25]:
# Reload `cleaned_combined_data` from the combined DataFrame `data`:
# empty the table, then insert the rows in fixed-size batches.
# Connection settings come from the DataStoring instance (loaded from
# db_config.json) instead of being repeated as literals in the notebook.
connection = mysql.connector.connect(
    host=data_storing.host,
    user=data_storing.user,
    password=data_storing.password,
    database=data_storing.db,
    # SSL is switched off for this connection. NOTE(review): presumably a
    # workaround for the SSL EOF / lost-connection error recorded for the
    # single-shot insert cell — confirm it is acceptable for this server.
    ssl_disabled=True,
)

cursor = connection.cursor()

try:
    cursor.execute("DELETE FROM cleaned_combined_data;")
    connection.commit()

    placeholders = ", ".join(["%s"] * len(data.columns))
    columns_sql = ", ".join([f"`{col}`" for col in data.columns])
    insert_query = f"""
INSERT INTO cleaned_combined_data ({columns_sql})
VALUES ({placeholders});
"""

    # Insert in batches (500 rows at a time), committing after each batch.
    batch_size = 500
    # Keep `data` bound to the DataFrame: the row tuples get their own name so
    # this cell can be re-run and later cells can still use `data` as a frame.
    rows = data.to_records(index=False).tolist()

    for i in range(0, len(rows), batch_size):
        batch = rows[i:i+batch_size]
        try:
            cursor.executemany(insert_query, batch)
            connection.commit()
            print(f"✅ Inserted rows {i+1} to {i+len(batch)}")
        except mysql.connector.Error as err:
            print(f"❌ Error inserting batch {i//batch_size + 1}: {err}")
            # Drop the failed batch; batches committed earlier stay in place.
            if connection.is_connected():
                connection.rollback()
            break
finally:
    # Always release the cursor and connection, whatever happened above.
    if connection.is_connected():
        cursor.close()
    connection.close()


✅ Inserted rows 1 to 500
✅ Inserted rows 501 to 1000
✅ Inserted rows 1001 to 1500
✅ Inserted rows 1501 to 2000
✅ Inserted rows 2001 to 2500
✅ Inserted rows 2501 to 3000
✅ Inserted rows 3001 to 3500
✅ Inserted rows 3501 to 4000
✅ Inserted rows 4001 to 4500
✅ Inserted rows 4501 to 5000
✅ Inserted rows 5001 to 5500
✅ Inserted rows 5501 to 6000
✅ Inserted rows 6001 to 6500
✅ Inserted rows 6501 to 7000
✅ Inserted rows 7001 to 7500
✅ Inserted rows 7501 to 8000
✅ Inserted rows 8001 to 8500
✅ Inserted rows 8501 to 9000
✅ Inserted rows 9001 to 9500
✅ Inserted rows 9501 to 10000
✅ Inserted rows 10001 to 10500
✅ Inserted rows 10501 to 11000
✅ Inserted rows 11001 to 11500
✅ Inserted rows 11501 to 12000
✅ Inserted rows 12001 to 12500
✅ Inserted rows 12501 to 13000
✅ Inserted rows 13001 to 13500
✅ Inserted rows 13501 to 14000
✅ Inserted rows 14001 to 14500
✅ Inserted rows 14501 to 15000
✅ Inserted rows 15001 to 15500
✅ Inserted rows 15501 to 16000
✅ Inserted rows 16001 to 16500
✅ Inserted rows 16501

In [11]:
import pandas as pd

def load_and_merge_yearly_files(folder_path=None, name_contains="20"):
    """Read the matching CSV files in a folder and stack them into one DataFrame.

    A file is selected when its name ends with ``.csv`` and contains
    ``name_contains``. Files are read in sorted-name order and concatenated
    with a fresh 0..n-1 index.

    Args:
        folder_path: Directory to scan. ``None`` means the current working
            directory.
        name_contains: Substring a file name must contain to be selected.
            Defaults to ``"20"``.

    Returns:
        pandas.DataFrame: All selected files concatenated row-wise.

    Raises:
        ValueError: If no file in the folder matches.
    """
    if folder_path is None:
        # os.listdir(None) already means "current directory"; make the join
        # below agree with it instead of failing on a None path component.
        folder_path = os.curdir

    csv_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.csv') and name_contains in name
    )
    if not csv_files:
        raise ValueError(
            f"No CSV files containing {name_contains!r} found in {folder_path!r}"
        )

    all_dataframes = [
        pd.read_csv(os.path.join(folder_path, name)) for name in csv_files
    ]

    # Combine all data
    return pd.concat(all_dataframes, ignore_index=True)

# Directory scanned for the CSV files that are merged into `data`.
cleaned_dataset_dir = r"C:\Users\haris\OneDrive\Desktop\Guvi\Projects\Amazon_Sales_Analysis\dataset\cleaned_dataset"
data = load_and_merge_yearly_files(folder_path=cleaned_dataset_dir)


In [12]:
# Display the column labels of the combined DataFrame.
data.columns

Index(['transaction_id', 'order_date', 'customer_id', 'product_id',
       'product_name', 'category', 'subcategory', 'brand',
       'original_price_inr', 'discount_percent', 'discounted_price_inr',
       'quantity', 'subtotal_inr', 'delivery_charges', 'final_amount_inr',
       'customer_city', 'customer_state', 'customer_tier',
       'customer_spending_tier', 'customer_age_group', 'payment_method',
       'delivery_days', 'delivery_type', 'is_prime_member', 'is_festival_sale',
       'festival_name', 'customer_rating', 'return_status', 'order_month',
       'order_year', 'order_quarter', 'product_weight_kg', 'is_prime_eligible',
       'product_rating', 'clean_order_date', 'clean_original_price_inr',
       'clean_discount_percent', 'clean_final_amount_inr',
       'clean_delivery_charges', 'cleaned_customer_rating',
       'cleaned_product_rating', 'cleaned_customer_city',
       'cleaned_is_prime_member', 'cleaned_is_prime_eligible',
       'cleaned_is_festival_sale', 'cleaned_c

In [13]:
# Write the combined frame to a single CSV, without the index column.
# The file name contains no "20", so the yearly-file loader's name filter
# will not pick this export up on a later run over the same folder.
combined_csv_path = r"C:\Users\haris\OneDrive\Desktop\Guvi\Projects\Amazon_Sales_Analysis\dataset\cleaned_dataset\combined_csv_file.csv"
data.to_csv(combined_csv_path, index=False)