Ironhack_gambling.ipynb


In [1]:
import getpass
import os

import numpy as np
import pandas as pd
import pymysql
from icecream import ic

# First we input the test data and make it into a dictionary of dataframes
# each key of the dict will be a sheet name

# 1) Load original Excel into a dict
file_path = r'dataset\SQL Test Data.xlsx'
sql_test_data = pd.ExcelFile(file_path)
ironhack_gambling_dict = {
    sheet: sql_test_data.parse(sheet)
    for sheet in sql_test_data.sheet_names
}

# the 'betting' sheet is problematic, I will split into 3. 
df_betting = ironhack_gambling_dict['Betting']

# ic(df_betting)
# # Make a copy so I don’t accidentally change df
df_betting_1 = df_betting.iloc[:, 0:9].copy()   # columns 0 to 9
df_betting_2 = df_betting.iloc[:, 12:15].copy() # columns 13 to 16
df_betting_3 = df_betting.iloc[:, 16:26].copy() # columns 14..22

# rename so that the column names match
df_betting_2 = df_betting_2.rename(columns={
    'AccountNo.1' : 'AccountNo',
    'Bet_Amt.1' : 'Bet_Amt',
    'Product.1' : 'Product'
})

# rename so that the column names match
df_betting_3 = df_betting_3.rename(columns={
    'AccountNo.2' : 'AccountNo',
    'Bet_Amt.1' : 'Bet_Amt',
    'Product.1' : 'Product',
    df_betting_3.columns[df_betting_3.columns.isna()][0]: 'N_A'
})

# Create a dictionary with the three DataFrames
betting_dict = {
    'betting_1': df_betting_1,
    'betting_2': df_betting_2,
    'betting_3': df_betting_3
}

# ic(betting_dict['betting_1'])

# 3) Merge the two dicts
ironhack_gambling_dict.update(betting_dict)


# # 4) Confirm all sheets
# print("All sheets in ironhack_gambling_dict:")
# for sheet in ironhack_gambling_dict.keys():
#     print(" -", sheet)

# ic(ironhack_gambling_dict['Account'])



In [12]:

# Mapping of column names from DataFrame to MySQL schema
table_column_mapping = {
    'account': [
        'AccountNo', 'cust_id', 'account_location', 'currency_code', 'daily_deposit_limit', 'stake_scale', 'source_prod'
    ],
    'customer': [
        'cust_id', 'account_location', 'title', 'first_name', 'last_name', 'create_date', 'country_code', 'language',
        'status', 'date_of_birth', 'contact', 'customer_group'
    ],
    'betting_1': [
        'AccountNo', 'BetDate', 'ClassId', 'CategoryId', 'Source', 'BetCount', 'Bet_Amt', 'Win_Amt', 'Product'
    ],
    'betting_2': [
        'AccountNo', 'Bet_Amt', 'Product'
    ],
    'betting_3': [
        'AccountNo', 'Vegas', 'Sportsbook', 'Games', 'Casino', 'Poker', 'Bingo', 'N_A', 'Adjustments'
    ],
    'product': [
        'class_id', 'category_id', 'product_name', 'sub_product', 'description', 'bet_or_play'
    ],
    'student': [
        'student_id', 'student_name', 'city', 'school_id', 'gpa'
    ],
    'school': [
        'school_id', 'school_name', 'school_city'
    ]
}

In [13]:
def rename_columns(df, table_name, column_mapping):
    """
    Rename DataFrame columns based on table_column_mapping.
    
    Args:
        df (pd.DataFrame): The DataFrame to rename.
        table_name (str): The name of the table.
        column_mapping (dict): The table_column_mapping dictionary.
        
    Returns:
        pd.DataFrame: The renamed DataFrame.
    """
    if table_name not in column_mapping:
        print(f"No column mapping found for table '{table_name}'. Skipping rename.")
        return df
    
    # Create a reverse mapping: current_name -> desired_name
    desired_columns = column_mapping[table_name]
    rename_dict = {}
    
    for desired_col in desired_columns:
        # Attempt to find the matching column in the DataFrame (case-insensitive)
        matched_cols = [col for col in df.columns if col.lower() == desired_col.lower()]
        if matched_cols:
            rename_dict[matched_cols[0]] = desired_col
        else:
            print(f"Warning: Column '{desired_col}' not found in DataFrame for table '{table_name}'. Adding as None.")
            df[desired_col] = None  # Add missing columns with None
    
    df.rename(columns=rename_dict, inplace=True)
    return df


In [14]:
# print(type(ironhack_gambling_dict['Account']))
# print(ironhack_gambling_dict['Product'])


In [5]:
import pymysql


#ic(ironhack_gambling_dict['Customer'])

# Connect to MySQL
try:
    cnx = pymysql.connect(user='root', password='Malcomx1',
                          host='localhost', database='SQLTestData')
    if cnx.open:
        print("Connection open")
    else:
        print("Connection not successfully opened")

    # The object which will interact with the database
    cursor = cnx.cursor()

    # Convert dates to MySQL-compatible format (YYYY-MM-DD)
    ironhack_gambling_dict['Customer']['CreateDate'] = pd.to_datetime(
        ironhack_gambling_dict['Customer']['CreateDate'], format='%m/%d/%Y', errors='coerce'
    ).dt.strftime('%Y-%m-%d')
        
    ironhack_gambling_dict['Customer']['DateOfBirth'] = pd.to_datetime(
        ironhack_gambling_dict['Customer']['DateOfBirth'], format='%m/%d/%Y', errors='coerce'
    ).dt.strftime('%Y-%m-%d')
    
    # ic(ironhack_gambling_dict['betting_1'])
    
    ironhack_gambling_dict['betting_1']['BetDate'] = pd.to_datetime(
    ironhack_gambling_dict['betting_1']['BetDate'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
    ).dt.strftime('%Y-%m-%d')
    
    # Replace NaN with None
    ironhack_gambling_dict['Customer'] = ironhack_gambling_dict['Customer'].where(pd.notnull(ironhack_gambling_dict['Customer']), None)
    ironhack_gambling_dict['Account'] = ironhack_gambling_dict['Account'].where(pd.notnull(ironhack_gambling_dict['Account']), None)
    ironhack_gambling_dict['Product'] = ironhack_gambling_dict['Product'].where(pd.notnull(ironhack_gambling_dict['Product']), None)
    ironhack_gambling_dict['betting_1'] = ironhack_gambling_dict['betting_1'].where(pd.notnull(ironhack_gambling_dict['betting_1']), None)
    ironhack_gambling_dict['betting_2'] = ironhack_gambling_dict['betting_2'].where(pd.notnull(ironhack_gambling_dict['betting_2']), None)
    ironhack_gambling_dict['betting_3'] = ironhack_gambling_dict['betting_3'].where(pd.notnull(ironhack_gambling_dict['betting_3']), None)
    
    
    # Fill or drop NaN values for each DataFrame
    # Customer: Replace missing dates with None
    # ironhack_gambling_dict['Customer']['CreateDate'].fillna(None, inplace=True)
    # ironhack_gambling_dict['Customer']['DateOfBirth'].fillna(None, inplace=True)
    
        # Rename the DataFrame columns to match the MySQL table schema
    ironhack_gambling_dict['Product'].rename(
    columns={
        'CLASSID': 'class_id',
        'CATEGORYID': 'category_id',
        'product': 'product_name',
        'sub_product': 'sub_product',
        'description': 'description',
        'bet_or_play': 'bet_or_play'
    },
    inplace=True
    )
    
    
    
    
    
    ironhack_gambling_dict['Customer']['CreateDate'] = ironhack_gambling_dict['Customer']['CreateDate'].replace({pd.NaT: None})
    ironhack_gambling_dict['Customer']['DateOfBirth'] = ironhack_gambling_dict['Customer']['DateOfBirth'].replace({pd.NaT: None})

    # Product: Drop rows with missing `class_id` (if `class_id` is required)
    ironhack_gambling_dict['Product'].dropna(subset=['class_id'], inplace=True)

    # Betting 1: Drop rows with missing critical fields (e.g., AccountNo, BetDate)
    ironhack_gambling_dict['betting_1'].dropna(subset=['AccountNo', 'BetDate'], inplace=True)

    # Betting 2: Drop rows with missing critical fields (e.g., AccountNo, Bet_Amt)
    ironhack_gambling_dict['betting_2'].dropna(subset=['AccountNo', 'Bet_Amt'], inplace=True)

    # Betting 3: Drop rows with missing AccountNo (if required for relationships)
    ironhack_gambling_dict['betting_3'].dropna(subset=['AccountNo'], inplace=True)

    # Replace any remaining NaN with None for nullable columns
    for key in ['Customer', 'Account', 'Product', 'betting_1', 'betting_2', 'betting_3']:
        ironhack_gambling_dict[key] = ironhack_gambling_dict[key].where(
            pd.notnull(ironhack_gambling_dict[key]), None
    )
  

    # print(ironhack_gambling_dict['Product'].isnull().sum())
    
    # Convert DataFrame rows to lists of tuples
    customer_sheet = ironhack_gambling_dict['Customer'].to_records(index=False).tolist()
    account_sheet = ironhack_gambling_dict['Account'].to_records(index=False).tolist()
    product_sheet = ironhack_gambling_dict['Product'].to_records(index=False).tolist()
    betting_1_sheet = ironhack_gambling_dict['betting_1'].to_records(index=False).tolist()
    betting_2_sheet = ironhack_gambling_dict['betting_2'].to_records(index=False).tolist()
    betting_3_sheet = ironhack_gambling_dict['betting_3'].to_records(index=False).tolist()

    # ic(betting_1_sheet)

    # ic(account_sheet)
    # ic(product_sheet)
    
#     # Rename the DataFrame columns to match the MySQL table schema
#     ironhack_gambling_dict['Product'].rename(
#     columns={
#         'CLASSID': 'class_id',
#         'CATEGORYID': 'category_id',
#         'product': 'product_name',
#         'sub_product': 'sub_product',
#         'description': 'description',
#         'bet_or_play': 'bet_or_play'
#     },
#     inplace=True
# )  
    # Insert into customer table
    customer_query = """
    INSERT IGNORE INTO customer (
        cust_id, account_location, title, first_name, last_name, 
        create_date, country_code, language, status, date_of_birth, 
        contact, customer_group
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.executemany(customer_query, customer_sheet)

    # Insert into account table
    account_query = """
    INSERT IGNORE INTO account (
        AccountNo, cust_id, account_location, currency_code, 
        daily_deposit_limit, stake_scale, source_prod
    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    cursor.executemany(account_query, account_sheet)

    # Insert into product table
    product_query = """
    INSERT IGNORE INTO product (
        class_id, category_id, product_name, sub_product, description, bet_or_play
    ) VALUES (%s, %s, %s, %s, %s, %s)
    """
    cursor.executemany(product_query, product_sheet)

    # Insert into betting_1 table
    betting_1_query = """
    INSERT IGNORE INTO betting_1 (
        AccountNo, BetDate, ClassId, CategoryId, Source, BetCount, Bet_Amt, Win_Amt, Product

    ) VALUES (%s, %s, %s, %s, %s, %s,  %s, %s, %s)
    """
    cursor.executemany(betting_1_query, betting_1_sheet)


    # Insert into betting_2 table
    betting_2_query = """
    INSERT IGNORE INTO betting_2 (
        AccountNo,	Bet_Amt, Product

    ) VALUES (%s, %s, %s)
    """
    cursor.executemany(betting_2_query, betting_2_sheet)


    # Insert into betting_2 table
    betting_2_query = """
    INSERT IGNORE INTO betting_2 (
        AccountNo,	Bet_Amt, Product

    ) VALUES (%s, %s, %s)
    """
    cursor.executemany(betting_2_query, betting_2_sheet)

    # Insert into betting_3 table
    betting_3_query = """
    INSERT IGNORE INTO betting_3 (
        AccountNo, Vegas, Sportsbook, Games, Casino, Poker, Bingo, N_A, Adjustments

    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    cursor.executemany(betting_3_query, betting_3_sheet)


    # Commit the changes
    cnx.commit()
    print("Data successfully inserted into the tables.")

    # Close the connection
    try:
        cursor.close()
        cnx.close()
        print("Connection closed")
    except pymysql.MySQLError as e:
        print(f"Error closing connection: {e}")

except pymysql.MySQLError as e:
    print(f"Error connecting to MySQL: {e}")


Connection open
Data successfully inserted into the tables.
Connection closed
