In [1]:
import pyodbc
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Data Exracting

In [3]:
server_name = 'localhost' 
database_name = 'Retail_Staging'
trusted_connection = 'yes' 

In [4]:
connection_string = (
    f"DRIVER={{ODBC Driver 17 for SQL Server}};" 
    f"SERVER={server_name};"
    f"DATABASE={database_name};"
    f"Trusted_Connection={trusted_connection};"
)

In [5]:
connection = pyodbc.connect(connection_string)

In [6]:
sql_query = "SELECT * FROM customer_info"

In [7]:
data = pd.read_sql_query(sql_query, connection)

  data = pd.read_sql_query(sql_query, connection)


# Data Exploring

In [8]:
data.head()

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
0,C00001,shaneramirez@gmail.com,2025-04-26,Male,Central,Silver
1,C00002,jpeterson@bernard.com,2024-08-11,Female,Central,gold
2,C00003,howardmaurice@yahoo.com,2025-05-15,male,Central,gold
3,C00004,yherrera@arnold.org,2025-06-14,FEMALE,Central,GOLD
4,C00005,janetwilliams@gmail.com,2025-05-02,Male,West,bronze


In [9]:
data.tail()

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
495,C00496,simsjohn@wiley.net,2025-02-19,femle,Central,GOLD
496,C00497,cameronwilliams@yahoo.com,2024-12-30,,West,GOLD
497,C00498,ibarron@yahoo.com,2025-06-21,male,South,Silver
498,C00499,karen26@gmail.com,2024-10-02,Female,North,gold
499,C00500,jasonjohnson@jackson.com,2024-11-28,Male,North,gold


In [10]:
data.describe()

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
count,497,494,496,496,497,498
unique,497,494,271,6,5,7
top,C00001,shaneramirez@gmail.com,2024-08-10,femle,East,GOLD
freq,1,1,7,92,103,149


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   customer_id   497 non-null    object
 1   email         494 non-null    object
 2   signup_date   496 non-null    object
 3   gender        496 non-null    object
 4   region        497 non-null    object
 5   loyalty_tier  498 non-null    object
dtypes: object(6)
memory usage: 23.6+ KB


In [12]:
data.dtypes

customer_id     object
email           object
signup_date     object
gender          object
region          object
loyalty_tier    object
dtype: object

In [13]:
data.shape

(500, 6)

# Data Cleaning

## Null Check

In [14]:
data.isnull().sum()

customer_id     3
email           6
signup_date     4
gender          4
region          3
loyalty_tier    2
dtype: int64

In [15]:
data[data['customer_id'].isnull()]
# No need for customers without thier unique identifier

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
253,,mark16@parker.com,2025-02-22,femle,South,bronze
371,,ohayes@gmail.com,2024-12-09,Other,Central,Silver
470,,nancygonzalez@wright.com,2025-06-27,Male,Central,Silver


In [16]:
data[data['email'].isnull()]

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
87,C00088,,2025-01-06,male,West,bronze
147,C00148,,2025-03-02,Male,West,GOLD
303,C00304,,2024-11-02,Male,East,GOLD
339,C00340,,2025-03-24,male,Central,bronze
375,C00376,,2024-07-12,femle,South,bronze
469,C00470,,2024-07-17,femle,North,Silver


In [17]:
data = data.drop(data[data['customer_id'].isnull()].index)
data = data.drop(data[data['email'].isnull()].index)

In [18]:
data['signup_date']= data['signup_date'].fillna(data['signup_date'].mode()[0])
data['gender']= data['gender'].fillna(data['gender'].mode()[0])
data['region']= data['region'].fillna(data['region'].mode()[0])
data['loyalty_tier']= data['loyalty_tier'].fillna(data['loyalty_tier'].mode()[0])

In [19]:
data.isnull().sum()

customer_id     0
email           0
signup_date     0
gender          0
region          0
loyalty_tier    0
dtype: int64

## Checking for Duplications

In [20]:
data.duplicated().sum()
# No Duplication

np.int64(0)

## Check for Inconsistency

### email

In [21]:
data['email'].str[-4:].unique()

array(['.com', '.org', '.net', '.biz', 'info'], dtype=object)

### gender

In [22]:
data['gender'].unique()

array(['Male', 'Female', 'male', 'FEMALE', 'Other', 'femle'], dtype=object)

In [23]:
data.loc[data['gender'] == 'Male', 'gender'] = 'male'

In [24]:
data.loc[data['gender'] == 'Female', 'gender']= 'female'
data.loc[data['gender'] == 'FEMALE', 'gender'] = 'female'
data.loc[data['gender'] == 'femle', 'gender'] = 'female'

In [25]:
data['gender'].value_counts(normalize=True) * 100

gender
female    51.120163
male      33.401222
Other     15.478615
Name: proportion, dtype: float64

### region

In [26]:
data['region'].unique()

array(['Central', 'West', 'North', 'South', 'East'], dtype=object)

### loyalty_tier

In [27]:
data['loyalty_tier'].unique()

array(['Silver', 'gold ', 'GOLD', 'bronze', 'gld', 'brnze', 'sllver'],
      dtype=object)

In [28]:
data.loc[data['loyalty_tier'] == 'Silver' , 'loyalty_tier']= 'silver'
data.loc[data['loyalty_tier'] == 'sllver', 'loyalty_tier']= 'silver'

In [29]:
data.loc[data['loyalty_tier'] == 'GOLD', 'loyalty_tier'] = 'gold'
data.loc[data['loyalty_tier'] == 'gld', 'loyalty_tier'] = 'gold'
data.loc[data['loyalty_tier'] == ' gold ', 'loyalty_tier'] = 'gold'

In [30]:
data.loc[data['loyalty_tier'] == 'brnze', 'loyalty_tier'] = 'bronze'

In [31]:
data['loyalty_tier'].value_counts(normalize=True) * 100

loyalty_tier
gold      30.753564
silver    23.625255
gold      23.217923
bronze    22.403259
Name: proportion, dtype: float64

## Data Types Fixing

In [32]:
pd.to_datetime(data['signup_date'])

0     2025-04-26
1     2024-08-11
2     2025-05-15
3     2025-06-14
4     2025-05-02
         ...    
495   2025-02-19
496   2024-12-30
497   2025-06-21
498   2024-10-02
499   2024-11-28
Name: signup_date, Length: 491, dtype: datetime64[ns]

In [33]:
data['signup_date']= pd.to_datetime(data['signup_date'])

In [37]:
data.head(10)

Unnamed: 0,customer_id,email,signup_date,gender,region,loyalty_tier
0,C00001,shaneramirez@gmail.com,2025-04-26,male,Central,silver
1,C00002,jpeterson@bernard.com,2024-08-11,female,Central,gold
2,C00003,howardmaurice@yahoo.com,2025-05-15,male,Central,gold
3,C00004,yherrera@arnold.org,2025-06-14,female,Central,gold
4,C00005,janetwilliams@gmail.com,2025-05-02,male,West,bronze
5,C00006,wyattmichelle@yahoo.com,2024-10-21,male,Central,bronze
6,C00007,francisco53@hotmail.com,2024-10-17,female,North,gold
7,C00008,amandasanchez@gray-mayo.net,2024-09-29,male,Central,gold
8,C00009,perezantonio@yahoo.com,2024-12-08,female,South,silver
9,C00010,clarksherri@hotmail.com,2024-11-05,male,East,gold


# Loading The Data

In [34]:
dwh_conn = pyodbc.connect(
            "DRIVER={ODBC Driver 17 for SQL Server};"
            "SERVER=localhost;"
            "DATABASE=retail_dwh;"
            "Trusted_Connection=yes;"
        )

In [35]:
dwh_cursor = dwh_conn.cursor()

In [36]:
customer_count = 0

In [38]:
for index, row in data.iterrows():
    dwh_cursor.execute("""
        INSERT INTO Dim_Customer (customer_id, email, gender, region, loyalty_tier, signup_date)
        VALUES (?, ?, ?, ?, ?, ?)
    """,
    row['customer_id'],
    row['email'],
    row['gender'],
    row['region'],
    row['loyalty_tier'],
    row['signup_date'])
    customer_count += 1

In [39]:
dwh_conn.commit()

In [40]:
dwh_cursor.close()
dwh_conn.close()