### DMart Data modeling

Notebook models the DMART super market data. The dataset is available from Kaggle at  https://www.kaggle.com/datasets/datatattle/dt-mart-market-mix-modeling

In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
config = configparser.ConfigParser()

In [3]:
config.read('clusterdmart.config')

['clusterdmart.config']

In [4]:
config['POSTGRES']['PG_DB']

'dmartdb'

In [5]:
db = config['POSTGRES']['PG_DB']
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

Using the pandas internal read_sql method to get the data from the DB

In [6]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

In [7]:
credentials

'postgresql://postgres:1234@172.17.0.2:5432/dmartdb'

In [8]:
#using psycopg2 to test connection since there are no tables

import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    print(e)

In [9]:
conn.set_session(autocommit=True)

In [10]:
try:
    cur = conn.cursor()
    
except:
    print(e)

Following steps to start the data modeling

0) Ingest the data into the database under multiple Raw Tables inside the dmartdb

1) Identify the primary key that will be used as reference. Design the schema around the data

2) Design the facttable and dimensions table

3) Create tables and insert data into the tables

4) Finally bring together tables under one facttable and query to check it

In [9]:
firstfile = pd.read_csv('./dmartdata/firstfile.csv')
MonthlyNPSscore = pd.read_csv('./dmartdata/MonthlyNPSscore.csv')
SpecialSale = pd.read_csv('./dmartdata/SpecialSale.csv')
MediaInvestment = pd.read_csv('./dmartdata/MediaInvestment.csv')
Secondfile = pd.read_csv('./dmartdata/Secondfile.csv')
ProductList = pd.read_csv('./dmartdata/ProductList.csv')

In [55]:
Sales = pd.read_csv('./dmartdata/csv_Sales.csv')

In [56]:
Sales_columns = Sales.loc[0]
finalColumns = Sales_columns[:13]

In [64]:
finalColumns = list(finalColumns)

In [66]:
finalColumns.pop(1)

'Date'

In [67]:
finalColumns

['ID',
 'ID_Order',
 'ID_Item_ordered',
 'GMV',
 'Units_sold',
 'SLA',
 'Product_Category',
 'Analytic_Category',
 'Sub_category',
 'product_analytic_vertical',
 'MRP',
 'Procurement_SLA']

The Column 0 and 1 needs to be appended together with ":" 

The Row 0 needs to become the table header

In [68]:
newDate = [];

date = Sales.iloc[:,1]
time = Sales.iloc[:,2]

for i,d in enumerate(date):
    newDate.append(f'{d}:{time[i]}')

In [69]:
Sales.drop(['0', '1'], axis=1,inplace=True)

In [70]:
Sales.drop(0, axis=0,inplace=True)

In [71]:
Sales.columns = finalColumns

In [72]:
Sales.head(5)

Unnamed: 0,ID,ID_Order,ID_Item_ordered,GMV,Units_sold,SLA,Product_Category,Analytic_Category,Sub_category,product_analytic_vertical,MRP,Procurement_SLA
1,ACCCX3S58G7B5F6P,3420000000000000.0,3420000000000000.0,6400,1,5,CE,CameraAccessory,CameraAccessory,CameraTripod,7190,0.0
2,ACCCX3S58G7B5F6P,1420000000000000.0,1420000000000000.0,6900,1,7,CE,CameraAccessory,CameraAccessory,CameraTripod,7190,0.0
3,ACCCX3S5AHMF55FV,2420000000000000.0,2420000000000000.0,1990,1,10,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3.0
4,ACCCX3S5AHMF55FV,4420000000000000.0,4420000000000000.0,1690,1,4,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3.0
5,ACCCX3S5AHMF55FV,4420000000000000.0,4420000000000000.0,1618,1,6,CE,CameraAccessory,CameraAccessory,CameraTripod,2099,3.0


In [74]:
Sales['Date'] = newDate[1:]
Sales.head(1)

Unnamed: 0,ID,ID_Order,ID_Item_ordered,GMV,Units_sold,SLA,Product_Category,Analytic_Category,Sub_category,product_analytic_vertical,MRP,Procurement_SLA,Date
1,ACCCX3S58G7B5F6P,3420000000000000.0,3420000000000000.0,6400,1,5,CE,CameraAccessory,CameraAccessory,CameraTripod,7190,0.0,17-10-2015:15:11


In [None]:
Sales.to_csv('')

#### Writing additional helpers

In [10]:
#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),con=credentials)
    return schema

In [11]:
#Issue is in using pd.read_sql to write data to the database. so using psycopg2
def queryTable(query):
    try:
        schema = cur.execute(query)
        return 
    except Exception as e:
        print(e)
        
#This doesn't return anything

In [12]:
#Using the pd.read_sql for getting data from db
def queryBase(query):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

#This returns the dataframe

In [13]:
def schemaGen(dataframe, schemaName):
    localSchema = pd.io.sql.get_schema(dataframe,schemaName)
    localSchema = localSchema.replace('TEXT','VARCHAR(255)').replace('INTEGER','NUMERIC').replace('\n','').replace('"',"")
    return "".join(localSchema)

In [14]:
#First lets understand the Data shape

print(f'Secondfile:{Secondfile.shape}')
print(f'firstfile:{firstfile.shape}')
print(f'MonthlyNPSscore:{MonthlyNPSscore.shape}')
print(f'Sales:{Sales.shape}')
print(f'SpecialSale:{SpecialSale.shape}')
print(f'ProductList:{ProductList.shape}')
print(f'MediaInvestment:{MediaInvestment.shape}')

Secondfile:(12, 40)
firstfile:(1578079, 10)
MonthlyNPSscore:(12, 2)
Sales:(1019787, 14)
SpecialSale:(44, 2)
ProductList:(75, 3)
MediaInvestment:(12, 12)


### One good thing is the max columns are 40 in number. So the dataset is manageable. Lets get at it.

In [15]:
firstfile.head(1)

Unnamed: 0.1,Unnamed: 0,Date,Sales_name,gmv_new,units,product_mrp,discount,product_category,product_subcategory,product_vertical
0,1,2015-07-01,No Promotion,3040.0,1,3650.0,610.0,EntertainmentSmall,HomeAudio,HomeAudioSpeaker


In [16]:
Secondfile.columns

Index(['Unnamed: 0', 'month', 'Revenue_Camera', 'Revenue_CameraAccessory',
       'Revenue_EntertainmentSmall', 'Revenue_GameCDDVD',
       'Revenue_GamingHardware', 'total_gmv', 'Units_Camera',
       'Units_CameraAccessory', 'Units_EntertainmentSmall', 'Units_GameCDDVD',
       'Units_GamingHardware', 'total_Units', 'Mrp_Camera',
       'Mrp_CameraAccessory', 'Mrp_EntertainmentSmall', 'Mrp_GameCDDVD',
       'Mrp_GamingHardware', 'total_Mrp', 'Discount_Camera',
       'Discount_CameraAccessory', 'Discount_EntertainmentSmall',
       'Discount_GameCDDVD', 'Discount_GamingHardware', 'total_Discount',
       'Year', 'Month', 'Total.Investment', 'TV', 'Digital', 'Sponsorship',
       'Content.Marketing', 'Online.marketing', 'Affiliates', 'SEM', 'Radio',
       'Other', 'Date', 'NPS'],
      dtype='object')

In [24]:
Secondfile.head(1)

Unnamed: 0.1,Unnamed: 0,month,Revenue_Camera,Revenue_CameraAccessory,Revenue_EntertainmentSmall,Revenue_GameCDDVD,Revenue_GamingHardware,total_gmv,Units_Camera,Units_CameraAccessory,...,Digital,Sponsorship,Content.Marketing,Online.marketing,Affiliates,SEM,Radio,Other,Date,NPS
0,1,Jan 2016,186935802.0,26387430.0,109302000.0,16884870.0,47700160.0,387210200.0,10442,22525,...,5000000.0,42000000.0,9000000.0,229000000.0,74000000.0,42000000.0,27000000.0,271000000.0,2016-01-01,47.1


In [28]:
SpecialSale.loc[0]

Date                      7/18/2015
Sales Name    Eid & Rathayatra sale
Name: 0, dtype: object

In [30]:
ProductList.loc[1]

Product      AmplifierReceiver
Frequency                 4056
Percent                    0.2
Name: 1, dtype: object

In [31]:
MediaInvestment.loc[1]

Year                 2015.0
Month                   8.0
Total Investment        5.1
TV                      0.0
Digital                 1.3
Sponsorship             1.1
Content Marketing       0.0
Online marketing        0.1
 Affiliates             0.1
SEM                     2.5
Radio                   NaN
Other                   NaN
Name: 1, dtype: float64