In [88]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import json
import os

def __get_abs_fpaths__(dir):
    for dirpath,_,filenames in os.walk(dir):
        for f in filenames:
            yield os.path.abspath(os.path.join(dirpath, f))

class Base:
  def __init__(self, record_type, columns,data_dir, top_columns=None, hover_columns=None):
    self.__record_type__= record_type
    self.__columns__= columns
    if top_columns is not None:
      self.__top_columns__ = top_columns
    else:
      self.__top_columns__ = columns

    if hover_columns is not None:
      self.__hover_columns__ = hover_columns
    else:
      self.__hover_columns__ = columns

    self.__aggregated_dir__= f'''{data_dir}/aggregated/{record_type}/'''
    self.__top_dir__=f'''{data_dir}/top/{record_type}/'''
    self.__hover_dir__= f'''{data_dir}/map/{record_type}/hover'''
    self.__record_class__ = ['aggregated','top','hover']
    self.__aggregated_result__ = None
    self.__top_result__ = None
    self.__hover_result__ = None

  '''
    Load a JSON data file into a dataframe,given the full path
  '''
  def __load_file__ (f):
    try:
      df = pd.read_json(f)
      df.drop(columns=['success','code'])
      return df
    except:
      raise Exception(f'''File not found: {f}''')

  '''
    Load data
  '''
  def load(self):
    #print( '''load() - Start''')

    # Load aggregated
    result=self.load_aggregated()

    # Load top
    result = self.load_top()

    # Load map hover data
    #result=self.load_hover()

    #print( f'''load_data() {count} files processed - End''')

  '''
    Load aggregate data
  '''
  def load_aggregated(self):
    result = dict()
    for c in self.__columns__:
      result[c]=list()

    for f in __get_abs_fpaths__(self.__aggregated_dir__):
      self.__decode_aggregate__(f,result)

    self.__aggregated_result__ = pd.DataFrame(result, columns= self.__columns__ )

  '''
    Load top data
  '''
  def load_top(self):
    result = dict()
    for c in self.__top_columns__:
      result[c]=list()

    for f in __get_abs_fpaths__(self.__top_dir__):
      self.__decode_top__(f,result)

    self.__top_result__ = pd.DataFrame(result, columns=self.__top_columns__ )
  '''
    Load hover data
  '''
  def load_hover(self):
    result = dict()
    for c in self.__hover_columns__:
      result[c]=list()
    for f in __get_abs_fpaths__(self.__hover_dir__):
      self.__decode_hover__(f,result)

    self.__hover_result__ = pd.DataFrame(result, columns=self.__hover_columns__ )


  '''
    Return a dataframe with aggregated data
  '''
  def aggregated(self):
    return self.__aggregated_result__

  '''
    Return a dataframe with top data
  '''
  def top(self):
    return self.__top_result__

  '''
    Return a dataframe with hover data
  '''
  def hover(self):
    return self.__hover_result__

  '''
    Write the loaded results into CSV files
  '''
  def to_csv(self):
    if self.__aggregated_result__  is not None:
      self.__aggregated_result__.to_csv(f'{self.__record_type__}_agg.csv', sep=',', encoding='utf-8', index=False)

    if self.__top_result__  is not None:
      self.__top_result__.to_csv(f'{self.__record_type__}_top.csv', sep=',', encoding='utf-8',index=False)

    if self.__hover_result__ is not None:
      self.__hover_result__.to_csv(f'{self.__record_type__}_hover.csv', sep=',', encoding='utf-8',index=False)

  '''
    Read CSV files in a given folder and load the dataframes
  '''
  def from_csv(self,folder):
    f=f'''{folder}/{self.__record_type__}_agg.csv'''
    if os.path.exists(f):
      self.__aggregated_result__ = pd.read_csv(f)
      self.__aggregated_result__.geo_name = self.__aggregated_result__.geo_name.astype(str)


    f=f'''{folder}/{self.__record_type__}_top.csv'''
    if os.path.exists(f):
      self.__top_result__ = pd.read_csv(f)
      self.__top_result__.geo_name = self.__top_result__.geo_name.astype(str)

    f=f'''{folder}/{self.__record_type__}_hover.csv'''
    if os.path.exists(f):
      self.__hover_result__ = pd.read_csv(f)
      self.__hover_result__.geo_name = self.__hover_result__.geo_name.astype(str)


  def __lookup_geo_id__(geo_typename,geo):
    geo_type, geo_name, geo_pname = geo_typename.split(',')

    if geo_type == 'STA':
      geo_name=geo_name.replace(' ','-')

    #print(f'''geo_type:*{geo_type}*, geo_name:*{geo_name}*''')
    geo_recs = geo[(geo['geo_type']==geo_type) & (geo['name']==geo_name)].reset_index()
    if geo_recs.shape[0]==1:
      return geo_recs.loc[0,'id']
    elif geo_recs.shape[0] > 1:
      if geo_pname is not None:
        if geo_pname != 'nan':
          precs = geo[geo.name==geo_pname].reset_index()
          if precs.shape[0] > 0 :
            return geo_recs[geo_recs.parent_id == precs.loc[0,'id']].reset_index().loc[0,'id']
        else:
            return geo_recs.sort_values('parent_id', ascending=True).loc[0,'id']
    return -1

  def __combine_geo_typename(geo_type,geo_name,geo_pname):
    if geo_pname is not None:
      return geo_type + ',' + geo_name + ',' + geo_pname
    else:
      return geo_type + ',' + geo_name

  def __preprocess__(self,df,geo):
    df['geo_name']=df['geo_name'].astype(str)
    df['geo_pname']=df['geo_pname'].astype(str)


    # Lookup the geo_id from type and name
    df["geo_typename"] = df.apply(lambda x: Base.__combine_geo_typename(x['geo_type'], x['geo_name'], x['geo_pname']), axis=1)
    df['geo_typename'].astype(str)
    df['geo_id']=df['geo_typename'].apply(Base.__lookup_geo_id__, args=(geo,))

    # Drop unnecessary columns
    df.drop(columns=['geo_type', 'geo_name', 'geo_typename', 'geo_pname'], inplace=True)
    df = df.reset_index().rename(columns={'index':'id'})
    return df

  '''
    Store the loaded records to DB
  '''
  def store(self, con, geo):
    self.__preprocess_and_store__(con, geo)

  '''
    Decode the common fields
  '''
  def __decode_common__(f, result):
    #print('''decode_common() - Start ''')
    state=None
    year=None
    quarter=None
    if 'state' in f:
      state,year,quarter = os.path.normpath(f).split(os.sep)[-3:]
    else:
      year,quarter=os.path.normpath(f).split(os.sep)[-2:]
    quarter=quarter.split('.')[0]
    geo_type='CON'
    geo_name='india'
    geo_pname=None
    if year is not None and quarter is not None:
      if state is not None:
        geo_type='STA'
        geo_name=state
        geo_pname='india'

    #print('''decode_common() - End ''')
    result['year'].append(year)
    result['quarter'].append(quarter)
    result['geo_type'].append(geo_type)
    result['geo_name'].append(geo_name)
    result['geo_pname'].append(geo_pname)


#-------------------------------- End of Base class ----------------------------------

'''
  Class to handle loading and saving of 'transaction' data
'''
class Transaction (Base):
  def __init__(self, data_dir='/content/data'):
    Base.__init__(self,
                  'transaction',
                   ['year','quarter', 'geo_type', 'geo_name', 'geo_pname','category', 'stat_type', 'count', 'amount'],
                  data_dir,
                  top_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname','top_in','stat_type', 'count', 'amount'],
                  hover_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname','hover_over','stat_type', 'count', 'amount'])

  '''
    Load all Transaction stats
  '''
  def __decode_aggregate__(self,f,result):
    #print('Transaction.__decode_aggregate__ - Start')

    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)

    # Seek to the contents of the 'transaction' data
    txn_recs = df.loc['transactionData','data']
    for txn_rec in txn_recs:
      # Collect the common fields from path name
      Base.__decode_common__(f,result)

      # Process record for all category
      for payment_rec in txn_rec['paymentInstruments']:
        result['category'].append(txn_rec["name"])
        result['stat_type'].append(payment_rec["type"])
        result['count'].append(payment_rec["count"])
        result['amount'].append(payment_rec["amount"])

    #print('Transaction.__decode_aggregate__ - End')

  def __decode_top__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)

    # Seek to the contents of the 'transaction' data
    txn_states = df.loc['states','data']
    txn_districts = df.loc['districts','data']
    txn_pincodes = df.loc['pincodes','data']

    if txn_states is not None:
      for txn in txn_states:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='STA'
        result['geo_pname'][-1]='india'
        result['geo_name'][-1]=txn['entityName']
        result['stat_type'].append(txn['metric']["type"])
        result['count'].append(txn['metric']["count"])
        result['amount'].append(txn['metric']["amount"])

    if txn_districts is not None:
      for txn in txn_districts:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='DIS'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=txn['entityName']
        result['stat_type'].append(txn['metric']["type"])
        result['count'].append(txn['metric']["count"])
        result['amount'].append(txn['metric']["amount"])

    if txn_pincodes is not None:
      for txn in txn_pincodes:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='PIN'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=txn['entityName']
        result['stat_type'].append(txn['metric']["type"])
        result['count'].append(txn['metric']["count"])
        result['amount'].append(txn['metric']["amount"])

  def __decode_hover__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)
    #print(df.to_markdown())
    txn_hover_recs = df.loc['hoverDataList','data']
    for txn_hover_rec in txn_hover_recs:
      # Collect the common fields from path name
      Base.__decode_common__(f,result)
      result['hover_over'].append(result['geo_type'][-1])
      result['geo_pname'][-1]=result['geo_name'][-1]

      if result['hover_over'][-1] == 'CON':
        result['geo_type'][-1]='STA'
      else:
        result['geo_type'][-1]='DIS'

      metric=txn_hover_rec['metric'][0]
      result['geo_name'][-1]=txn_hover_rec['name']
      if result['geo_type'][-1] == 'DIS':
        result['geo_name'][-1] = result['geo_name'][-1].split(' district')[0] # Remove  the ' district' at the end of the name
      result['stat_type'].append(metric['type'])
      result['count'].append(metric['count'])
      result['amount'].append(metric['amount'])



  def __preprocess_and_store__(self, conn, geo):
    agg_txns = self.__preprocess__(self.__aggregated_result__, geo=geo)
    top_txns = self.__preprocess__(self.__top_result__, geo=geo)
    hover_txns=self.__preprocess__(self.__hover_result__, geo=geo)
    print(f'''agg_txns(db): {agg_txns.shape}, top_txns(db):{top_txns.shape}, hover_txns(db):{hover_txns.shape}''')

    # Write to 'transaction_*' tables
    agg_txns.to_sql(con=conn, name='transaction_agg',if_exists='append',index=False)
    top_txns.to_sql(con=conn, name='transaction_top',if_exists='append',index=False)
    hover_txns.to_sql(con=conn, name='transaction_hover',if_exists='append',index=False)
    conn.commit()

#---------------------------------------------End of Transaction class --------------------------
'''
  Class to load and save 'User' data
'''
class User(Base):

  def __init__(self, data_dir="/content/data"):
    Base.__init__(
        self,
        'user',
        ['year','quarter', 'geo_type', 'geo_name', 'geo_pname','reg_users', 'app_opens','brand','count','percentage'],
        data_dir,
        top_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname','top_in', 'reg_users'],
        hover_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname','hover_over', 'reg_users', 'app_opens'])

  '''
    Decodes aggregate user data
  '''
  def __decode_aggregate__(self,f,result):
    #print('decode_user - Start')

    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)

    # Seek to the contents of the 'user.aggregated' data
    user_stat_rec = df.loc['aggregated','data']

    # Seek to the contents of the 'user.device' data
    device_recs = df.loc['usersByDevice','data']

    # For each device row populate both aggregate and device fields
    if device_recs is not None:
      for device_rec in device_recs:
        # Collect the common fields from path name
        Base.__decode_common__(f,result)
        result['reg_users'].append(user_stat_rec['registeredUsers'])
        result['app_opens'].append(user_stat_rec['appOpens'])
        result['brand'].append(device_rec['brand'])
        result['count'].append(device_rec['count'])
        result['percentage'].append(device_rec['percentage'])
    else:
      Base.__decode_common__(f,result)
      result['reg_users'].append(user_stat_rec['registeredUsers'])
      result['app_opens'].append(user_stat_rec['appOpens'])
      result['brand'].append('Unknown')
      result['count'].append(0)
      result['percentage'].append(100)

  #print('decode_user - End')

  '''
    Decodes top user data
  '''
  def __decode_top__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)

    # Seek to the contents of the 'transaction' data
    user_states = df.loc['states','data']
    user_districts = df.loc['districts','data']
    user_pincodes = df.loc['pincodes','data']


    if user_states is not None:
      for user in user_states:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='STA'
        result['geo_pname'][-1]='india'
        result['geo_name'][-1]=user['name']
        result['reg_users'].append(user['registeredUsers'])

    if user_districts is not None:
      for user in user_districts:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='DIS'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=user['name']
        result['reg_users'].append(user['registeredUsers'])

    if user_pincodes is not None:
      for user in user_pincodes:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='PIN'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=user['name']
        result['reg_users'].append(user['registeredUsers'])

  def __decode_hover__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)
    #print(df.to_markdown())
    usr_hover_recs = df.loc['hoverData','data']
    for rec_key in usr_hover_recs.keys():
      usr_hover_rec = usr_hover_recs[rec_key]
      # Collect the common fields from path name
      Base.__decode_common__(f,result)
      result['hover_over'].append(result['geo_type'][-1])
      result['geo_pname'][-1]=result['geo_name'][-1]

      if result['hover_over'][-1] == 'CON':
        result['geo_type'][-1]='STA'
      else:
        result['geo_type'][-1]='DIS'

      result['geo_name'][-1]=rec_key
      if result['geo_type'][-1] == 'DIS':
        result['geo_name'][-1] = result['geo_name'][-1].split(' district')[0] # Remove  the ' district' at the end of the name
      result['reg_users'].append(usr_hover_rec['registeredUsers'])
      result['app_opens'].append(usr_hover_rec['appOpens'])

  def __preprocess_and_store__(self, conn, geo):
    agg_users = self.__preprocess__(self.__aggregated_result__, geo=geo)
    top_users = self.__preprocess__(self.__top_result__, geo=geo)
    hover_users = self.__preprocess__(self.__hover_result__, geo=geo)
    print(f'''Read From CSV: agg_users(db): {agg_users.shape}, top_users(db):{top_users.shape}, hover_users(db):{hover_users.shape}''')

    df_user_agg = agg_users[['year','quarter','geo_id','reg_users','app_opens']]
    df_device_agg = agg_users[['year','quarter','geo_id','brand','count','percentage']].reset_index().rename(columns={'index':'id'})
    df_user_agg.drop_duplicates(inplace=True,ignore_index=True)

    df_user_agg = df_user_agg.reset_index().rename(columns={'index':'id'})
    df_user_agg['stat_type']='TOTAL'
    top_users['stat_type']='TOTAL'
    hover_users['stat_type']='TOTAL'


    # Write to 'user_**' tables
    df_user_agg.to_sql(con=conn, name='user_agg',if_exists='append',index=False)
    df_device_agg.to_sql(con=conn, name='device_agg',if_exists='append',index=False)
    top_users.to_sql(con=conn, name='user_top',if_exists='append',index=False)
    hover_users.to_sql(con=conn, name='user_hover',if_exists='append',index=False)
    print(f''' Written to DB: user_agg(db): {df_user_agg.shape}, device_agg(db):{df_device_agg.shape}, top_user(db):{top_users.shape}, hover_users(db):{hover_users.shape}''')
    conn.commit()
#-------------------------------End of User class ---------------------------------------
'''
  Class to load and save 'Insurance' data
'''
class Insurance(Base):

  def __init__(self, data_dir="/content/data"):
    Base.__init__(
        self,
        'insurance',
         ['year','quarter', 'geo_type', 'geo_name', 'geo_pname','category', 'stat_type', 'count', 'amount'],
        data_dir,
        top_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname', 'top_in','stat_type', 'count', 'amount'],
        hover_columns=['year','quarter', 'geo_type', 'geo_name','geo_pname', 'hover_over','stat_type', 'count', 'amount'])

  '''
    Decodes aggregate insurance data
  '''
  def __decode_aggregate__(self,f,result):
     #print('decode_ins - Start')

     # Load JSON contents into a temp dataframe
      df=Base.__load_file__(f)

      # print(df.to_markdown())
      # Seek to the contents of the 'insurance' data
      ins_recs = df.loc['transactionData','data']

      # For each 'insurance' row populate fields
      for ins_rec in ins_recs:
        # Collect the common fields from path name
        Base.__decode_common__(f, result)
        for payment_rec in ins_rec['paymentInstruments']:
          result['category'].append(ins_rec["name"])
          result['stat_type'].append(payment_rec["type"])
          result['count'].append(payment_rec["count"])
          result['amount'].append(payment_rec["amount"])
      #print('decode_ins - End')

  '''
    Decodes top insurance data
  '''
  def __decode_top__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)

    # Seek to the contents of the 'transaction' data
    ins_states = df.loc['states','data']
    ins_districts = df.loc['districts','data']
    ins_pincodes = df.loc['pincodes','data']

    if ins_states is not None:
      for ins in ins_states:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='STA'
        result['geo_pname'][-1]='india'
        result['geo_name'][-1]=ins['entityName']
        result['stat_type'].append(ins['metric']["type"])
        result['count'].append(ins['metric']["count"])
        result['amount'].append(ins['metric']["amount"])

    if ins_districts is not None:
      for ins in ins_districts:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='DIS'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=ins['entityName']
        result['stat_type'].append(ins['metric']["type"])
        result['count'].append(ins['metric']["count"])
        result['amount'].append(ins['metric']["amount"])

    if ins_pincodes is not None:
      for ins in ins_pincodes:
        Base.__decode_common__(f,result)
        result['top_in'].append(result['geo_type'][-1])
        result['geo_type'][-1]='PIN'
        if result['top_in'][-1] == 'STA':
          result['geo_pname'][-1]=result['geo_name'][-1]
        result['geo_name'][-1]=ins['entityName']
        result['stat_type'].append(ins['metric']["type"])
        result['count'].append(ins['metric']["count"])
        result['amount'].append(ins['metric']["amount"])

  def __decode_hover__(self,f,result):
    # Load JSON contents into a temp dataframe
    df=Base.__load_file__(f)
    #print(df.to_markdown())
    txn_hover_recs = df.loc['hoverDataList','data']
    for txn_hover_rec in txn_hover_recs:
      # Collect the common fields from path name
      Base.__decode_common__(f,result)
      result['hover_over'].append(result['geo_type'][-1])
      result['geo_pname'][-1]=result['geo_name'][-1]

      if result['hover_over'][-1] == 'CON':
        result['geo_type'][-1]='STA'
      else:
        result['geo_type'][-1]='DIS'

      metric=txn_hover_rec['metric'][0]
      result['geo_name'][-1]=txn_hover_rec['name']
      if result['geo_type'][-1] == 'DIS':
        result['geo_name'][-1] = result['geo_name'][-1].split(' district')[0] # Remove  the ' district' at the end of the name

      result['stat_type'].append(metric['type'])
      result['count'].append(metric['count'])
      result['amount'].append(metric['amount'])

  def __preprocess_and_store__(self, conn, geo):
    agg_ins = self.__preprocess__(self.__aggregated_result__, geo=geo)
    top_ins = self.__preprocess__(self.__top_result__, geo=geo)
    hover_ins = self.__preprocess__(self.__hover_result__, geo=geo)

    print(f'''agg_ins(db): {agg_ins.shape}, top_ins(db):{top_ins.shape}, , hover_ins(db):{hover_ins.shape}''')

    # Write to 'transaction_*' tables
    agg_ins.to_sql(con=conn, name='insurance_agg',if_exists='append',index=False)
    top_ins.to_sql(con=conn, name='insurance_top',if_exists='append',index=False)
    hover_ins.to_sql(con=conn, name='insurance_hover',if_exists='append',index=False)

    conn.commit()
#------------------------------------------------ End Insurance class -------------------------------

class Geo:
  def __init__(self,data_dir='/content/data'):
    self.__data_dir__= data_dir
    self.__states__=dict()
    self.__districts__=dict()
    self.__pincodes__=dict()
    self.__preprocessed_states__=pd.DataFrame({})
  def index(self):
    for f in __get_abs_fpaths__(self.__data_dir__):
      state=None
      if 'state' in f:
        state= os.path.normpath(f).split(os.sep)[-3]
        # Index the state
        if state not in self.__states__:
          self.__states__[state]='india'

        if 'top' in f:
          # Load JSON contents into a temp dataframe
          df=Base.__load_file__(f)

          # Seek to the contents of the 'transaction' data
          districts = df.loc['districts','data']
          pincodes = df.loc['pincodes','data']

          # Index the districts in the file
          if districts is not None:
            for entry in districts:
              if 'entityName' in entry:
                district = entry['entityName']
              elif 'name' in entry:
                district = entry['name']
              if district not in self.__districts__:
                self.__districts__[district] = state
              else:
                if self.__districts__[district] != state:
                  self.__districts__[district+"|"+state]=state

          # Index the pincodes in the file
          if pincodes is not None:
            for entry in pincodes:
              if 'entityName' in entry:
                pincode = entry['entityName']
              elif 'name' in entry:
                pincode = entry['name']
              if pincode not in self.__pincodes__:
                self.__pincodes__[pincode] = state

        elif 'hover' in f:
          df=Base.__load_file__(f)

          # Seek to the contents of the 'transaction' data
          if 'user' in f:
            recs = df.loc['hoverData','data']
            for rec_key in recs.keys():
              district=rec_key
              district=district.split(' district')[0]
              if district not in self.__districts__:
                self.__districts__[district] = state
              else:
                if self.__districts__[district] != state:
                  self.__districts__[district+"|"+state]=state
          else:
            recs = df.loc['hoverDataList','data']
            for rec in recs:
              district=rec['name']
              district=district.split(' district')[0]
              if district not in self.__districts__:
                self.__districts__[district] = state
              else:
                if self.__districts__[district] != state:
                  self.__districts__[district+"|"+state]=state


  def states(self):
    df = pd.DataFrame.from_dict(self.__states__,orient='index').reset_index().rename(columns={'index':'name', 0:'parent'})
    df.name = df.name.astype(str)
    df.parent = df.parent.astype(str)
    return df

  def districts(self):
    df=pd.DataFrame.from_dict(self.__districts__,orient='index').reset_index().rename(columns={'index':'name', 0:'parent'})
    df.name = df.name.astype(str)
    df.parent = df.parent.astype(str)
    return df

  def pincodes(self):
    df= pd.DataFrame.from_dict(self.__pincodes__,orient='index').reset_index().rename(columns={'index':'name', 0:'parent'})
    df.name = df.name.astype(str)
    df.parent = df.parent.astype(str)
    return df

  def to_csv(self):
    if self.__states__  is not None:
      self.states().to_csv('states.csv', sep=',', encoding='utf-8')

    if self.__districts__  is not None:
      self.districts().to_csv('districts.csv', sep=',', encoding='utf-8')

    if self.__pincodes__  is not None:
      self.pincodes().to_csv('pincodes.csv', sep=',', encoding='utf-8')

  def from_csv(self,folder):
    f=f'''{folder}/states.csv'''
    if os.path.exists(f):
      df=pd.read_csv(f).set_index(keys=['name'],drop=True)
      self.__states__ = df.to_dict(orient='dict', index=True)['parent']

    f=f'''{folder}/districts.csv'''
    if os.path.exists(f):
      df=pd.read_csv(f).set_index(keys=['name'],drop=True)
      self.__districts__ = df.to_dict(orient='dict', index=True)['parent']

    f=f'''{folder}/pincodes.csv'''
    if os.path.exists(f):
      df=pd.read_csv(f,dtype={'name':str,'parent':str}).set_index(keys=['name'],drop=True)
      self.__pincodes__ = df.to_dict(orient='dict', index=True)['parent']

  def lookup_state_id(state_name, states):
    ret=None
    ret = states[states['name']==state_name].reset_index()['id'][0]
    return ret;

  def __preprocess_and_store__(self, conn):
    pincodes=self.pincodes() #.convert_dtypes()
    states = self.states()
    districts = self.districts()

    '''
      Prepare the 'states' DF for DB insertion
    '''
    # Add geo_type column and set the value to 'STA'
    states['geo_type']='STA'

    # Replace the DF column names to match the DB column names
    states=states.reset_index().rename(columns={'index': 'id'} )

    # Assign non-conflicting IDs for DB rows
    states.id=states.id+1

    '''
      Prepare the 'districts' DF for DB insertion
    '''
    # Add geo_type column and set the value to 'DIS'
    districts['geo_type']='DIS'
    #Cleanup state name
    districts['name']=districts['name'].apply(lambda x: x.split('|')[0] )

    # Replace the DF column names to match the DB column names
    districts=districts.reset_index().rename(columns={'index': 'id'})

    # Assign non-conflicting DB ids for the districts
    districts.id=districts.id+len(states.index)+1

    '''
      Prepare the 'pincodes' DF for DB insertion
    '''
    # Add geo_type column and set the value to 'PIN'
    pincodes['geo_type']='PIN'

    # Replace the DF column names to match the DB column names
    pincodes=pincodes.reset_index().rename(columns={'index': 'id'} )

    # Assign non-conflicting DB ids for the districts
    pincodes.id=pincodes.id+len(states.index)+ len(districts.index)+1

    # Impute null values (NOTE: Analyis showed that - th  e only NA value corrsponded to 194108 for parent state ID=18)
    pincodes['name']=pincodes.name.fillna('194108')

    '''
      Replace parent names with IDs assigned in previous steps
    '''
    states['parent_id']=0

    districts['parent_id']=districts['parent'].apply(Geo.lookup_state_id,args=(states,))
    pincodes['parent_id']=pincodes['parent'].apply(Geo.lookup_state_id,args=(states,))

    states.drop(columns=['parent'], inplace=True)
    districts.drop(columns=['parent'], inplace=True)
    pincodes.drop(columns=['parent'], inplace=True)

    # Write to the DB
    country=pd.DataFrame.from_dict({'id':0, 'name':'india', 'geo_type':'CON'}, orient='index').T
    country.to_sql(con=conn, name='geo',if_exists='append',index=False)
    states.to_sql(con=conn, name='geo',if_exists='append',index=False)
    districts.to_sql(con=conn, name='geo',if_exists='append',index=False)
    pincodes.to_sql(con=conn, name='geo',if_exists='append',index=False)

  def store(self,conn):
    self.__preprocess_and_store__(conn)

  def df_from_db(self, conn):
    sql = '''SELECT id,name,geo_type,parent_id FROM geo;'''
    return pd.read_sql_query(sql, conn)




In [89]:
'''
  Configure the Test Environment
'''
from google.colab import drive
drive.mount('/content/drive')

# Set the directory variables
data_dir = "/content/drive/MyDrive/Learn/guvi/labs/Assignments/PhonePePulse/phonepe/pulse/data"
csv_dir='/content/'

from google.colab import auth

# Aunthenticate to Google Colab
auth.authenticate_user()

#Configure Google Cloud
from tools import load_db_cfg
dbcfg=load_db_cfg('pp.json')
!gcloud config set project {dbcfg['project_id']}

'''
  Enable Google SQL Admin API Services
'''
# Enable Cloud SQL Admin API
!gcloud services enable sqladmin.googleapis.com


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Updated property [core/project].


In [None]:
'''
  Test - Index States,Districts and Pincodes
'''
print(f'''Indexing States, Districts and Pincodes from  gdrive-github clone.. This may take several minutes...!!!''')
g=Geo(data_dir)
g.index()
print (f'Geo: states={g.states().shape}, districts={g.districts().shape}, pincodes={g.pincodes().shape}')
g.to_csv()


Indexing States, Districts and Pincodes from  gdrive-github clone.. This may take several minutes...!!!
Geo: states=(36, 2), districts=(733, 2), pincodes=(1115, 2)


In [90]:
'''
  Test - From CSV files load 'Geo' data - States, Districts and Pincodes
'''
geo=Geo()
geo.from_csv('/content')
states=geo.states()
districts=geo.districts()
pincodes=geo.pincodes()
print (f'Geo: states={states.shape}, districts={districts.shape}, pincodes={pincodes.shape}')
print(f'Any Empty Values? states={states.isna().any().any()}, districts={districts.isna().any().any()}, pincodes={pincodes.isna().any().any()}')

Geo: states=(36, 2), districts=(733, 2), pincodes=(1115, 2)
Any Empty Values? states=False, districts=False, pincodes=False


In [91]:
!pip install cloud-sql-python-connector



In [1]:
# Install PyMySQL python module
!pip install pymysql

Collecting pymysql
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.1.1


In [92]:
'''
Connect to DB
'''
from tools import load_db_cfg
from dbconnect import DbConnector
dbcfg=load_db_cfg('pp.json')
pool = DbConnector(dbcfg)
con=pool.connect()

geo=Geo()
geo.from_csv('/content')
geo.store(con)
con.close()

Successfully connected to 'phonepe_pulse' database!! 


ERROR:sqlalchemy.pool.impl.QueuePool:Exception during reset or similar
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sqlalchemy/pool/base.py", line 986, in _finalize_fairy
    fairy._reset(
  File "/usr/local/lib/python3.10/dist-packages/sqlalchemy/pool/base.py", line 1432, in _reset
    pool._dialect.do_rollback(self)
  File "/usr/local/lib/python3.10/dist-packages/sqlalchemy/engine/default.py", line 699, in do_rollback
    dbapi_connection.rollback()
  File "/usr/local/lib/python3.10/dist-packages/pymysql/connections.py", line 493, in rollback
    self._read_ok_packet()
  File "/usr/local/lib/python3.10/dist-packages/pymysql/connections.py", line 453, in _read_ok_packet
    pkt = self._read_packet()
  File "/usr/local/lib/python3.10/dist-packages/pymysql/connections.py", line 744, in _read_packet
    packet_header = self._read_bytes(4)
  File "/usr/local/lib/python3.10/dist-packages/pymysql/connections.py", line 798, in _read_bytes
    raise err.O

In [None]:
'''
  Test - Load 'Transaction' data from gdrive-github clone
  Test - Save in local CSV
'''
print(f'''Loading transactions from remote gdrive-github clone.. This may take several minutes...!!!''')
# Load aggregated, top, hover transactions
t=Transaction(data_dir)
t.load_aggregated()
t.load_top()
t.load_hover()

# Verify the dataframe
print(f'''Transactions: txn_agg={t.aggregated().shape}, txn_top={t.top().shape}, txn_hover={t.hover().shape}''')
print(f'Empty Values: txn_agg={t.aggregated().isna().any().any()}, txn_top={t.top().isna().any().any()}, txn_hover={t.hover().isna().any().any()}')

#Write the loaded transactions to CSVs
t.to_csv()


Loading transactions from remote gdrive-github clone.. This may take several minutes...!!!
Transactions: txn_agg=(4619, 9), txn_top=(17074, 9), txn_hover=(19196, 9)
Empty Values: txn_agg=True, txn_top=True, txn_hover=False


In [93]:
'''
  Test - From CSV files load 'Transaction' data
'''
print(f'''Loading transactions from local CSV folder {'csv_dir'}...!!!''')
t1=Transaction()
t1.from_csv(csv_dir)

print(f'''Transactions (from CSV): txn_agg={t1.aggregated().shape}, txn_top={t1.top().shape}, txn_hover={t1.hover().shape}''')
print(f'Empty Values: txn_agg={t1.aggregated().isna().any().any()}, txn_top={t1.top().isna().any().any()}, txn_hover={t1.hover().isna().any().any()}')


Loading transactions from local CSV folder csv_dir...!!!
Transactions (from CSV): txn_agg=(4619, 9), txn_top=(17074, 9), txn_hover=(19196, 9)
Empty Values: txn_agg=True, txn_top=True, txn_hover=False


In [94]:
'''
  Test - Store 'Transaction' data in DB
'''
from tools import load_db_cfg
from dbconnect import DbConnector
dbcfg=load_db_cfg('pp.json')
pool = DbConnector(dbcfg)
con=pool.connect()

geo=Geo()
geo.from_csv('/content')

t1.store(con,geo.df_from_db(con))

Successfully connected to 'phonepe_pulse' database!! 
agg_txns(db): (4619, 8), top_txns(db):(17074, 8), hover_txns(db):(19196, 8)


In [None]:
'''
  Test - Load 'Insurance' data from gdrive-github clone
  Test - Save in local CSV
'''
print(f'''Loading Insurance from remote gdrive-github clone.. This may take several minutes...!!!''')
# Load aggregated, to, hover transactions
ins=Insurance(data_dir)
ins.load_aggregated()
ins.load_top()
ins.load_hover()

# Verify the dataframe
print(f'''ins_agg={ins.aggregated().shape}, ins_top={ins.top().shape}, ins_hover={ins.hover().shape}''')
#print(f'''Hover Insurance: {ins.hover().shape}''')


#Write the loaded transactions to CSVs
ins.to_csv()

Loading Insurance from remote gdrive-github clone.. This may take several minutes...!!!
ins_agg=(590, 9), ins_top=(10792, 9), ins_hover=(12133, 9)


In [95]:
'''
  Test - From CSV files, load 'Insurance' data
'''
print(f'''Loading insurance data from local CSV folder {csv_dir}...!!!''')
ins1=Insurance(data_dir)
ins1.from_csv(csv_dir)

print(f'''ins_agg(csv)={ins1.aggregated().shape}, ins_top(csv)={ins1.top().shape}, ins_hover(csv)={ins1.hover().shape}''')
#print(f'''Hover Insurance (from CSV): {ins1.hover().shape}''')


Loading insurance data from local CSV folder /content/...!!!
ins_agg(csv)=(590, 9), ins_top(csv)=(10792, 9), ins_hover(csv)=(12133, 9)


In [96]:
'''
  Save insurance data in db
'''
from tools import load_db_cfg
from dbconnect import DbConnector
dbcfg=load_db_cfg('pp.json')
pool = DbConnector(dbcfg)
con=pool.connect()

geo=Geo()
geo.from_csv('/content')

ins1.store(con,geo.df_from_db(con))


Successfully connected to 'phonepe_pulse' database!! 
agg_ins(db): (590, 8), top_ins(db):(10792, 8), , hover_ins(db):(12133, 8)


In [None]:
'''
  Test - Load 'User' data from gdrive-github clone
  Test - Save in local CSV
'''
print(f'''Remote loading of 'User' data from gdrive-github clone.. This may take several minutes...!!!''')
# Load aggregated, to, hover users
usr=User(data_dir)
usr.load_aggregated()
usr.load_top()
usr.load_hover()


# Verify the dataframe
print(f'''users_agg={usr.aggregated().shape}, users_top={usr.top().shape}, , users_hover={usr.hover().shape}''')
#print(f'''Hover Users: {usr.hover().shape}''')


#Write the loaded transactions to CSVs
usr.to_csv()

Remote loading of 'User' data from gdrive-github clone.. This may take several minutes...!!!
users_agg=(7215, 10), users_top=(17075, 7), , users_hover=(19200, 8)


In [97]:
'''
  Test - From CSV files, load 'User' data
'''
print(f'''Loading 'User' data from local CSV folder {csv_dir}...!!!''')
usr1=User(data_dir)
usr1.from_csv(csv_dir)

print(f'''users_agg(csv)={usr1.aggregated().shape}, users_top(csv)={usr1.top().shape}, users_hover(csv)={usr1.hover().shape}''')
#print(f'''Hover Users (from CSV): {usr1.hover().shape}''')

Loading 'User' data from local CSV folder /content/...!!!
users_agg(csv)=(7215, 10), users_top(csv)=(17075, 7), users_hover(csv)=(19200, 8)


In [98]:
'''
  Save user, device info in DB
'''
from tools import load_db_cfg
from dbconnect import DbConnector
dbcfg=load_db_cfg('pp.json')
pool = DbConnector(dbcfg)
con=pool.connect()

geo=Geo()
#geo.from_csv('/content')

usr1.store(con,geo.df_from_db(con))
con.close()

Successfully connected to 'phonepe_pulse' database!! 
Read From CSV: agg_users(db): (7215, 9), top_users(db):(17075, 6), hover_users(db):(19200, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_agg.drop_duplicates(inplace=True,ignore_index=True)


 Written to DB: user_agg(db): (925, 7), device_agg(db):(7215, 7), top_user(db):(17075, 7), hover_users(db):(19200, 8)


In [99]:
'''Geo lookup Test'''

from tools import load_db_cfg
from dbconnect import DbConnector
dbcfg=load_db_cfg('pp.json')
pool = DbConnector(dbcfg)
con=pool.connect()

geo=Geo()


Base.__lookup_geo_id__('DIS,kiphire,nagaland', geo.df_from_db(con))

Successfully connected to 'phonepe_pulse' database!! 


478