# In [None]:
import pandas as pd
from google.oauth2.credentials import Credentials
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import gspread
import os

def connect_to_sheets(credentials_file):
    """
    Open the Google spreadsheet named by the SPREADSHEET_URL environment variable.

    Args:
        credentials_file: Path to a service-account JSON key file.

    Returns:
        The spreadsheet object returned by ``gspread``'s ``open_by_url``.

    Raises:
        ValueError: If SPREADSHEET_URL is unset or empty.
    """
    # Validate the configuration first so a missing variable produces a clear
    # message instead of an obscure failure inside gspread's URL parsing.
    spreadsheet_url = os.getenv('SPREADSHEET_URL')
    if not spreadsheet_url:
        raise ValueError(
            "The SPREADSHEET_URL environment variable is not set; "
            "cannot determine which spreadsheet to open."
        )

    credentials = service_account.Credentials.from_service_account_file(
        credentials_file,
        scopes=['https://www.googleapis.com/auth/spreadsheets']
    )

    gc = gspread.authorize(credentials)
    return gc.open_by_url(spreadsheet_url)

def read_worksheet_to_df(spreadsheet, worksheet_name):
    """
    Read a worksheet and convert it to a pandas DataFrame.

    The first row of the worksheet is used as the column headers and every
    following row becomes a data row. Cell values are kept exactly as
    returned by ``get_all_values()``.

    Args:
        spreadsheet: Object exposing ``worksheet(name)``, whose result exposes
            ``get_all_values()`` returning a list of rows.
        worksheet_name: Title of the worksheet to read.

    Returns:
        A DataFrame of the worksheet contents. A worksheet with no rows at
        all yields an empty DataFrame with no columns.
    """
    worksheet = spreadsheet.worksheet(worksheet_name)
    all_values = worksheet.get_all_values()

    # A completely empty sheet may come back as [] (no header row at all);
    # treat it like a single empty header row instead of raising IndexError.
    if not all_values:
        all_values = [[]]

    headers, *rows = all_values
    return pd.DataFrame(rows, columns=headers)

def standardize_dataframe(df):
    """
    Return a cleaned copy of ``df``; the input frame is not modified.

    Column names are lowercased, stripped of leading/trailing whitespace,
    and have spaces and hyphens replaced by underscores.

    Every column is cast to string, stripped of leading/trailing whitespace
    and lowercased. If the whole column then parses as float once commas
    are removed, the float version is kept; otherwise the cleaned strings
    are kept.
    """
    frame = df.copy()

    # Normalise the header names in one chained pass.
    frame.columns = (
        frame.columns.str.lower()
        .str.strip()
        .str.replace(' ', '_')
        .str.replace('-', '_')
    )

    for name in frame.columns:
        cleaned = frame[name].astype(str).str.strip().str.lower()
        try:
            # Thousands separators would block the float cast, so drop them.
            converted = cleaned.str.replace(',', '').astype(float)
        except (ValueError, TypeError):
            # Not a purely numeric column: keep the cleaned text.
            converted = cleaned
        frame[name] = converted

    return frame

def standardize_dates(df):
    """
    Parse the ``date`` column and derive month columns on a copy of ``df``.

    Parsing is attempted with '%d %b %Y', then '%d/%m/%y', and finally with
    pandas' mixed-format parser using day-first interpretation.

    The returned frame has:
      * ``date``       - text formatted as '%Y-%m-%d'
      * ``month``      - lowercased abbreviated month name ('%b')
      * ``year_month`` - text formatted as '%Y-%b'
    """
    result = df.copy()
    raw_dates = result['date']

    # Try the strict formats in order; only a ValueError moves on to the next.
    for fmt in ('%d %b %Y', '%d/%m/%y'):
        try:
            parsed = pd.to_datetime(raw_dates, format=fmt)
        except ValueError:
            continue
        break
    else:
        # Neither strict format matched the column: let pandas infer per value.
        parsed = pd.to_datetime(raw_dates, format='mixed', dayfirst=True)

    # Derived columns are added first; 'date' already exists, so it keeps
    # its original position when overwritten with the formatted text.
    result['month'] = parsed.dt.strftime('%b').str.lower()
    result['year_month'] = parsed.dt.strftime('%Y-%b')
    result['date'] = parsed.dt.strftime('%Y-%m-%d')

    return result

def process_sheets_data(stock_inflow_df, release_df):
    """
    Clean both input frames and return them as a (stock_inflow, release) tuple.

    Each frame is passed through ``standardize_dataframe`` and then through
    ``standardize_dates``. Both frames complete the first stage before
    either enters the second.
    """
    # Stage 1: general value/column standardization for both frames.
    cleaned_frames = [
        standardize_dataframe(frame) for frame in (stock_inflow_df, release_df)
    ]

    # Stage 2: date standardization, in the same stock-then-release order.
    stock_result, release_result = [
        standardize_dates(frame) for frame in cleaned_frames
    ]

    return stock_result, release_result

def _dataframe_to_sheet_values(df):
    """Convert ``df`` into JSON-safe rows (header row first) for the Sheets API."""
    import math  # local import keeps this block self-contained

    frame = df.copy()  # never modify the caller's DataFrame

    # Render every datetime column, timezone-naive or timezone-aware, as
    # YYYY-MM-DD text; raw Timestamp objects cannot be JSON-serialized.
    for col in frame.select_dtypes(include=['datetime', 'datetimetz']).columns:
        frame[col] = frame[col].dt.strftime('%Y-%m-%d')

    # Stringify object columns so arbitrary Python objects serialize. Missing
    # values become blank cells instead of the literal text 'nan' / 'None'.
    for col in frame.columns:
        if frame[col].dtype == 'object':
            series = frame[col]
            frame[col] = series.where(series.notna(), '').astype(str)

    def _json_safe(cell):
        # NaN and +/-inf have no valid JSON representation; send a blank cell.
        if isinstance(cell, float) and not math.isfinite(cell):
            return ''
        return cell

    rows = [frame.columns.values.tolist()]  # header row
    rows.extend([_json_safe(cell) for cell in row] for row in frame.values.tolist())
    return rows


def upload_df_to_gsheet(df, spreadsheet_id, sheet_name, credentials_file):
    """
    Upload a pandas DataFrame to a Google Sheets tab, replacing its contents.

    The range ``A1:ZZ`` of the tab is cleared first, then the header row and
    data rows are written starting at ``A1`` with ``valueInputOption='RAW'``.
    Datetime columns are written as YYYY-MM-DD text, and missing or
    non-finite values are written as blank cells.

    Args:
        df: DataFrame to upload; it is not modified.
        spreadsheet_id: ID of the target spreadsheet.
        sheet_name: Name of the tab to overwrite.
        credentials_file: Path to a service-account JSON key file.

    Returns:
        True when the upload succeeded. False when any error occurred; the
        error is printed rather than raised.
    """
    try:
        values = _dataframe_to_sheet_values(df)

        # Load credentials from service account file
        SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
        credentials = service_account.Credentials.from_service_account_file(
            credentials_file,
            scopes=SCOPES
        )

        # Build the Sheets API service
        service = build('sheets', 'v4', credentials=credentials)

        body = {
            'values': values
        }

        # Clear existing content first so stale rows/columns do not survive
        # when the new data is smaller than what was there before.
        service.spreadsheets().values().clear(
            spreadsheetId=spreadsheet_id,
            range=f'{sheet_name}!A1:ZZ'
        ).execute()

        # Update the sheet with new values
        result = service.spreadsheets().values().update(
            spreadsheetId=spreadsheet_id,
            range=f'{sheet_name}!A1',
            valueInputOption='RAW',
            body=body
        ).execute()

        print(f"Updated {result.get('updatedCells')} cells")
        return True

    except HttpError as error:
        print(f"An error occurred: {error}")
        return False
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on the boolean result.
        print(f"An unexpected error occurred: {str(e)}")
        return False

def main():
    """
    Run the full pipeline: read the raw worksheets, clean them, and upload
    the cleaned frames to their target tabs.

    Any failure is printed and then re-raised. A failed upload of either
    dataset is reported as an exception after both uploads were attempted.
    """
    credentials_path = 'credentials.json'

    try:
        workbook = connect_to_sheets(credentials_path)

        # Pull the two raw source tabs.
        raw_inflow = read_worksheet_to_df(workbook, 'stock_inflow_')
        raw_release = read_worksheet_to_df(workbook, 'release')

        clean_inflow, clean_release = process_sheets_data(raw_inflow, raw_release)

        target_id = os.getenv('SPREADSHEET_ID')

        # Attempt both uploads (stock inflow first) before judging the result.
        upload_plan = (
            (clean_inflow, 'stock_inflow_clean'),
            (clean_release, 'release_clean'),
        )
        outcomes = [
            upload_df_to_gsheet(frame, target_id, tab_name, credentials_path)
            for frame, tab_name in upload_plan
        ]

        if not all(outcomes):
            raise Exception("Failed to upload one or both datasets")
        print("Data processing and upload completed successfully!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()