In [0]:
import numpy as np
import pandas as pd
import json
import os



In [0]:
# Show the kernel's current working directory.
os.getcwd()

'c:\\Users\\HALID\\Documents\\ITE Assignment\\ITE-data-sciene-assignment\\code'

### Visitor profile analysis 

In [0]:
def process_visitor_data(visitors: pd.DataFrame, visitors_questions: pd.DataFrame, visitors_answers: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Flatten visitor questionnaire payloads into pivoted tables for ML processing.

    The ``data`` column of ``visitors`` holds one JSON payload per visitor. Each
    payload is decoded, exploded to one row per entry, and expanded into columns
    with ``pd.json_normalize``. The code reads the ``questionId``, ``answerId``,
    ``answerTypeId`` and ``answerValue`` fields of the expanded entries.

    Parameters:
    visitors (pd.DataFrame): Must provide 'id', 'email', 'gender' and 'data'.
        The frame is NOT modified; ``data`` may hold JSON text or payloads that
        were already decoded, so the function can safely be re-run.
    visitors_questions (pd.DataFrame): Question lookup; must provide 'id',
        'question', 'stepId' and 'questionTypeId'.
    visitors_answers (pd.DataFrame): Answer lookup; must provide 'id', 'answer'
        and 'questionId'.

    Returns:
    tuple: ``(visitor_pivot, visitor_answer_count, visitors_final_df)`` where
        - visitor_pivot has one row per visitor id with the selected question
          columns (multiple answers joined with ',') plus email, gender,
          Country and Region;
        - visitor_answer_count holds the number of answers per visitor id and
          question (0 where a question was not answered);
        - visitors_final_df is the long, merged frame both tables are built from.

    Raises:
    KeyError: If a required column is missing or one of the selected questions
        does not occur in the data.
    """
    def _decode_payload(payload):
        # Only raw JSON text is decoded; anything else is assumed to be decoded already.
        if isinstance(payload, (str, bytes, bytearray)):
            return json.loads(payload)
        return payload

    # Derive a new frame instead of assigning into ``visitors``: the caller's
    # DataFrame stays untouched and a second call does not fail on decoded data.
    visitors_parsed = visitors.assign(data=visitors['data'].apply(_decode_payload))

    # One row per payload entry. Visitors whose payload is empty explode to a
    # missing value, which json_normalize cannot handle; they would be removed by
    # the inner merge on answers below anyway, so drop them here.
    visitors_exploded = (
        visitors_parsed.explode('data')
        .dropna(subset=['data'])
        .reset_index(drop=True)
    )
    visitors_normalized = pd.json_normalize(visitors_exploded['data'].tolist())
    visitors_final = visitors_exploded.drop(columns='data').join(visitors_normalized)

    # Country/Region arrive as payload entries of their own: keep the value on
    # the matching rows only, then spread it over every row of the same visitor.
    for answer_type in ('Country', 'Region'):
        tagged_values = visitors_final['answerValue'].where(visitors_final['answerTypeId'] == answer_type)
        visitors_final[answer_type] = tagged_values.groupby(visitors_final['id']).transform(
            lambda values: values.ffill().bfill()
        )

    visitors_final_df = visitors_final.merge(
        visitors_questions.add_suffix("_questions"),
        left_on='questionId',
        right_on="id_questions",
        how='left'
    ).merge(
        # Inner join: entries without a known answer (e.g. the Country/Region
        # entries when their answerId is not in the lookup) are discarded here.
        visitors_answers.add_suffix("_answers"),
        left_on='answerId',
        right_on="id_answers",
        how='inner'
    ).drop(
        ['questionId_answers', 'id_answers', 'stepId_questions', 'questionId',
         'id_questions', 'questionTypeId_questions'],
        axis=1
    )

    visitors_final_base = visitors_final_df[['email', 'gender', 'id', 'Country', 'Region']].drop_duplicates().reset_index(drop=True)

    visitor_pivot = visitors_final_df.pivot_table(
        index=['id'],
        columns='question_questions',
        values='answer_answers',
        # Skip missing answers and stringify the rest so join never raises.
        aggfunc=lambda answers: ','.join(answers.dropna().astype(str))
    ).reset_index().merge(
        visitors_final_base,
        on='id',
        how='left'
    )

    selected_columns = ['id', 'email', 'gender', 'Country', 'Region',
                        "Please indicate your company's main area of business",
                        "Reason for Attending the Event",
                        "What is your company's annual purchasing budget?",
                        "What role do you play in the purchasing decision making process?",
                        "Which of the following best describes your job function?"]
    visitor_pivot = visitor_pivot[selected_columns]

    visitor_answer_count = visitors_final_df.groupby(['id', 'question_questions'])['answer_answers'].count().unstack(fill_value=0).reset_index()

    return visitor_pivot, visitor_answer_count, visitors_final_df


### Exhibitor category preprocessing

In [0]:
import pandas as pd

def preprocess_exhibitor_categories(exhibitors: pd.DataFrame, exhibitor_categories: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Processes exhibitor data by splitting the '|'-separated main categories,
    exploding them to one row per category, converting the ids to numbers,
    merging with the category names, and aggregating per exhibitor.

    Exhibitors with a missing 'MainCategories' value, blank tokens (e.g. "1||2"
    or a trailing '|') and category ids that are absent from
    ``exhibitor_categories`` no longer raise: they simply contribute no category
    name, so such an exhibitor is kept with an empty 'categoryName' string.
    Tokens that are not numeric at all still raise ValueError.

    Parameters:
    exhibitors (pd.DataFrame): DataFrame containing exhibitor data with
        'exhibitorid', 'Name' and a string column 'MainCategories'.
    exhibitor_categories (pd.DataFrame): DataFrame mapping 'categoryId' to 'categoryName'.

    Returns:
    tuple: ``(exhibitor_agg, exhibitors_final)`` where exhibitor_agg has one row
        per ('exhibitorid', 'Name') with the comma-joined category names (digits
        and dots stripped), and exhibitors_final is the long frame with one row
        per exhibitor/category pair.

    Raises:
    ValueError: If a non-blank category token cannot be parsed as a number.
    """
    # One row per (exhibitor, category token). explode repeats index labels, so
    # reset the index before assigning the converted ids back.
    exhibitors_splitted = exhibitors.assign(
        MainCategories=exhibitors['MainCategories'].str.split('|')
    ).explode('MainCategories').reset_index(drop=True)

    # Blank tokens become missing values instead of crashing the conversion;
    # to_numeric (default errors='raise') still rejects genuine garbage.
    category_tokens = exhibitors_splitted['MainCategories'].str.strip()
    category_tokens = category_tokens.mask(category_tokens.eq(''))
    exhibitors_splitted['MainCategories'] = pd.to_numeric(category_tokens)

    # Merge with category names; unmatched or missing ids yield NaN names.
    exhibitors_final = exhibitors_splitted.merge(
        exhibitor_categories,
        left_on='MainCategories',
        right_on='categoryId',
        how='left'
    ).drop(['MainCategories', 'categoryId'], axis=1)

    # Aggregate category names per exhibitor, skipping the NaN names that the
    # left merge produces (str.join would raise TypeError on them).
    exhibitor_agg = exhibitors_final.groupby(['exhibitorid', 'Name']).agg(
        {'categoryName': lambda names: ','.join(names.dropna())}
    ).reset_index()
    # Strip digits and dots from the joined category names.
    exhibitor_agg['categoryName'] = exhibitor_agg['categoryName'].str.replace(r'[0-9.]', '', regex=True)

    return exhibitor_agg, exhibitors_final
