In [2]:
import pandas as pd
import numpy as np  
# Load the cleaned dataset into the notebook-level frame.
# NOTE(review): the file carries an 'Unnamed: 0' column that looks like a
# previously saved index -- consider index_col=0 if it is not needed as data.
df = pd.read_csv("cleaned.csv")

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Unnamed: 0                        182 non-null    int64 
 1   gender                            181 non-null    object
 2   age_group                         182 non-null    object
 3   familiarity_with_herbal_essences  182 non-null    object
 4   respondent_location               182 non-null    object
 5   hair_care_routine_story           182 non-null    object
 6   ideal_hair_care_product_message   182 non-null    object
 7   celeb_hair_care_philosophy        182 non-null    object
 8   current_natural_shampoo_user      182 non-null    bool  
 9   motivation_to_try_herbal_essence  182 non-null    object
 10  price_sensitivity                 182 non-null    object
dtypes: bool(1), int64(1), object(9)
memory usage: 14.5+ KB


In [4]:
# Print the distinct values of every column to eyeball categories and gaps.
for col_name in df.columns:
    print(f"Unique values in column '{col_name}': {df[col_name].unique()}")

Unique values in column 'Unnamed: 0': [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181]
Unique values in column 'gender': ['MALE' 'FEMALE' 'Prefer not to say' nan]
Unique values in column 'age_group': ['18-25' '25-40' '40 +']
Unique values in column 'familiarity_with_herbal_essences': ['Somewhat familiar' 'Ne

In [26]:

def targeted_persona_assignment(df, threshold, persona_definitions=None):
    """
    Assign each row its best-matching persona, or 'Uncategorized'.

    A persona is a set of weighted criteria. A row earns a criterion's weight
    when its value in that column matches; the weights are summed per persona
    and the highest-scoring persona wins, provided its score is at least
    ``threshold``. Ties go to the persona defined first.

    Scores are computed in a separate array, so no temporary ``*_score``
    columns are ever written to (or dropped from) ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named by the persona criteria. A
        'persona' column is added to (or overwritten on) this frame in place.
    threshold : int or float
        Minimum winning score, inclusive (``score >= threshold``). Rows whose
        best score is lower are labelled 'Uncategorized'.
    persona_definitions : dict, optional
        ``{persona_name: {column: (match_values, weight)}}``. ``match_values``
        is either a list of accepted values or a single value compared with
        ``==``. Defaults to the three built-in personas below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'persona' column.

    Raises
    ------
    KeyError
        If a criterion names a column that is missing from ``df``.
    """
    if persona_definitions is None:
        # Built-in personas with their weighting (age is not used as a criterion).
        persona_definitions = {
            'Urban Youth': {
                'hair_care_routine_story': (['Time-Pressed', 'Exploration-Driven'], 5),
                'ideal_hair_care_product_message': (['Personalization-Focused', 'Image-Driven'], 4),
                'price_sensitivity': (['Mid-Range Value Seekers'], 3),
                'familiarity_with_herbal_essences': (['Somewhat familiar'], 3),
                'current_natural_shampoo_user': ([False], 2),
            },
            # NOTE(review): this label ends with a space, which ends up in the
            # 'persona' column. Kept as-is because other code may compare
            # against the exact string -- confirm and strip if unintended.
            'Budget-Conscious ': {
                'hair_care_routine_story': (['Cost-Aware', 'Time-Pressed'], 4),
                'price_sensitivity': (['Budget-Conscious Consumers'], 5),
                'ideal_hair_care_product_message': (['Naturally Healthy'], 3),
                'current_natural_shampoo_user': ([False], 1),
                'familiarity_with_herbal_essences': (['Somewhat familiar'], 2),
            },
            'Wellness and Quality Seeker': {
                'hair_care_routine_story': (['Nature inspired', 'Relaxation-Seeking'], 5),
                'ideal_hair_care_product_message': (['Naturally Healthy', 'Gentle-Caring'], 4),
                'current_natural_shampoo_user': ([True], 5),
                'price_sensitivity': (['Quality-Oriented Buyers'], 2),
                'familiarity_with_herbal_essences': (['Very familiar'], 5),
            },
        }

    # Fail early with one readable message instead of a per-row lookup error.
    missing = []
    for criteria in persona_definitions.values():
        for column in criteria:
            if column not in df.columns and column not in missing:
                missing.append(column)
    if missing:
        raise KeyError(f"persona criteria reference columns missing from df: {missing}")

    def score_persona(frame, persona_criteria):
        """Return one score per row of ``frame`` for a single persona."""
        totals = np.zeros(len(frame), dtype=int)
        for criterion, (match_values, weight) in persona_criteria.items():
            values = frame[criterion]
            if isinstance(match_values, list):
                # List criterion: any of the listed values counts as a match.
                matched = values.isin(match_values)
            else:
                # Single-value criterion: plain equality.
                matched = values == match_values
            # Not in-place, so a float weight can upcast the running total.
            totals = totals + matched.to_numpy(dtype=bool) * weight
        return totals

    persona_names = list(persona_definitions)
    if not persona_names:
        # Nothing to score against: every row stays uncategorized.
        df['persona'] = 'Uncategorized'
        return df

    # One column of scores per persona, in definition order.
    score_matrix = np.column_stack(
        [score_persona(df, persona_definitions[name]) for name in persona_names]
    )

    # argmax returns the first maximum, so ties go to the earliest persona.
    best_position = score_matrix.argmax(axis=1)
    best_score = score_matrix.max(axis=1)
    best_label = np.array(persona_names, dtype=object)[best_position]

    # Assign the winner only when its score reaches the (inclusive) threshold.
    df['persona'] = np.where(best_score >= threshold, best_label, 'Uncategorized')

    return df

def analyze_persona_characteristics(df):
    """
    Summarize every persona found in ``df['persona']``.

    For each distinct persona label (in order of first appearance) the result
    holds the number of rows, their share of the whole frame in percent, and
    the top five normalized value counts for four descriptive columns.

    Returns a dict of the form
    ``{persona: {'count', 'percentage', 'key_characteristics': {...}}}``.
    """
    # Report key -> source column whose distribution is summarized.
    tracked_columns = {
        'hair_care_routine': 'hair_care_routine_story',
        'price_sensitivity': 'price_sensitivity',
        'product_familiarity': 'familiarity_with_herbal_essences',
        'current_natural_user': 'current_natural_shampoo_user',
    }

    total_rows = len(df)
    summary = {}

    for label in df['persona'].unique():
        members = df[df['persona'] == label]
        member_count = len(members)

        summary[label] = {
            'count': member_count,
            'percentage': member_count / total_rows * 100,
            'key_characteristics': {
                report_key: members[source_column].value_counts(normalize=True).head()
                for report_key, source_column in tracked_columns.items()
            },
        }

    return summary

def main(input_path='cleaned.csv',
         output_path='cleaned_with_targeted_personas.csv',
         threshold=10):
    """
    Run the persona pipeline end to end: load, assign, report, save.

    Parameters
    ----------
    input_path : str
        CSV file to read. Defaults to 'cleaned.csv'.
    output_path : str
        CSV file the frame with the added 'persona' column is written to
        (without the index). Defaults to 'cleaned_with_targeted_personas.csv'.
    threshold : int or float
        Minimum score passed to ``targeted_persona_assignment``. Defaults to 10.
    """
    # Load the dataset
    df = pd.read_csv(input_path)

    # Apply targeted persona assignment
    df = targeted_persona_assignment(df, threshold=threshold)

    # Share of rows per persona
    persona_distribution = df['persona'].value_counts(normalize=True)
    print("Persona Distribution:")
    print(persona_distribution)

    # Per-persona breakdown of the descriptive columns
    persona_characteristics = analyze_persona_characteristics(df)
    print("\nPersona Characteristics:")
    for persona, details in persona_characteristics.items():
        print(f"\n{persona}:")
        print(f"Total Count: {details['count']} ({details['percentage']:.2f}%)")
        print("Key Characteristics:")
        for category, breakdown in details['key_characteristics'].items():
            print(f"  {category}:")
            for value, proportion in breakdown.items():
                print(f"    {value}: {proportion:.2%}")

    # Save the updated dataset
    df.to_csv(output_path, index=False)
    print(f"\nPersonas assigned and saved to {output_path}")

# Run the pipeline when this code executes as the main module; the output
# captured below this cell shows the guard passes when the cell is run.
if __name__ == "__main__":
    main()


Persona Distribution:
persona
Budget-Conscious               0.307692
Urban Youth                    0.252747
Wellness and Quality Seeker    0.247253
Uncategorized                  0.192308
Name: proportion, dtype: float64

Persona Characteristics:

Uncategorized:
Total Count: 35 (19.23%)
Key Characteristics:
  hair_care_routine:
    Nature inspired: 28.57%
    Cost-Aware: 25.71%
    Exploration-Driven: 20.00%
    Time-Pressed: 14.29%
    Relaxation-Seeking: 11.43%
  price_sensitivity:
    Budget-Conscious Consumers: 37.14%
    Mid-Range Value Seekers: 20.00%
    Quality-Oriented Buyers: 20.00%
    Premium Product Enthusiasts: 14.29%
    Luxury Brand Believers: 8.57%
  product_familiarity:
    Somewhat familiar: 51.43%
    Never heard of it: 40.00%
    Very familiar: 8.57%
  current_natural_user:
    False: 77.14%
    True: 22.86%

Urban Youth:
Total Count: 46 (25.27%)
Key Characteristics:
  hair_care_routine:
    Time-Pressed: 63.04%
    Exploration-Driven: 28.26%
    Nature inspired: