In [196]:
import pandas as pd
import random
import string
import numpy as np
import re


In [None]:
# drugs and related diseases
# https://www.kaggle.com/datasets/jithinanievarghese/drugs-side-effects-and-medical-condition
# 1462 unique entries
# one to many relationship between medical_condition and generic_name

df_drugs = pd.read_csv('drugs_side_effects_drugs_com.csv')
df_drugs = df_drugs[['medical_condition', 'generic_name', 'rx_otc']].drop_duplicates()
df_drugs['generic_name'] = df_drugs['generic_name'].str.replace(r'[()]', '', regex=True)


Unnamed: 0,medical_condition,generic_name,rx_otc
76,Acne,Benzoyl Peroxide Cream,OTC
100,Acne,benzoyl peroxide topical,OTC
101,Acne,Benzoyl Peroxide Wash and Cleanser,OTC
112,Acne,benzoyl peroxide topical route,OTC
115,Acne,resorcinol and sulfur topical route,OTC
...,...,...,...
2577,Pain,Trolamine Cream and Lotion,OTC
2710,Psoriasis,,OTC
2711,Psoriasis,Coal Tar and Salicylic Acid,OTC
2772,Rheumatoid Arthritis,"camphor, menthol, and methyl salicylate topical",OTC


In [None]:
# diseases and corresponding symptoms
# https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset
# 4920 unique entries 

df_disease = pd.read_csv('dataset.csv')
symptom_groups = []
diseases = []  # List to store corresponding diseases

for index, row in df_disease.iterrows():
    disease = row['Disease']
    symptoms = []

    # Collect all non-empty symptoms
    for i in range(1, 18):
        symptom = row[f'Symptom_{i}']
        if pd.notna(symptom) and symptom.strip():
            symptoms.append(symptom.strip())
        if i == 17:
            symptom_groups.append(symptoms)
            diseases.append(disease)  # Store the disease name

# Create a DataFrame with the mapping
mapping_df = pd.DataFrame({
    'SymptomGroupId': range(len(symptom_groups)),
    'Disease': diseases
})

# Save to CSV
mapping_df.to_csv('disease_index_mapping.csv', index=False)

First few rows of the mapping:
    SymptomGroupId           Disease
0                0  Fungal infection
1                1  Fungal infection
2                2  Fungal infection
3                3  Fungal infection
4                4  Fungal infection
5                5  Fungal infection
6                6  Fungal infection
7                7  Fungal infection
8                8  Fungal infection
9                9  Fungal infection
10              10           Allergy
11              11           Allergy
12              12           Allergy
13              13           Allergy
14              14           Allergy
15              15           Allergy
16              16           Allergy
17              17           Allergy
18              18           Allergy
19              19           Allergy


In [None]:
# Create lists to store the data
all_symptoms = []
all_indices = []

# Iterate through symptom_groups
for index, symptom_list in enumerate(symptom_groups):
    # For each symptom in the list
    for symptom in symptom_list:
        all_symptoms.append(symptom.strip().replace('_', ' ').replace('  ', ' '))
        all_indices.append(index)

# Create a DataFrame
mapping_df = pd.DataFrame({
    'SymptomName': all_symptoms,
    'SymptomGroupId': all_indices
})

# Save to CSV
mapping_df.to_csv('symptom_to_group_mapping.csv', index=True)

# Print first few rows to verify
print("First few rows of the mapping:")
print(mapping_df.head())

In [183]:
# Helper function to clean symptom strings consistently
def clean_symptom(symptom):
    return symptom.strip().replace('_', ' ').replace('  ', ' ')

# Create KnownSymptoms.csv
unique_symptoms = list(set(map(clean_symptom, all_symptoms)))  # Clean all symptoms
symptoms_df = pd.DataFrame({
    'SymptomIndex': range(len(unique_symptoms)),
    'SymptomName': unique_symptoms
})
symptoms_df.to_csv('KnownSymptoms.csv', index=False)

# Create Diagnosis.csv
diagnosis_df = pd.DataFrame({
    'SymptomGroupId': range(len(symptom_groups)),
    'DiseaseName': diseases
})
diagnosis_df.to_csv('Diagnosis.csv', index=False)

# Create HasSymptom.csv
# Create dictionary mapping symptoms to their indices
symptom_to_index = dict(zip(symptoms_df['SymptomName'], symptoms_df['SymptomIndex']))

# Debug: print the keys in symptom_to_index to check what's available
print("Available symptoms in mapping:")
for symptom in sorted(symptom_to_index.keys()):
    print(f"'{symptom}'")

has_symptom_records = []
for group_id, symptom_list in enumerate(symptom_groups):
    for symptom in symptom_list:
        cleaned_symptom = clean_symptom(symptom)
        # Debug: print problematic symptoms
        if cleaned_symptom not in symptom_to_index:
            print(f"Warning: Could not find '{cleaned_symptom}' in symptom index")
            continue
        has_symptom_records.append({
            'SymptomIndex': symptom_to_index[cleaned_symptom],
            'SymptomGroupId': group_id
        })

has_symptom_df = pd.DataFrame(has_symptom_records)
has_symptom_df.to_csv('HasSymptom.csv', index=False)

# Print some validation information
print("\nNumber of unique symptoms:", len(unique_symptoms))
print("Number of symptom groups:", len(symptom_groups))
print("Number of relationships:", len(has_symptom_records))

Available symptoms in mapping:
'abdominal pain'
'abnormal menstruation'
'acidity'
'acute liver failure'
'altered sensorium'
'anxiety'
'back pain'
'belly pain'
'blackheads'
'bladder discomfort'
'blister'
'blood in sputum'
'bloody stool'
'blurred and distorted vision'
'breathlessness'
'brittle nails'
'bruising'
'burning micturition'
'chest pain'
'chills'
'cold hands and feets'
'coma'
'congestion'
'constipation'
'continuous feel of urine'
'continuous sneezing'
'cough'
'cramps'
'dark urine'
'dehydration'
'depression'
'diarrhoea'
'dischromic patches'
'distention of abdomen'
'dizziness'
'drying and tingling lips'
'enlarged thyroid'
'excessive hunger'
'extra marital contacts'
'family history'
'fast heart rate'
'fatigue'
'fluid overload'
'foul smell of urine'
'headache'
'high fever'
'hip joint pain'
'history of alcohol consumption'
'increased appetite'
'indigestion'
'inflammatory nails'
'internal itching'
'irregular sugar level'
'irritability'
'irritation in anus'
'itching'
'joint pain'
'knee 

In [185]:
# Extend the variety of first and last names
first_names = [
    'Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace', 'Hank', 'Ivy', 'Jack',
    'Karen', 'Liam', 'Mia', 'Nathan', 'Olivia', 'Paul', 'Quincy', 'Rachel', 'Steve', 'Tina',
    'Uma', 'Victor', 'Wendy', 'Xander', 'Yara', 'Zane'
]

last_names = [
    'Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor',
    'Anderson', 'Thomas', 'Jackson', 'White', 'Harris', 'Martin', 'Thompson', 'Garcia', 'Martinez', 'Robinson',
    'Clark', 'Rodriguez', 'Lewis', 'Lee', 'Walker', 'Hall', 'Allen', 'Young', 'King', 'Scott'
]

# Function to generate a random username
def generate_username(first_name, last_name):
    return f"{first_name.lower()}.{last_name.lower()}{random.randint(1, 999)}"

# Generate random data for the CSV
genders = ['Male', 'Female', 'Non-binary']

data = []
usernames = set()

# Generate 1010 rows of data
while len(data) < 1010:
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)
    gender = random.choice(genders)
    age = random.randint(18, 99)

    # Ensure unique usernames
    username = generate_username(first_name, last_name)
    while username in usernames:
        username = generate_username(first_name, last_name)
    usernames.add(username)

    # Append row
    data.append({'Username': username, 'FirstName': first_name, 'LastName': last_name, 'Gender': gender, 'Age': age})

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
file_path = "generated_user_data.csv"
df.to_csv(file_path, index=False)


In [186]:
symptom_data = []
for username in df['Username']:
    symptom_group_id = random.randint(0, 4919)  # Random number between 0 and 4919 inclusive
    symptom_data.append({'Username': username, 'SymptomGroupId': symptom_group_id})

# Convert to DataFrame
df_symptom = pd.DataFrame(symptom_data)

# Save to CSV
symptom_file_path = 'HasDiagnosis.csv'
df_symptom.to_csv(symptom_file_path, index=False)


In [188]:
def validate_usernames(csv1_path, csv2_path):
    """
    Validates that all usernames in csv1 exist in csv2.
    
    Parameters:
    csv1_path (str): Path to first CSV (Username, SymptomGroupId)
    csv2_path (str): Path to second CSV (Username, FirstName, LastName, Gender, Age)
    
    Returns:
    tuple: (bool, list) - (validation result, list of missing usernames)
    """
    # Read the CSVs
    df1 = pd.read_csv(csv1_path)
    df2 = pd.read_csv(csv2_path)
    
    # Get unique usernames from both CSVs
    usernames_csv1 = set(df1['Username'].unique())
    usernames_csv2 = set(df2['Username'].unique())
    
    # Find usernames that are in csv1 but not in csv2
    missing_usernames = usernames_csv1 - usernames_csv2
    
    # Validation result
    is_valid = len(missing_usernames) == 0
    
    # Create detailed report
    report = {
        'total_usernames_csv1': len(usernames_csv1),
        'total_usernames_csv2': len(usernames_csv2),
        'missing_usernames': sorted(list(missing_usernames)),
        'is_valid': is_valid
    }
    
    if not is_valid:
        print(f"❌ Validation failed! Found {len(missing_usernames)} username(s) in csv1 that don't exist in csv2:")
        for username in sorted(missing_usernames):
            print(f"  - {username}")
    else:
        print("✅ Validation passed! All usernames in csv1 exist in csv2")
        
    print(f"\nSummary:")
    print(f"Total unique usernames in csv1: {report['total_usernames_csv1']}")
    print(f"Total unique usernames in csv2: {report['total_usernames_csv2']}")
    
    return is_valid, report

# Example usage:
if __name__ == "__main__":
    csv1_path = "User.csv"
    csv2_path = "HasDiagnosis.csv"
    
    is_valid, report = validate_usernames(csv1_path, csv2_path)

✅ Validation passed! All usernames in csv1 exist in csv2

Summary:
Total unique usernames in csv1: 1010
Total unique usernames in csv2: 1010


In [204]:
import pandas as pd
import numpy as np
import re

def clean_rx_otc(value):
    """Helper function to clean and standardize RX/OTC values"""
    if pd.isna(value):
        return 'RX'  # Default value for missing entries
    
    value = str(value).upper().strip()
    return 'RX' if value not in ['RX', 'OTC'] else value

def clean_string(value):
    """Helper function to clean string values and remove quotes"""
    if pd.isna(value):
        return ''
    
    # Convert to string and clean
    cleaned = str(value).lower().strip()
    
    # Remove both single and double quotes
    cleaned = re.sub(r'["\']', '', cleaned)
    
    # Remove any parenthetical information
    cleaned = re.sub(r'\s*\([^)]*\)', '', cleaned)
    
    # Clean up any extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    
    return cleaned.strip()

def process_medication_data(drugs_file, diagnosis_file, output_file):
    """
    Process drugs CSV file to create a Medication CSV file that maps to disease groups
    
    Parameters:
    drugs_file (str): Path to the drugs_side_effects_drugs_com.csv file
    diagnosis_file (str): Path to the Diagnosis.csv file
    output_file (str): Path where the output Medication.csv will be saved
    """
    
    # Read the input files
    try:
        drugs_df = pd.read_csv(drugs_file)
        diagnosis_df = pd.read_csv(diagnosis_file)
    except FileNotFoundError as e:
        print(f"Error: Could not find input file - {e}")
        return None
    except pd.errors.EmptyDataError:
        print("Error: One of the input files is empty")
        return None
    
    # Print initial data info
    print(f"Loaded {len(drugs_df)} drug entries and {len(diagnosis_df)} diagnosis entries")
    
    # Create a mapping dictionary from disease names to symptom group IDs
    disease_to_group = dict(zip(diagnosis_df['DiseaseName'].str.lower(), 
                               diagnosis_df['SymptomGroupId']))
    
    # Initialize lists to store the processed data
    medications = []
    skipped_count = 0
    
    # Process each drug entry
    for idx, row in drugs_df.iterrows():
        try:
            generic_name = clean_string(row['generic_name'])
            rx_otc = clean_rx_otc(row['rx_otc'])
            medical_condition = clean_string(row['medical_condition'])
            
            # Skip if generic name is missing or empty
            if not generic_name:
                skipped_count += 1
                continue
                
            # Look up the symptom group ID for the medical condition
            symptom_group_id = disease_to_group.get(medical_condition)
            
            # Only add if we found a matching disease in the diagnosis file
            if symptom_group_id is not None:
                medications.append({
                    'MedicationName': generic_name,
                    'Prescription': rx_otc,
                    'SymptomGroupId': symptom_group_id
                })
                
        except Exception as e:
            print(f"Warning: Error processing row {idx}: {e}")
            skipped_count += 1
            continue
    
    if not medications:
        print("Error: No valid medications found to process")
        return None
    
    # Create DataFrame from processed data
    medication_df = pd.DataFrame(medications).drop_duplicates()
    
    # Sort by MedicationName and SymptomGroupId
    medication_df = medication_df.sort_values(['MedicationName', 'SymptomGroupId'])
    
    try:
        # Save to CSV without quotes
        medication_df.to_csv(output_file, index=True, quoting=None, escapechar='\\')
        print(f"\nProcessing Summary:")
        print(f"- Processed {len(medication_df)} unique medication entries")
        print(f"- Skipped {skipped_count} invalid entries")
        print(f"- Output saved to {output_file}")
    except Exception as e:
        print(f"Error saving output file: {e}")
        return medication_df
    
    return medication_df

In [None]:
drugs_file = "drugs_side_effects_drugs_com.csv"
diagnosis_file = "Diagnosis.csv"
output_file = "Medication.csv"

result_df = process_medication_data(drugs_file, diagnosis_file, output_file)


In [209]:
def generate_medical_profiles(medication_file, output_file, num_profiles=100):
    """
    Generate medical profiles with random allergic and current medications
    
    Parameters:
    medication_file (str): Path to the Medication.csv file
    output_file (str): Path where the output MedicalProfile.csv will be saved
    num_profiles (int): Number of profiles to generate
    """
    
    # Read the medications file
    try:
        meds_df = pd.read_csv(medication_file)
        medication_list = meds_df['MedicationName'].unique().tolist()
    except FileNotFoundError:
        print(f"Error: Could not find medication file")
        return None
    except Exception as e:
        print(f"Error reading medication file: {e}")
        return None
        
    print(f"Loaded {len(medication_list)} unique medications")
    
    # Initialize lists for profile data
    profiles = []
    
    for i in range(0, num_profiles + 1):
        profile = {'ProfileIndex': i}
        
        # Randomly decide if we'll include allergic medication (70% chance of having an allergy)
        if random.random() < 0.7:
            profile['AllergicMedication'] = random.choice(medication_list)
        else:
            profile['AllergicMedication'] = ''
            
        # Randomly decide if we'll include current medication (80% chance of having current medication)
        if random.random() < 0.8:
            current_med = random.choice(medication_list)
            # Make sure current medication isn't the same as allergic medication
            while current_med == profile.get('AllergicMedication') and profile['AllergicMedication']:
                current_med = random.choice(medication_list)
            profile['CurrentMedication'] = current_med
        else:
            profile['CurrentMedication'] = ''
            
        profiles.append(profile)
    
    # Create DataFrame
    profiles_df = pd.DataFrame(profiles)
    
    try:
        # Save to CSV
        profiles_df.to_csv(output_file, index=False)
        print(f"\nGeneration Summary:")
        print(f"- Generated {len(profiles_df)} medical profiles")
        print(f"- Output saved to {output_file}")
        
        # Print some statistics
        print(f"\nProfile Statistics:")
        print(f"- Profiles with allergies: {(profiles_df['AllergicMedication'] != '').sum()}")
        print(f"- Profiles with current medications: {(profiles_df['CurrentMedication'] != '').sum()}")
        print(f"- Profiles with both: {((profiles_df['AllergicMedication'] != '') & (profiles_df['CurrentMedication'] != '')).sum()}")
        
    except Exception as e:
        print(f"Error saving output file: {e}")
        return profiles_df
    
    return profiles_df

medication_file = "Medication.csv"
output_file = "MedicalProfile.csv"

# Generate 100 profiles (you can change this number)
profiles_df = generate_medical_profiles(medication_file, output_file, num_profiles=1010)
    

Error reading medication file: 'MedicationName'


In [203]:
def generate_has_profiles(user_file, output_file, min_profiles=1, max_profiles=3):
    """
    Generate HasProfile relationships between users and profile indices
    
    Parameters:
    user_file (str): Path to the User.csv file
    output_file (str): Path where the output HasProfile.csv will be saved
    min_profiles (int): Minimum number of profiles per user
    max_profiles (int): Maximum number of profiles per user
    """
    
    # Read the users file
    try:
        users_df = pd.read_csv(user_file)
        usernames = users_df['Username'].unique().tolist()
    except FileNotFoundError:
        print(f"Error: Could not find user file")
        return None
    except Exception as e:
        print(f"Error reading user file: {e}")
        return None
        
    print(f"Loaded {len(usernames)} unique users")
    
    # Initialize lists for relationship data
    relationships = []
    
    # Generate relationships for each user
    for username in usernames:
        # Randomly decide how many profiles this user will have
        num_profiles = random.randint(min_profiles, max_profiles)
        
        # Generate random unique profile indices for this user
        profile_indices = random.sample(range(0, 1010), num_profiles)  # 0 to 1009 inclusive
        
        # Create relationships for this user
        for profile_index in profile_indices:
            relationships.append({
                'Username': username,
                'ProfileIndex': profile_index
            })
    
    # Create DataFrame
    relationships_df = pd.DataFrame(relationships)
    
    # Sort by Username and ProfileIndex for better readability
    relationships_df = relationships_df.sort_values(['Username', 'ProfileIndex'])
    
    try:
        # Save to CSV
        relationships_df.to_csv(output_file, index=False)
        print(f"\nGeneration Summary:")
        print(f"- Generated {len(relationships_df)} profile relationships")
        print(f"- For {len(usernames)} unique users")
        print(f"- Output saved to {output_file}")
        
        # Print some statistics
        profiles_per_user = relationships_df.groupby('Username').size()
        print(f"\nRelationship Statistics:")
        print(f"- Average profiles per user: {profiles_per_user.mean():.2f}")
        print(f"- Min profiles per user: {profiles_per_user.min()}")
        print(f"- Max profiles per user: {profiles_per_user.max()}")
        
    except Exception as e:
        print(f"Error saving output file: {e}")
        return relationships_df
    
    return relationships_df

user_file = "User.csv"
output_file = "HasProfile.csv"

# Generate relationships with 1-3 profiles per user
relationships_df = generate_has_profiles(user_file, output_file, min_profiles=1, max_profiles=3)

Loaded 1010 unique users

Generation Summary:
- Generated 2030 profile relationships
- For 1010 unique users
- Output saved to HasProfile.csv

Relationship Statistics:
- Average profiles per user: 2.01
- Min profiles per user: 1
- Max profiles per user: 3


In [243]:
import pandas as pd

# Load the data
df = pd.read_csv('/Users/linneamarsh/Desktop/cs411/fa24-cs411-team115-llamas/updated_database/drugs_side_effects_drugs_com.csv')

# Group by 'generic_name' and 'medical_condition' and reset index
condensed = df.groupby(['generic_name', 'medical_condition', 'rx_otc']).first().reset_index()

# Standardize the 'generic_name' to lowercase
condensed['generic_name'] = condensed['generic_name'].str.lower()

# Keep only relevant columns
condensed = condensed[['generic_name', 'medical_condition', 'rx_otc']]

# Save the condensed data to a new CSV file
condensed.to_csv('condensed.csv', index=False)

# Extract unique medical conditions
unique_conditions = condensed['medical_condition'].unique()

# Print unique medical conditions
print(unique_conditions)


['Pain' 'Osteoarthritis' 'Colds & Flu' 'Migraine' 'Anxiety'
 'GERD (Heartburn)' 'Insomnia' 'Angina' 'Hypertension' 'Constipation'
 'Acne' 'Hayfever' 'Covid 19' 'Asthma' 'Osteoporosis' 'Cancer' 'Psoriasis'
 'Allergies' 'Diarrhea' 'Diabetes (Type 2)' 'Bronchitis' 'Eczema'
 'Diabetes (Type 1)' 'Seizures' 'Gastrointestinal' 'UTI' 'Depression'
 'Pneumonia' 'Cholesterol' 'Rheumatoid Arthritis' "Alzheimer's" 'AIDS/HIV'
 'COPD' 'Herpes' 'Gout' 'Erectile Dysfunction' 'Stroke' 'ADHD'
 'Schizophrenia' 'Bipolar Disorder' 'IBD (Bowel)' 'Weight Loss'
 'Incontinence' 'Menopause' 'Hypothyroidism' 'Hair Loss' 'Swine Flu']


In [233]:
df2 = pd.read_csv('dataset.csv')
unique_conditions2 = df2['Disease'].unique()
print(unique_conditions2)

['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


In [238]:
for cond in unique_conditions: 
    for disease in unique_conditions2: 
        if (cond == disease):
            print(cond)

# cond / disease - 'Diabetes (Type 2)'/'Diabetes ', 'Rheumatoid Arthritis'/'Osteoarthristis' 'Arthritis', 'UTI'/'Urinary tract infection', 'Asthma'/'Bronchial Asthma'
# 'Colds & Flu'/'enCommon Cold'

Migraine
Acne
Psoriasis
Pneumonia
Hypothyroidism


In [248]:
import pandas as pd

# Read CSVs
disease_mapping = pd.read_csv('disease_index_mapping.csv')
medications = pd.read_csv('condensed.csv')

# Create disease to group ID mapping
disease_to_group = {}
for _, row in disease_mapping.iterrows():
   disease_to_group[row['Disease']] = row['SymptomGroupId']

# Define condition mappings
condition_mappings = {
   'Diabetes (Type 2)': 'Diabetes ',
   'Rheumatoid Arthritis': 'Arthritis',
   'UTI': 'Urinary tract infection', 
   'Asthma': 'Bronchial Asthma',
   'Colds & Flu': 'Common Cold'
}

# Map medications to group IDs
results = []
for _, row in medications.iterrows():
   med_name = row['generic_name'].replace(',', '')
   condition = row['medical_condition']
   prescription = row['rx_otc']
   
   # Direct match
   if condition in disease_to_group:
       results.append([med_name, prescription, disease_to_group[condition]])
       continue

   # Check mappings
   for med_cond, diseases in condition_mappings.items():
       if condition == med_cond:
           if isinstance(diseases, list):
               for disease in diseases:
                   if disease in disease_to_group:
                       results.append([med_name, prescription, disease_to_group[disease]])
                       break
           elif diseases in disease_to_group:
               results.append([med_name, prescription, disease_to_group[diseases]])

# Create and save output DataFrame            
output_df = pd.DataFrame(results, columns=['MedicationName', 'Prescription', 'SymptomGroupId'])
output_df['Prescription'] = output_df['Prescription'].str.lower()
output_df.to_csv('medication_groups.csv', index=True)

In [249]:
output_df['SymptomGroupId'].unique()

array([4905, 4890, 4916, 4888, 4918, 4886, 4917, 4906, 4914, 4910])