<a href="https://colab.research.google.com/github/jsale017/BA780-A2-Deciphering_Key_Influences_on_Restaurant_Success_within_Uber_Eats/blob/main/Hack_diversity_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hack Diversity Challenge

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the xlsx file
df = pd.read_excel('inclusive_features_usage.xlsx')
print(df)

        user_id        feature_used  session_duration  engagement_score  \
0     user_4737        Multilingual             60.77                 4   
1     user_1492      Text-to-Speech             20.20                 5   
2     user_5936     Colorblind Mode             93.78                 2   
3     user_5519     Colorblind Mode             26.36                 3   
4     user_1853  High Contrast Mode             91.67                 5   
...         ...                 ...               ...               ...   
9995   user_667          Visual Aid            110.11                 1   
9996  user_3554          Visual Aid             22.44                 3   
9997  user_8892        Multilingual             12.91                 4   
9998  user_8014        Multilingual             19.23                 3   
9999  user_7722      Text-to-Speech             92.66                 2   

      accessibility_settings     age_group      gender additional_attributes  
0                   

## Data Cleaning

In [4]:
# Checking the dataframe types and missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 10000 non-null  object 
 1   feature_used            10000 non-null  object 
 2   session_duration        10000 non-null  float64
 3   engagement_score        10000 non-null  int64  
 4   accessibility_settings  10000 non-null  int64  
 5   age_group               10000 non-null  object 
 6   gender                  6003 non-null   object 
 7   additional_attributes   3997 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 625.1+ KB


In [5]:
# Checking for duplicates
df.duplicated().sum()

0

In [6]:
df.isnull().sum()

Unnamed: 0,0
user_id,0
feature_used,0
session_duration,0
engagement_score,0
accessibility_settings,0
age_group,0
gender,3997
additional_attributes,6003


In [7]:
unique_genders = df['gender'].unique()
print(unique_genders)

[nan 'Non-Binary' 'Female' 'Male']


In [8]:
df['gender'].fillna('Not Mentioned', inplace=True)
print(df['gender'].unique())

['Not Mentioned' 'Non-Binary' 'Female' 'Male']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gender'].fillna('Not Mentioned', inplace=True)


In [9]:
df['additional_attributes'].unique()

array(['LGBTQ+', nan, 'Person of Color'], dtype=object)

In [10]:
df['additional_attributes'].fillna('Not Mentioned', inplace=True)
print(df['additional_attributes'].unique())

['LGBTQ+' 'Not Mentioned' 'Person of Color']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['additional_attributes'].fillna('Not Mentioned', inplace=True)


In [11]:
# Count the occurrences of all values in 'session_duration'
session_duration_counts = df['session_duration'].value_counts()
session_duration_counts_df = session_duration_counts.reset_index()
session_duration_counts_df.columns = ['Session Duration', 'Count']
session_duration_counts_df

Unnamed: 0,Session Duration,Count
0,113.58,6
1,119.00,6
2,80.93,6
3,81.37,5
4,77.72,5
...,...,...
6696,77.95,1
6697,57.92,1
6698,61.10,1
6699,6.77,1


In [12]:
# Outliers
Q1_duration = df['session_duration'].quantile(0.25)
Q3_duration = df['session_duration'].quantile(0.75)
IQR_duration = Q3_duration - Q1_duration

lower_bound_duration = Q1_duration - 1.5 * IQR_duration
upper_bound_duration = Q3_duration + 1.5 * IQR_duration

outliers_duration = df[(df['session_duration'] < lower_bound_duration) | (df['session_duration'] > upper_bound_duration)]
outliers_duration

Unnamed: 0,user_id,feature_used,session_duration,engagement_score,accessibility_settings,age_group,gender,additional_attributes


In [13]:
# Count the occurrences of all values in 'Engagement Score'
engagement_score_counts = df['engagement_score'].value_counts()
engagement_score_counts_df = engagement_score_counts.reset_index()
engagement_score_counts_df.columns = ['Engagement Score', 'Count']
engagement_score_counts_df

Unnamed: 0,Engagement Score,Count
0,5,2107
1,4,2037
2,2,1977
3,3,1960
4,1,1919


In [14]:
# Outliers
Q1_engagement = df['engagement_score'].quantile(0.25)
Q3_engagement = df['engagement_score'].quantile(0.75)
IQR_engagement = Q3_engagement - Q1_engagement

lower_bound_engagement = Q1_engagement - 1.5 * IQR_engagement
upper_bound_engagement = Q3_engagement + 1.5 * IQR_engagement

outliers_engagement = df[(df['engagement_score'] < lower_bound_engagement) | (df['engagement_score'] > upper_bound_engagement)]
outliers_engagement

Unnamed: 0,user_id,feature_used,session_duration,engagement_score,accessibility_settings,age_group,gender,additional_attributes


In [15]:
df['high_accessibility_user'] = df['accessibility_settings'].apply(lambda x: 1 if x >= 3 else 0)
print(df[['user_id', 'accessibility_settings', 'high_accessibility_user']].head())
print('')
print('Values counts for High Accessibility Users:')
df['high_accessibility_user'].value_counts()

     user_id  accessibility_settings  high_accessibility_user
0  user_4737                       4                        1
1  user_1492                       4                        1
2  user_5936                       2                        0
3  user_5519                       1                        0
4  user_1853                       1                        0

Values counts for High Accessibility Users:


Unnamed: 0_level_0,count
high_accessibility_user,Unnamed: 1_level_1
0,5035
1,4965


In [16]:
df['intersectional_user'] = df.apply(lambda row: f"{row['age_group']}_{row['gender']} - {row['additional_attributes']}", axis=1)

In [17]:
df['intersectional_user'].value_counts()

Unnamed: 0_level_0,count
intersectional_user,Unnamed: 1_level_1
Adult_Non-Binary - Not Mentioned,431
Adult_Male - Not Mentioned,426
Senior_Male - Not Mentioned,420
Professional_Male - Not Mentioned,417
Young Adult_Female - Not Mentioned,414
Professional_Not Mentioned - Person of Color,411
Senior_Not Mentioned - LGBTQ+,411
Young Adult_Not Mentioned - Person of Color,410
Senior_Not Mentioned - Person of Color,408
Senior_Female - Not Mentioned,406


In [None]:
from google.colab import drive
drive.mount('/content/drive')