In [7]:
# Libraries.
try:
  import polars as pl
except:
  !pip install polars
  import polars as pl

In [8]:
# Read the pipe-delimited user table from the URL below into `users`.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
users = pl.read_csv(url, separator="|")

# Show the first five rows as a quick look at the data.
users.head(5)

user_id,age,gender,occupation,zip_code
i64,i64,str,str,str
1,24,"""M""","""technician""","""85711"""
2,53,"""F""","""other""","""94043"""
3,23,"""M""","""writer""","""32067"""
4,24,"""M""","""technician""","""43537"""
5,33,"""F""","""other""","""15213"""


In [9]:
# Average age within each occupation (one row per occupation).
(
  users
  .group_by("occupation")
  .agg(pl.col("age").mean())
)

occupation,age
str,f64
"""marketing""",37.615385
"""educator""",42.010526
"""executive""",38.71875
"""entertainment""",29.222222
"""homemaker""",32.571429
…,…
"""retired""",63.071429
"""programmer""",33.121212
"""engineer""",36.38806
"""artist""",31.392857


In [65]:
# Percentage of men within each occupation, sorted from highest to lowest.
# Summing the boolean comparison counts the rows where gender is "M";
# dividing by the group size and scaling by 100 yields the percentage.
(
  users
  .group_by("occupation")
  .agg(male_ratio=(pl.col("gender") == "M").sum() / pl.len() * 100)
  .sort("male_ratio", descending=True)
)

occupation,male_ratio
str,f64
"""doctor""",100.0
"""engineer""",97.014925
"""technician""",96.296296
"""retired""",92.857143
"""programmer""",90.909091
…,…
"""administrator""",54.43038
"""artist""",53.571429
"""librarian""",43.137255
"""healthcare""",31.25


In [67]:
# Youngest and oldest age observed within each occupation.
(
  users
  .group_by("occupation")
  .agg(
    pl.col("age").min().alias("min_age"),
    pl.col("age").max().alias("max_age"),
  )
)

occupation,min_age,max_age
str,i64,i64
"""student""",7,42
"""educator""",23,63
"""other""",13,64
"""scientist""",23,55
"""lawyer""",21,53
…,…,…
"""marketing""",24,55
"""entertainment""",15,50
"""administrator""",21,70
"""executive""",22,69


In [71]:
# Mean age (rounded to 2 decimals) for every occupation/gender pair;
# the result has one row per pair.
(
  users
  .group_by(["occupation", "gender"])
  .agg(pl.col("age").mean().round(2).alias("mean_age"))
)

occupation,gender,mean_age
str,str,f64
"""writer""","""M""",35.35
"""programmer""","""M""",33.22
"""artist""","""M""",32.33
"""engineer""","""F""",29.5
"""technician""","""F""",38.0
…,…,…
"""other""","""M""",34.03
"""other""","""F""",35.47
"""retired""","""M""",62.54
"""writer""","""F""",37.63


In [73]:
# Method 2: one row per occupation, with the mean age of each gender in its own column.
# `Expr.filter` replaces the deprecated `Expr.where`. Women are selected explicitly
# with == "F" rather than != "M", so any other gender value is not counted as female.
(
  users
  .group_by("occupation")
  .agg(
    mean_age_male=pl.col("age").filter(pl.col("gender") == "M").mean().round(2),
    mean_age_female=pl.col("age").filter(pl.col("gender") == "F").mean().round(2),
  )
)

occupation,mean_age_male,mean_age_female
str,f64,f64
"""executive""",38.17,44.0
"""salesman""",38.56,27.0
"""other""",34.03,35.47
"""artist""",32.33,30.31
"""engineer""",36.6,29.5
…,…,…
"""homemaker""",23.0,34.17
"""educator""",43.1,39.12
"""scientist""",36.32,28.33
"""technician""",32.96,38.0


In [79]:
# For each occupation, present the percentage of women and men.
# Each percentage is the count of user_ids of that gender divided by the group size.
# `Expr.filter` replaces the deprecated `Expr.where`. Women are selected explicitly
# with == "F" rather than != "M", so any other gender value is not counted as a woman
# (the two columns then need not sum to 100 if such values exist).
(
  users
  .group_by("occupation")
  .agg(
    percentage_men=(
      pl.col("user_id").filter(pl.col("gender") == "M").count() / pl.len() * 100
    ).round(2),
    percentage_women=(
      pl.col("user_id").filter(pl.col("gender") == "F").count() / pl.len() * 100
    ).round(2),
  )
)

occupation,percentage_men,percentage_women
str,f64,f64
"""none""",55.56,44.44
"""retired""",92.86,7.14
"""doctor""",100.0,0.0
"""marketing""",61.54,38.46
"""technician""",96.3,3.7
…,…,…
"""writer""",57.78,42.22
"""homemaker""",14.29,85.71
"""other""",65.71,34.29
"""healthcare""",31.25,68.75
