# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

spark = SparkSession.builder.appName("Occupation").getOrCreate()

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
from pyspark import SparkFiles
spark.sparkContext.addFile(url)

users = spark.read.csv(SparkFiles.get("u.user"), sep='|', header=True, inferSchema= True)

In [3]:
users.take(5)

[Row(user_id=1, age=24, gender='M', occupation='technician', zip_code='85711'),
 Row(user_id=2, age=53, gender='F', occupation='other', zip_code='94043'),
 Row(user_id=3, age=23, gender='M', occupation='writer', zip_code='32067'),
 Row(user_id=4, age=24, gender='M', occupation='technician', zip_code='43537'),
 Row(user_id=5, age=33, gender='F', occupation='other', zip_code='15213')]

### Step 4. Discover what is the mean age per occupation

In [6]:
users.groupBy(F.col('occupation')).agg({'age': 'mean'}).head(10)

[Row(occupation='librarian', avg(age)=40.0),
 Row(occupation='retired', avg(age)=63.07142857142857),
 Row(occupation='lawyer', avg(age)=36.75),
 Row(occupation='none', avg(age)=26.555555555555557),
 Row(occupation='writer', avg(age)=36.31111111111111),
 Row(occupation='programmer', avg(age)=33.121212121212125),
 Row(occupation='marketing', avg(age)=37.61538461538461),
 Row(occupation='other', avg(age)=34.523809523809526),
 Row(occupation='executive', avg(age)=38.71875),
 Row(occupation='scientist', avg(age)=35.54838709677419)]

### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [30]:
a = users.where(F.col("gender") == "M").groupby('occupation').count()
b = users.groupby('occupation').count()

In [60]:
male_percent = a.join(b, a.occupation == b.occupation).select(
    a["occupation"],
    a["count"].alias('males'),
    b["count"].alias('total'),
    (a["count"] / b["count"] * 100).alias('ratio') 
).orderBy('ratio', ascending=False)

In [61]:
male_percent.show()

+-------------+-----+-----+-----------------+
|   occupation|males|total|            ratio|
+-------------+-----+-----+-----------------+
|       doctor|    7|    7|            100.0|
|     engineer|   65|   67|97.01492537313433|
|   technician|   26|   27|96.29629629629629|
|      retired|   13|   14|92.85714285714286|
|   programmer|   60|   66| 90.9090909090909|
|    executive|   29|   32|           90.625|
|    scientist|   28|   31|90.32258064516128|
|entertainment|   16|   18|88.88888888888889|
|       lawyer|   10|   12|83.33333333333334|
|     salesman|    9|   12|             75.0|
|     educator|   69|   95|72.63157894736842|
|      student|  136|  196|69.38775510204081|
|        other|   69|  105|65.71428571428571|
|    marketing|   16|   26|61.53846153846154|
|       writer|   26|   45|57.77777777777777|
|         none|    5|    9|55.55555555555556|
|administrator|   43|   79|54.43037974683544|
|       artist|   15|   28|53.57142857142857|
|    librarian|   22|   51|43.1372

### Step 6. For each occupation, calculate the minimum and maximum ages

In [55]:
users.select('occupation', 'age').groupby('occupation').agg(
        F.min(F.col('age')).alias('min_age'),
        F.max(F.col('age')).alias('max_age')
).show(10)

+----------+-------+-------+
|occupation|min_age|max_age|
+----------+-------+-------+
| librarian|     23|     69|
|   retired|     51|     73|
|    lawyer|     21|     53|
|      none|     11|     55|
|    writer|     18|     60|
|programmer|     20|     63|
| marketing|     24|     55|
|     other|     13|     64|
| executive|     22|     69|
| scientist|     23|     55|
+----------+-------+-------+
only showing top 10 rows



### Step 7. For each combination of occupation and gender, calculate the mean age

In [57]:
users.groupby('occupation', 'gender').agg(
        F.mean(F.col('age')).alias('mean_age'),
).orderBy('occupation').show(10)

+-------------+------+------------------+
|   occupation|gender|          mean_age|
+-------------+------+------------------+
|administrator|     M| 37.16279069767442|
|administrator|     F|40.638888888888886|
|       artist|     F|30.307692307692307|
|       artist|     M|32.333333333333336|
|       doctor|     M| 43.57142857142857|
|     educator|     M| 43.10144927536232|
|     educator|     F| 39.11538461538461|
|     engineer|     F|              29.5|
|     engineer|     M|              36.6|
|entertainment|     F|              31.0|
+-------------+------+------------------+
only showing top 10 rows



### Step 8.  For each occupation present the percentage of women and men

In [71]:
male_percent.select(
    'occupation',
    F.col('ratio').alias('male'),
    (100 - male_percent["ratio"]).alias('female') 
).show()

+-------------+-----------------+------------------+
|   occupation|             male|            female|
+-------------+-----------------+------------------+
|       doctor|            100.0|               0.0|
|     engineer|97.01492537313433| 2.985074626865668|
|   technician|96.29629629629629|3.7037037037037095|
|      retired|92.85714285714286| 7.142857142857139|
|   programmer| 90.9090909090909| 9.090909090909093|
|    executive|           90.625|             9.375|
|    scientist|90.32258064516128| 9.677419354838719|
|entertainment|88.88888888888889|11.111111111111114|
|       lawyer|83.33333333333334|16.666666666666657|
|     salesman|             75.0|              25.0|
|     educator|72.63157894736842|27.368421052631575|
|      student|69.38775510204081|30.612244897959187|
|        other|65.71428571428571| 34.28571428571429|
|    marketing|61.53846153846154| 38.46153846153846|
|       writer|57.77777777777777| 42.22222222222223|
|         none|55.55555555555556| 44.444444444