# Ex3 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Reuse the active Spark session if one exists; otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("Occupation")
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users and use the 'user_id' as index (Spark DataFrames have no index, so `user_id` is kept as a regular column)

In [2]:
# Source of the pipe-delimited user file (it has a header row).
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'

# addFile downloads the file for this Spark job; SparkFiles.get then resolves
# the local path of that download by its file name.
# (SparkFiles is imported with the other imports at the top of the notebook.)
spark.sparkContext.addFile(url)

# inferSchema lets Spark derive column types from the data instead of
# reading every column as a string.
users = spark.read.csv(
    SparkFiles.get("u.user"),
    sep='|',
    header=True,
    inferSchema=True,
)

### Step 4. See the first 25 entries

In [6]:
# Fetch the first 25 rows to the driver as a list of Row objects.
users.take(25)

[Row(user_id=1, age=24, gender='M', occupation='technician', zip_code='85711'),
 Row(user_id=2, age=53, gender='F', occupation='other', zip_code='94043'),
 Row(user_id=3, age=23, gender='M', occupation='writer', zip_code='32067'),
 Row(user_id=4, age=24, gender='M', occupation='technician', zip_code='43537'),
 Row(user_id=5, age=33, gender='F', occupation='other', zip_code='15213'),
 Row(user_id=6, age=42, gender='M', occupation='executive', zip_code='98101'),
 Row(user_id=7, age=57, gender='M', occupation='administrator', zip_code='91344'),
 Row(user_id=8, age=36, gender='M', occupation='administrator', zip_code='05201'),
 Row(user_id=9, age=29, gender='M', occupation='student', zip_code='01002'),
 Row(user_id=10, age=53, gender='M', occupation='lawyer', zip_code='90703'),
 Row(user_id=11, age=39, gender='F', occupation='other', zip_code='30329'),
 Row(user_id=12, age=28, gender='F', occupation='other', zip_code='06405'),
 Row(user_id=13, age=47, gender='M', occupation='educator', zip

### Step 5. See the last 10 entries

In [5]:
# Fetch the last 10 rows to the driver as a list of Row objects.
users.tail(10)

[Row(user_id=934, age=61, gender='M', occupation='engineer', zip_code='22902'),
 Row(user_id=935, age=42, gender='M', occupation='doctor', zip_code='66221'),
 Row(user_id=936, age=24, gender='M', occupation='other', zip_code='32789'),
 Row(user_id=937, age=48, gender='M', occupation='educator', zip_code='98072'),
 Row(user_id=938, age=38, gender='F', occupation='technician', zip_code='55038'),
 Row(user_id=939, age=26, gender='F', occupation='student', zip_code='33319'),
 Row(user_id=940, age=32, gender='M', occupation='administrator', zip_code='02215'),
 Row(user_id=941, age=20, gender='M', occupation='student', zip_code='97229'),
 Row(user_id=942, age=48, gender='F', occupation='librarian', zip_code='78209'),
 Row(user_id=943, age=22, gender='M', occupation='student', zip_code='77841')]

### Step 6. What is the number of observations in the dataset?

In [7]:
# Number of rows (observations) in the DataFrame.
users.count()

943

### Step 7. What is the number of columns in the dataset?

In [9]:
# users.columns is the list of column names, so its length is the column count.
len(users.columns)

5

### Step 8. Print the name of all the columns.

In [10]:
# Column names, in schema order.
users.columns

['user_id', 'age', 'gender', 'occupation', 'zip_code']

### Step 9. How is the dataset indexed?

In [16]:
# There is no index to inspect on a Spark DataFrame, so look at the
# distribution of the user_id column instead (count, min, max, quartiles).
user_id_only = users.select('user_id')
user_id_only.summary().show()

+-------+-----------------+
|summary|          user_id|
+-------+-----------------+
|  count|              943|
|   mean|            472.0|
| stddev|272.3649512449549|
|    min|                1|
|    25%|              236|
|    50%|              472|
|    75%|              708|
|    max|              943|
+-------+-----------------+



### Step 10. What is the data type of each column?

In [18]:
# Print each column's name, data type and nullability as a tree.
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)



### Step 11. Print only the occupation column

In [20]:
# Project just the occupation column and preview its first 10 values.
users.select('occupation').head(10)

[Row(occupation='technician'),
 Row(occupation='other'),
 Row(occupation='writer'),
 Row(occupation='technician'),
 Row(occupation='other'),
 Row(occupation='executive'),
 Row(occupation='administrator'),
 Row(occupation='administrator'),
 Row(occupation='student'),
 Row(occupation='lawyer')]

### Step 12. How many different occupations are in this dataset?

In [22]:
# Aggregate over the whole DataFrame: number of distinct occupation values.
users.agg(F.countDistinct('occupation')).collect()

[Row(count(DISTINCT occupation)=21)]

### Step 13. What is the most frequent occupation?

In [25]:
# Count users per occupation, sort by that count descending, keep the top row.
(
    users
    .groupBy('occupation')
    .count()
    .orderBy(F.desc('count'))
    .take(1)
)

[Row(occupation='student', count=196)]

### Step 14. Summarize the DataFrame.

In [27]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for the DataFrame.
users.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  null|         null| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  null|         null|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  null|         null|           21227.0|
|    50%|              472|               31|  null|         null|           53711.0|
|    75%|              708|               43|  null|         null|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+--

### Step 15. Summarize all the columns

In [28]:
# summary() reports on every column, string columns included (their mean,
# stddev and quartiles show as null), so no extra argument is needed here.
users.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  null|         null| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  null|         null|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  null|         null|           21227.0|
|    50%|              472|               31|  null|         null|           53711.0|
|    75%|              708|               43|  null|         null|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+--

### Step 16. Summarize only the occupation column

In [30]:
# Restrict to the occupation column before summarizing; as a string column
# only count, min and max are populated.
occupation_only = users.select('occupation')
occupation_only.summary().show()

+-------+-------------+
|summary|   occupation|
+-------+-------------+
|  count|          943|
|   mean|         null|
| stddev|         null|
|    min|administrator|
|    25%|         null|
|    50%|         null|
|    75%|         null|
|    max|       writer|
+-------+-------------+



### Step 17. What is the mean age of users?

In [38]:
# Average of the age column across all users.
users.agg(F.mean('age')).show()

+-----------------+
|         avg(age)|
+-----------------+
|34.05196182396607|
+-----------------+



### Step 18. What is the age with least occurrence?

In [41]:
# Count users per age, then keep every age tied for the smallest count.
# The minimum is computed from the data rather than assumed to be 1, and
# collect() returns all ties instead of an arbitrary first 10.
age_counts = users.groupBy('age').count()
min_count = age_counts.agg(F.min('count')).first()[0]

# Sort by age so the output order is stable from run to run.
age_counts.where(F.col('count') == min_count).orderBy('age').collect()

[Row(age=7, count=1),
 Row(age=10, count=1),
 Row(age=73, count=1),
 Row(age=11, count=1),
 Row(age=66, count=1)]