# Ex3 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [None]:
from pyspark.sql import SparkSession

spark_context = SparkSession.builder.appName('Spark Exercises').getOrCreate()

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

In [36]:
import pandas

#1 Using Pandas
URL_DATA = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
pandas_df_user = pandas.read_csv(URL_DATA, sep='|')

df_user = spark_context.createDataFrame(pandas_df_user)
df_user.printSchema()
df_user.show(5)

#2 Using PySpark
FILE_PATH = 'u.user.txt'
df_user \
    = spark_context \
        .read \
        .csv(path=FILE_PATH, sep='|', header=True, inferSchema=True)
df_user.printSchema()
df_user.show(5)

root
 |-- user_id: long (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)

+-------+---+------+----------+--------+
|user_id|age|gender|occupation|zip_code|
+-------+---+------+----------+--------+
|      1| 24|     M|technician|   85711|
|      2| 53|     F|     other|   94043|
|      3| 23|     M|    writer|   32067|
|      4| 24|     M|technician|   43537|
|      5| 33|     F|     other|   15213|
+-------+---+------+----------+--------+
only showing top 5 rows
root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)

+-------+---+------+----------+--------+
|user_id|age|gender|occupation|zip_code|
+-------+---+------+----------+--------+
|      1| 24|     M|technician|   85711|
|      2| 53|     F|     other|   94043|
|   

### Step 3. Assign it to a variable called users and use the 'user_id' as index

In [5]:
users = df_user

### Step 4. See the first 25 entries

In [11]:
# In Pandas
pandas_df_user.head(10)

# In PySpark

##'head' should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.
df_user.head(10)

df_user.orderBy(df_user.user_id.asc()).limit(10).show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
+-------+---+------+-------------+--------+


### Step 5. See the last 10 entries

In [12]:
# In Pandas
pandas_df_user.tail(10)

# In PySpark

##'head' should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory.
df_user.tail(10)

df_user.orderBy(df_user.user_id.desc()).limit(10).show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|    943| 22|     M|      student|   77841|
|    942| 48|     F|    librarian|   78209|
|    941| 20|     M|      student|   97229|
|    940| 32|     M|administrator|   02215|
|    939| 26|     F|      student|   33319|
|    938| 38|     F|   technician|   55038|
|    937| 48|     M|     educator|   98072|
|    936| 24|     M|        other|   32789|
|    935| 42|     M|       doctor|   66221|
|    934| 61|     M|     engineer|   22902|
+-------+---+------+-------------+--------+


### Step 6. What is the number of observations in the dataset?

In [14]:
from pyspark.sql import functions as F

df_user.count()

df_user.select(F.count('*').alias('number_of_observations')).show()

+----------------------+
|number_of_observations|
+----------------------+
|                   943|
+----------------------+


### Step 7. What is the number of columns in the dataset?

In [16]:
len(df_user.columns)

5

### Step 8. Print the name of all the columns.

In [17]:
df_user.columns

['user_id', 'age', 'gender', 'occupation', 'zip_code']

### Step 9. How is the dataset indexed?

### Step 10. What is the data type of each column?

In [20]:
df_user.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)


### Step 11. Print only the occupation column

In [35]:
df_user.select(df_user.occupation).show(5)

+----------+
|occupation|
+----------+
|technician|
|     other|
|    writer|
|technician|
|     other|
+----------+


### Step 12. How many different occupations are in this dataset?

In [23]:
df_user.select(F.count_distinct(df_user.occupation).alias('different_occupations')).show()

+---------------------+
|different_occupations|
+---------------------+
|                   21|
+---------------------+


### Step 13. What is the most frequent occupation?

In [24]:
df_user_grouped \
    = df_user.groupBy(df_user.occupation).agg(
        F.count(df_user.user_id).alias('user_count')
    )
df_user_grouped.orderBy(df_user_grouped.user_count.desc()).limit(10).show()

+-------------+----------+
|   occupation|user_count|
+-------------+----------+
|      student|       196|
|        other|       105|
|     educator|        95|
|administrator|        79|
|     engineer|        67|
|   programmer|        66|
|    librarian|        51|
|       writer|        45|
|    executive|        32|
|    scientist|        31|
+-------------+----------+


### Step 14. Summarize the DataFrame.

In [26]:
df_user.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  NULL|         NULL| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  NULL|         NULL|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  NULL|         NULL|           21227.0|
|    50%|              472|               31|  NULL|         NULL|           53711.0|
|    75%|              708|               43|  NULL|         NULL|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+--

### Step 15. Summarize all the columns

In [27]:
df_user.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|          zip_code|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              943|              943|   943|          943|               943|
|   mean|            472.0|34.05196182396607|  NULL|         NULL| 50868.78810810811|
| stddev|272.3649512449549|12.19273973305903|  NULL|         NULL|30891.373254138176|
|    min|                1|                7|     F|administrator|             00000|
|    25%|              236|               25|  NULL|         NULL|           21227.0|
|    50%|              472|               31|  NULL|         NULL|           53711.0|
|    75%|              708|               43|  NULL|         NULL|           78741.0|
|    max|              943|               73|     M|       writer|             Y1A6B|
+-------+-----------------+-----------------+------+--

### Step 16. Summarize only the occupation column

In [31]:
df_user.select(df_user.occupation).describe().show()

df_user.select(df_user.occupation).summary().show()

+-------+-------------+
|summary|   occupation|
+-------+-------------+
|  count|          943|
|   mean|         NULL|
| stddev|         NULL|
|    min|administrator|
|    max|       writer|
+-------+-------------+

+-------+-------------+
|summary|   occupation|
+-------+-------------+
|  count|          943|
|   mean|         NULL|
| stddev|         NULL|
|    min|administrator|
|    25%|         NULL|
|    50%|         NULL|
|    75%|         NULL|
|    max|       writer|
+-------+-------------+


### Step 17. What is the mean age of users?

In [32]:
df_user.select(F.mean(df_user.age).alias('mean_age')).show()

+-----------------+
|         mean_age|
+-----------------+
|34.05196182396607|
+-----------------+


### Step 18. What is the age with least occurrence?

In [34]:
df_user_grouped \
    = df_user.groupBy(df_user.age).agg(
        F.count(df_user.user_id).alias('user_count')
    )
df_user_grouped.orderBy(
    df_user_grouped.user_count.asc(),
    df_user_grouped.age.asc()
).limit(10).show()

+---+----------+
|age|user_count|
+---+----------+
|  7|         1|
| 10|         1|
| 11|         1|
| 66|         1|
| 73|         1|
| 62|         2|
| 64|         2|
| 68|         2|
| 69|         2|
| 14|         3|
+---+----------+
