# Student Alcohol Consumption

### Introduction:

This time you will download a dataset from the UCI Machine Learning Repository.

### Step 1. Import the necessary libraries

In [83]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# getOrCreate() hands back the session that is already running, if any, and
# only builds a new one (under this application name) when none exists.
spark = (
    SparkSession.builder
    .appName("Students_Alcohol_Consumption")
    .getOrCreate()
)

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv).

### Step 3. Assign it to a variable called df.

In [84]:
url = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv'

# Register the remote CSV with Spark, then look up the fetched copy by its
# base name. The name is derived from `url` so the two cannot drift apart,
# instead of repeating the file name as a second hard-coded literal.
spark.sparkContext.addFile(url)
csv_name = url.rsplit('/', 1)[-1]

# header=True takes column names from the first line; inferSchema=True lets
# Spark pick column types (integer vs. string) from the data.
df = spark.read.csv(SparkFiles.get(csv_name), header=True, inferSchema=True)

### Step 4. For the purpose of this exercise, slice the dataframe from the 'school' column through the 'guardian' column (inclusive).

In [85]:
# Print the inferred schema to see the column order and types before slicing.
df.printSchema()

root
 |-- school: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- famsize: string (nullable = true)
 |-- Pstatus: string (nullable = true)
 |-- Medu: integer (nullable = true)
 |-- Fedu: integer (nullable = true)
 |-- Mjob: string (nullable = true)
 |-- Fjob: string (nullable = true)
 |-- reason: string (nullable = true)
 |-- guardian: string (nullable = true)
 |-- traveltime: integer (nullable = true)
 |-- studytime: integer (nullable = true)
 |-- failures: integer (nullable = true)
 |-- schoolsup: string (nullable = true)
 |-- famsup: string (nullable = true)
 |-- paid: string (nullable = true)
 |-- activities: string (nullable = true)
 |-- nursery: string (nullable = true)
 |-- higher: string (nullable = true)
 |-- internet: string (nullable = true)
 |-- romantic: string (nullable = true)
 |-- famrel: integer (nullable = true)
 |-- freetime: integer (nullable = true)
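 |-- goout: integer (nullable = true)
 |-- Dalc: integer (nullable = true)
 |-- Walc: integer (nullable = true)
 |-- health: integer (nullable = true)
 |-- absences: integer (nullable = true)
 |-- G1: integer (nullable = true)
 |-- G2: integer (nullable = true)
 |-- G3: integer (nullable = true)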

In [86]:
# Slice by position in the ordered column list: everything from 'school'
# up to and including 'guardian' (hence the + 1 on the end index).
first_idx = df.columns.index('school')
last_idx = df.columns.index('guardian')
stud_alcoh = df.select(df.columns[first_idx:last_idx + 1])

In [87]:
# Preview the first five rows of the sliced frame.
stud_alcoh.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home| teacher|course|  mother|
|    GP|  F| 17|      U|    GT3|      T|   1|   1|at_home|   other|course|  father|
|    GP|  F| 15|      U|    LE3|      T|   1|   1|at_home|   other| other|  mother|
|    GP|  F| 15|      U|    GT3|      T|   4|   2| health|services|  home|  mother|
|    GP|  F| 16|      U|    GT3|      T|   3|   3|  other|   other|  home|  father|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
only showing top 5 rows



### Step 5. Create a lambda function that will capitalize strings.

In [88]:
# Python UDF applying str.capitalize() to a string column value.
# A SQL NULL reaches a Python UDF as None, so None is passed through unchanged
# instead of raising AttributeError inside the worker.
# The string return type is spelled out rather than left to F.udf's default.
capitalize = F.udf(
    lambda x: x.capitalize() if x is not None else None,
    T.StringType(),
)

### Step 6. Capitalize both Mjob and Fjob

In [89]:
# Preview the capitalized job columns, one row each. The result of
# withColumn is only displayed here, never assigned, so `df` is unchanged.
for job_col in ("Mjob", "Fjob"):
    df.withColumn(job_col, capitalize(F.col(job_col))).select(job_col).show(1)

+-------+
|   Mjob|
+-------+
|At_home|
+-------+
only showing top 1 row

+-------+
|   Fjob|
+-------+
|Teacher|
+-------+
only showing top 1 row



### Step 7. Print the last elements of the data set.

In [90]:
# Fetch the last five rows of the sliced frame as a list of Row objects.
stud_alcoh.tail(5)

[Row(school='MS', sex='M', age=20, address='U', famsize='LE3', Pstatus='A', Medu=2, Fedu=2, Mjob='services', Fjob='services', reason='course', guardian='other'),
 Row(school='MS', sex='M', age=17, address='U', famsize='LE3', Pstatus='T', Medu=3, Fedu=1, Mjob='services', Fjob='services', reason='course', guardian='mother'),
 Row(school='MS', sex='M', age=21, address='R', famsize='GT3', Pstatus='T', Medu=1, Fedu=1, Mjob='other', Fjob='other', reason='course', guardian='other'),
 Row(school='MS', sex='M', age=18, address='R', famsize='LE3', Pstatus='T', Medu=3, Fedu=2, Mjob='services', Fjob='other', reason='course', guardian='mother'),
 Row(school='MS', sex='M', age=19, address='U', famsize='LE3', Pstatus='T', Medu=1, Fedu=1, Mjob='other', Fjob='at_home', reason='course', guardian='father')]

### Step 8. Did you notice the original dataframe is still lowercase? Why is that? Fix it and capitalize Mjob and Fjob.

In [91]:
# Inspect the first row of the original frame: Mjob/Fjob are still lowercase
# because the earlier withColumn results were displayed but never assigned.
df.take(1)

[Row(school='GP', sex='F', age=18, address='U', famsize='GT3', Pstatus='A', Medu=4, Fedu=4, Mjob='at_home', Fjob='teacher', reason='course', guardian='mother', traveltime=2, studytime=2, failures=0, schoolsup='yes', famsup='no', paid='no', activities='no', nursery='yes', higher='yes', internet='no', romantic='no', famrel=4, freetime=3, goout=4, Dalc=1, Walc=1, health=3, absences=6, G1=5, G2=6, G3=6)]

In [92]:
# withColumn returns a new DataFrame, so the capitalized columns only persist
# once the result is bound back to `df`.
df = (
    df
    .withColumn("Mjob", capitalize(F.col("Mjob")))
    .withColumn("Fjob", capitalize(F.col("Fjob")))
)

### Step 9. Create a function called majority that returns a boolean value for a new column called legal_drinker (consider majority as being older than 17 years old).

In [93]:
# Python UDF flagging ages strictly greater than 17.
# A missing age arrives as None; returning None (SQL NULL) avoids the
# TypeError that `None > 17` would raise inside the worker.
majority = F.udf(
    lambda age: (age > 17) if age is not None else None,
    T.BooleanType(),
)

In [94]:
# Append the boolean legal_drinker column computed from the age column.
age_col = F.col("age")
df = df.withColumn("legal_drinker", majority(age_col))

### Step 10. Multiply every number of the dataset by 10. 
##### I know this makes no sense, don't forget it is just an exercise

In [95]:
# Python UDF multiplying an integer value by 10.
# None (SQL NULL) is passed through unchanged; `None * 10` would otherwise
# raise TypeError inside the worker.
multiplier = F.udf(
    lambda x: x * 10 if x is not None else None,
    T.IntegerType(),
)

In [96]:
# First row before the numeric columns are multiplied, for comparison.
df.head(1)

[Row(school='GP', sex='F', age=18, address='U', famsize='GT3', Pstatus='A', Medu=4, Fedu=4, Mjob='At_home', Fjob='Teacher', reason='course', guardian='mother', traveltime=2, studytime=2, failures=0, schoolsup='yes', famsup='no', paid='no', activities='no', nursery='yes', higher='yes', internet='no', romantic='no', famrel=4, freetime=3, goout=4, Dalc=1, Walc=1, health=3, absences=6, G1=5, G2=6, G3=6, legal_drinker=True)]

In [97]:
# Build the whole projection in a single select() rather than calling
# withColumn once per column: every withColumn call adds another projection
# to the query plan, and Spark's documentation advises against doing that in
# a loop.
# Integer columns go through `multiplier`; all other columns are kept as-is.
# Aliasing with the field name preserves the original names and column order.
df = df.select([
    multiplier(F.col(field.name)).alias(field.name)
    if isinstance(field.dataType, T.IntegerType)
    else F.col(field.name)
    for field in df.schema.fields
])

In [98]:
# First row after the multiplication: integer columns are now 10x larger,
# while string and boolean columns are unchanged.
df.head(1)

[Row(school='GP', sex='F', age=180, address='U', famsize='GT3', Pstatus='A', Medu=40, Fedu=40, Mjob='At_home', Fjob='Teacher', reason='course', guardian='mother', traveltime=20, studytime=20, failures=0, schoolsup='yes', famsup='no', paid='no', activities='no', nursery='yes', higher='yes', internet='no', romantic='no', famrel=40, freetime=30, goout=40, Dalc=10, Walc=10, health=30, absences=60, G1=50, G2=60, G3=60, legal_drinker=True)]