# Chapter 2

## Connect and Read in Data

In [1]:
# Connect to Spark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Data_Wrangling").getOrCreate()

In [2]:
# Set parameters
file_location = "Ch02/Chapter2_Data/movie_data_part1.csv"
file_type = "csv"
infer_schema = "False"
first_row_is_header = "True"
delimiter = "|"

In [3]:
# Read in dataframe
df_raw = (spark.read.format(file_type)
                    .option("inferSchema", infer_schema)
                    .option("header", first_row_is_header)
                    .option("sep", delimiter)
                    .load(file_location))

## Exploratory Analysis

In [4]:
df_raw.printSchema()

root
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- vote_average: string (nullable = true)



In [5]:
df_raw.dtypes

[('belongs_to_collection', 'string'),
 ('budget', 'string'),
 ('id', 'string'),
 ('original_language', 'string'),
 ('original_title', 'string'),
 ('overview', 'string'),
 ('popularity', 'string'),
 ('production_companies', 'string'),
 ('production_countries', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('runtime', 'string'),
 ('status', 'string'),
 ('tagline', 'string'),
 ('title', 'string'),
 ('vote_average', 'string')]

In [6]:
df_raw.count()

43998

In [7]:
print(f'The total number of records is {df_raw.count()}!')

The total number of records is 43998!


In [8]:
select_columns=['id','budget','popularity','release_date','revenue','title']

In [9]:
df = df_raw.select(*select_columns)

In [10]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- title: string (nullable = true)



In [11]:
df.show()

+-----+-------+------------------+------------+-------+--------------------+
|   id| budget|        popularity|release_date|revenue|               title|
+-----+-------+------------------+------------+-------+--------------------+
|43000|      0|             2.503|  1962-05-23|      0|The Elusive Corporal|
|43001|      0|              5.51|  1962-11-12|      0|  Sundays and Cybele|
|43002|      0|              5.62|  1962-05-24|      0|Lonely Are the Brave|
|43003|      0|             7.159|  1975-03-12|      0|          F for Fake|
|43004| 500000|             3.988|  1962-10-09|      0|Long Day's Journe...|
|43006|      0|             3.194|  1962-03-09|      0|           My Geisha|
|43007|      0|             2.689|  1962-10-31|      0|Period of Adjustment|
|43008|      0|             6.537|  1959-03-13|      0|    The Hanging Tree|
|43010|      0|             4.297|  1962-01-01|      0|Sherlock Holmes a...|
|43011|      0|             4.417|  1962-01-01|      0|  Sodom and Gomorrah|

## Missing Values

In [12]:
# Import sql functions
from pyspark.sql.functions import *

In [13]:
# Calculate the missing values in one column
df.filter((df['popularity']=="")|df['popularity'].isNull()|isnan(df['popularity'])).count()

215

In [14]:
# Calculate the missing values in all columns
df.select([count(when((col(c)=='')|col(c).isNull()|isnan(c), c)).alias(c) for c in df.columns]).show()

+---+------+----------+------------+-------+-----+
| id|budget|popularity|release_date|revenue|title|
+---+------+----------+------------+-------+-----+
|125|   125|       215|         221|    215|  304|
+---+------+----------+------------+-------+-----+



In [15]:
# One way frequencies
df.groupBy(df['title']).count().show()

+--------------------+-----+
|               title|count|
+--------------------+-----+
|   The Corn Is Green|    1|
|Meet The Browns -...|    1|
|Morenita, El Esca...|    1|
| Father Takes a Wife|    1|
|The Werewolf of W...|    1|
|My Wife Is a Gang...|    1|
|Depeche Mode: Tou...|    1|
|  A Woman Is a Woman|    1|
|History Is Made a...|    1|
|      Colombian Love|    1|
|        Ace Attorney|    1|
|     Not Like Others|    1|
|40 Guns to Apache...|    1|
|          Middle Men|    1|
|         It's a Gift|    1|
|    La Vie de Bohème|    1|
|Rasputin: The Mad...|    1|
|The Ballad of Jac...|    1|
|         How to Deal|    1|
|             Freaked|    1|
+--------------------+-----+
only showing top 20 rows



In [16]:
# One way freq and sort
df.groupBy(df['title']).count().sort(desc("count")).show(10, False)

+--------------------+-----+
|title               |count|
+--------------------+-----+
|NULL                |304  |
|Les Misérables      |8    |
|The Three Musketeers|8    |
|Cinderella          |8    |
|A Christmas Carol   |7    |
|Hamlet              |7    |
|The Island          |7    |
|Dracula             |7    |
|Frankenstein        |7    |
|Framed              |6    |
+--------------------+-----+
only showing top 10 rows



In [17]:
# Sort and filter one way frequencies
df_temp = df.filter((df['title']!='') & (df['title'].isNotNull()) & (~isnan(df['title'])))

In [18]:
# Find titles that are repeated more than 4 times
(df_temp.groupBy(df_temp['title'])
        .count()
        .filter("`count`>4")
        .sort(col("count").desc())
        .show(10,False))

+--------------------+-----+
|title               |count|
+--------------------+-----+
|Les Misérables      |8    |
|The Three Musketeers|8    |
|Cinderella          |8    |
|A Christmas Carol   |7    |
|Frankenstein        |7    |
|Hamlet              |7    |
|The Island          |7    |
|Dracula             |7    |
|First Love          |6    |
|Beauty and the Beast|6    |
+--------------------+-----+
only showing top 10 rows



## Casting Variables

In [19]:
# Show types
df.dtypes

[('id', 'string'),
 ('budget', 'string'),
 ('popularity', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('title', 'string')]

In [20]:
# Fix the budget variable
df = df.withColumn('budget', df['budget'].cast("float"))

In [21]:
df.dtypes

[('id', 'string'),
 ('budget', 'float'),
 ('popularity', 'string'),
 ('release_date', 'string'),
 ('revenue', 'string'),
 ('title', 'string')]

In [22]:
df.show(10)

+-----+--------+----------+------------+-------+--------------------+
|   id|  budget|popularity|release_date|revenue|               title|
+-----+--------+----------+------------+-------+--------------------+
|43000|     0.0|     2.503|  1962-05-23|      0|The Elusive Corporal|
|43001|     0.0|      5.51|  1962-11-12|      0|  Sundays and Cybele|
|43002|     0.0|      5.62|  1962-05-24|      0|Lonely Are the Brave|
|43003|     0.0|     7.159|  1975-03-12|      0|          F for Fake|
|43004|500000.0|     3.988|  1962-10-09|      0|Long Day's Journe...|
|43006|     0.0|     3.194|  1962-03-09|      0|           My Geisha|
|43007|     0.0|     2.689|  1962-10-31|      0|Period of Adjustment|
|43008|     0.0|     6.537|  1959-03-13|      0|    The Hanging Tree|
|43010|     0.0|     4.297|  1962-01-01|      0|Sherlock Holmes a...|
|43011|     0.0|     4.417|  1962-01-01|      0|  Sodom and Gomorrah|
+-----+--------+----------+------------+-------+--------------------+
only showing top 10 

In [23]:
# Looping over columns
from pyspark.sql.types import *
int_vars = ['id']
float_vars = ['budget', 'popularity', 'revenue']
date_vars = ['release_date']

for column in float_vars:
    df=df.withColumn(column, df[column].cast(FloatType()))

for column in int_vars:
    df=df.withColumn(column, df[column].cast(IntegerType()))

for column in date_vars:
    df=df.withColumn(column, df[column].cast(DateType()))

df.dtypes

[('id', 'int'),
 ('budget', 'float'),
 ('popularity', 'float'),
 ('release_date', 'date'),
 ('revenue', 'float'),
 ('title', 'string')]

In [24]:
df.show(10, False)

+-----+--------+----------+------------+-------+---------------------------------------+
|id   |budget  |popularity|release_date|revenue|title                                  |
+-----+--------+----------+------------+-------+---------------------------------------+
|43000|0.0     |2.503     |1962-05-23  |0.0    |The Elusive Corporal                   |
|43001|0.0     |5.51      |1962-11-12  |0.0    |Sundays and Cybele                     |
|43002|0.0     |5.62      |1962-05-24  |0.0    |Lonely Are the Brave                   |
|43003|0.0     |7.159     |1975-03-12  |0.0    |F for Fake                             |
|43004|500000.0|3.988     |1962-10-09  |0.0    |Long Day's Journey Into Night          |
|43006|0.0     |3.194     |1962-03-09  |0.0    |My Geisha                              |
|43007|0.0     |2.689     |1962-10-31  |0.0    |Period of Adjustment                   |
|43008|0.0     |6.537     |1959-03-13  |0.0    |The Hanging Tree                       |
|43010|0.0     |4.297

## Descriptive Statistics

In [25]:
df.describe().show()

+-------+------------------+--------------------+-----------------+-------------------+--------------------+
|summary|                id|              budget|       popularity|            revenue|               title|
+-------+------------------+--------------------+-----------------+-------------------+--------------------+
|  count|             43784|               43873|            43783|              43783|               43694|
|   mean|44502.304312077475|   3736901.834963167|5.295444259579189|  9697079.597382545|            Infinity|
| stddev|27189.646588626394|1.5871814952777326E7|6.168030519208252|5.687938449628811E7|                 NaN|
|    min|                 2|                 0.0|              0.6|                0.0|!Women Art Revolu...|
|    max|            100988|               3.8E8|            180.0|       2.78796518E9|       시크릿 Secret|
+-------+------------------+--------------------+-----------------+-------------------+--------------------+



In [26]:
df_temp = df.filter((df['budget']!=0) & (df['budget'].isNotNull()) & (~isnan(df['budget'])))

In [27]:
median = df_temp.approxQuantile(['budget','popularity'],[0.5],0.1)

In [28]:
median

[[5000000.0], [8.413999557495117]]

In [29]:
df.agg(countDistinct(col('title')).alias("count")).show()

+-----+
|count|
+-----+
|41138|
+-----+



In [30]:
df.select('title').distinct().show(10,False)

+---------------------------------------------+
|title                                        |
+---------------------------------------------+
|The Corn Is Green                            |
|Meet The Browns - The Play                   |
|Morenita, El Escandalo                       |
|Father Takes a Wife                          |
|The Werewolf of Washington                   |
|My Wife Is a Gangster                        |
|Depeche Mode: Touring the Angel Live in Milan|
|A Woman Is a Woman                           |
|History Is Made at Night                     |
|Colombian Love                               |
+---------------------------------------------+
only showing top 10 rows



In [31]:
# Extract year
df_temp = df.withColumn('release_year',year('release_date'))

In [32]:
# Extract month
df_temp = df_temp.withColumn('release_month', month('release_date'))

In [33]:
# Count distinct titles by 
df_temp.groupBy("release_year").agg(countDistinct("title")).show(10,False)

+------------+---------------------+
|release_year|count(DISTINCT title)|
+------------+---------------------+
|1959        |271                  |
|1990        |496                  |
|1975        |365                  |
|1977        |415                  |
|1924        |19                   |
|2003        |1199                 |
|2007        |1896                 |
|2018        |4                    |
|1974        |434                  |
|2015        |13                   |
+------------+---------------------+
only showing top 10 rows



In [34]:
# Find all titles starting with the word 'Meet'
df.filter(df['title'].like('Meet%')).show()

+-----+---------+----------+------------+------------+--------------------+
|   id|   budget|popularity|release_date|     revenue|               title|
+-----+---------+----------+------------+------------+--------------------+
|43957| 500000.0|     2.649|  2005-06-28|   1000000.0|Meet The Browns -...|
|39997|      0.0|     3.585|  1989-11-15|         0.0|Meet the Hollowheads|
|16710|      0.0|    11.495|  2008-03-21| 4.1939392E7|     Meet the Browns|
|20430|      0.0|     3.614|  2004-01-29|         0.0|         Meet Market|
|76435|      0.0|     1.775|  2011-03-31|         0.0|    Meet the In-Laws|
|76516|5000000.0|      4.05|  1990-11-08|    485772.0| Meet the Applegates|
| 7278|    3.0E7|    11.116|  2008-01-24| 8.4646832E7|   Meet the Spartans|
|32574|      0.0|      7.42|  1941-03-14|         0.0|       Meet John Doe|
|40506|      0.0|     4.814|  1997-01-31|         0.0|   Meet Wally Sparks|
|40688|    2.4E7|     6.848|  1998-03-27|   4562146.0|    Meet the Deedles|
|58401|     

In [35]:
# Find all titles that do not end in 's'
df.filter(~df['title'].like('%s')).show(10,False)

+-----+--------+----------+------------+-------+---------------------------------------+
|id   |budget  |popularity|release_date|revenue|title                                  |
+-----+--------+----------+------------+-------+---------------------------------------+
|43000|0.0     |2.503     |1962-05-23  |0.0    |The Elusive Corporal                   |
|43001|0.0     |5.51      |1962-11-12  |0.0    |Sundays and Cybele                     |
|43002|0.0     |5.62      |1962-05-24  |0.0    |Lonely Are the Brave                   |
|43003|0.0     |7.159     |1975-03-12  |0.0    |F for Fake                             |
|43004|500000.0|3.988     |1962-10-09  |0.0    |Long Day's Journey Into Night          |
|43006|0.0     |3.194     |1962-03-09  |0.0    |My Geisha                              |
|43007|0.0     |2.689     |1962-10-31  |0.0    |Period of Adjustment                   |
|43008|0.0     |6.537     |1959-03-13  |0.0    |The Hanging Tree                       |
|43010|0.0     |4.297

In [36]:
# You can also use regular expressions
df.filter(df['title'].rlike('\w*ove')).show(10,False)

+-----+------+----------+------------+------------+------------------------+
|id   |budget|popularity|release_date|revenue     |title                   |
+-----+------+----------+------------+------------+------------------------+
|43100|0.0   |7.252     |1959-10-07  |0.0         |General Della Rovere    |
|43152|0.0   |5.126     |2001-06-21  |0.0         |Love on a Diet          |
|43191|0.0   |4.921     |1952-08-29  |0.0         |Beware, My Lovely       |
|43281|0.0   |2.411     |1989-11-22  |0.0         |Love Without Pity       |
|43343|0.0   |3.174     |1953-12-25  |0.0         |Easy to Love            |
|43347|3.0E7 |14.863    |2010-11-22  |1.02820008E8|Love & Other Drugs      |
|43362|0.0   |1.705     |1952-02-23  |0.0         |Love Is Better Than Ever|
|43363|0.0   |2.02      |1952-05-29  |0.0         |Lovely to Look At       |
|43395|0.0   |4.758     |1950-11-10  |0.0         |Two Weeks with Love     |
|43455|0.0   |4.669     |1948-08-23  |0.0         |The Loves of Carmen     |

In [37]:
# It can also be the following:
df.filter(df['title'].contains('ove')).show()

+-----+------+----------+------------+------------+--------------------+
|   id|budget|popularity|release_date|     revenue|               title|
+-----+------+----------+------------+------------+--------------------+
|43100|   0.0|     7.252|  1959-10-07|         0.0|General Della Rovere|
|43152|   0.0|     5.126|  2001-06-21|         0.0|      Love on a Diet|
|43191|   0.0|     4.921|  1952-08-29|         0.0|   Beware, My Lovely|
|43281|   0.0|     2.411|  1989-11-22|         0.0|   Love Without Pity|
|43343|   0.0|     3.174|  1953-12-25|         0.0|        Easy to Love|
|43347| 3.0E7|    14.863|  2010-11-22|1.02820008E8|  Love & Other Drugs|
|43362|   0.0|     1.705|  1952-02-23|         0.0|Love Is Better Th...|
|43363|   0.0|      2.02|  1952-05-29|         0.0|   Lovely to Look At|
|43395|   0.0|     4.758|  1950-11-10|         0.0| Two Weeks with Love|
|43455|   0.0|     4.669|  1948-08-23|         0.0| The Loves of Carmen|
|43483|   0.0|     3.161|  1946-12-25|         0.0|

In [38]:
# You can also select columns by Regex
# select columns starting with re
df.select(df.colRegex("`re\w*`")).printSchema()

root
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)



In [39]:
# Select columns ending with 'e'
df.select(df.colRegex("`\w*e`")).printSchema()

root
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- title: string (nullable = true)



## Creating New Columns

In [40]:
# Create the mean population
mean_pop = df.agg({'popularity':'mean'}).collect()[0]['avg(popularity)']
mean_pop

5.295444259579189

In [41]:
# Add this value to all rows of the data frame
df = df.withColumn('mean_popularity', lit(mean_pop))
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: float (nullable = true)
 |-- popularity: float (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- title: string (nullable = true)
 |-- mean_popularity: double (nullable = false)



In [42]:
# Intermediate calculation
df = df.withColumn('variance', pow(df['popularity']-df['mean_popularity'],2))
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- budget: float (nullable = true)
 |-- popularity: float (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: float (nullable = true)
 |-- title: string (nullable = true)
 |-- mean_popularity: double (nullable = false)
 |-- variance: double (nullable = true)



In [43]:
# Add up all of the variances
variance_sum = df.agg({'variance':'sum'}).collect()[0]['sum(variance)']
count_obs = df.count()
variance_population = variance_sum/(count_obs-1)
variance_population

37.858688057662825

In [44]:
# Define a python function and return a tuple
def new_cols(budget, popularity):
    if budget < 10000000: budget_cat = "Small"
    elif budget < 100000000: budget_cat = "Medium"
    else: budget_cat = "Big"

    if popularity < 3: ratings = "Low"
    elif popularity < 5: ratings = "Mid"
    else: ratings = "High"

    return budget_cat, ratings



In [45]:
# Wrap the python function for use
udfB = udf(new_cols, StructType([StructField("budget_cat", StringType(), True), StructField("ratings", StringType(), True)]))

# Calculate using the custom function
temp_df = (df.select('id', 'budget', 'popularity')
             .withColumn('newcat', udfB('budget', 'popularity')))

# Show results
temp_df.show()

+-----+---------+----------+-------------+
|   id|   budget|popularity|       newcat|
+-----+---------+----------+-------------+
|43000|      0.0|     2.503| {Small, Low}|
|43001|      0.0|      5.51|{Small, High}|
|43002|      0.0|      5.62|{Small, High}|
|43003|      0.0|     7.159|{Small, High}|
|43004| 500000.0|     3.988| {Small, Mid}|
|43006|      0.0|     3.194| {Small, Mid}|
|43007|      0.0|     2.689| {Small, Low}|
|43008|      0.0|     6.537|{Small, High}|
|43010|      0.0|     4.297| {Small, Mid}|
|43011|      0.0|     4.417| {Small, Mid}|
|43012|7000000.0|     4.722| {Small, Mid}|
|43013|      0.0|     2.543| {Small, Low}|
|43014|      0.0|     4.303| {Small, Mid}|
|43015|      0.0|     3.493| {Small, Mid}|
|43016|      0.0|     2.851| {Small, Low}|
|43017|      0.0|     4.047| {Small, Mid}|
|43018|      0.0|     2.661| {Small, Low}|
|43019|      0.0|     3.225| {Small, Mid}|
|43020|      0.0|      5.72|{Small, High}|
|43021|      0.0|     3.292| {Small, Mid}|
+-----+----

In [59]:
# Unbundle the struct column using getItem
(temp_df.select('id', 'budget', 'popularity', 'newcat')
        .withColumn('budget_cat', temp_df['newcat'].getItem('budget_cat'))
        .withColumn('ratings', temp_df['newcat'].getItem('ratings'))
        .drop('newcat')).show()

+-----+---------+----------+----------+-------+
|   id|   budget|popularity|budget_cat|ratings|
+-----+---------+----------+----------+-------+
|43000|      0.0|     2.503|     Small|    Low|
|43001|      0.0|      5.51|     Small|   High|
|43002|      0.0|      5.62|     Small|   High|
|43003|      0.0|     7.159|     Small|   High|
|43004| 500000.0|     3.988|     Small|    Mid|
|43006|      0.0|     3.194|     Small|    Mid|
|43007|      0.0|     2.689|     Small|    Low|
|43008|      0.0|     6.537|     Small|   High|
|43010|      0.0|     4.297|     Small|    Mid|
|43011|      0.0|     4.417|     Small|    Mid|
|43012|7000000.0|     4.722|     Small|    Mid|
|43013|      0.0|     2.543|     Small|    Low|
|43014|      0.0|     4.303|     Small|    Mid|
|43015|      0.0|     3.493|     Small|    Mid|
|43016|      0.0|     2.851|     Small|    Low|
|43017|      0.0|     4.047|     Small|    Mid|
|43018|      0.0|     2.661|     Small|    Low|
|43019|      0.0|     3.225|     Small| 

In [63]:
# Unbundle the columns with when
df_with_newcols = (df.select('id','budget','popularity')
                     .withColumn('budget_cat', when(df['budget']<10000000, 'Small').
                                               when(df['budget']<100000000, 'Medium').
                                               otherwise('Big'))
                     .withColumn('ratings', when(df['popularity']<3, 'Low').
                                            when(df['popularity']<5, 'Mid').
                                            otherwise('High')))

df_with_newcols.show()

+-----+---------+----------+----------+-------+
|   id|   budget|popularity|budget_cat|ratings|
+-----+---------+----------+----------+-------+
|43000|      0.0|     2.503|     Small|    Low|
|43001|      0.0|      5.51|     Small|   High|
|43002|      0.0|      5.62|     Small|   High|
|43003|      0.0|     7.159|     Small|   High|
|43004| 500000.0|     3.988|     Small|    Mid|
|43006|      0.0|     3.194|     Small|    Mid|
|43007|      0.0|     2.689|     Small|    Low|
|43008|      0.0|     6.537|     Small|   High|
|43010|      0.0|     4.297|     Small|    Mid|
|43011|      0.0|     4.417|     Small|    Mid|
|43012|7000000.0|     4.722|     Small|    Mid|
|43013|      0.0|     2.543|     Small|    Low|
|43014|      0.0|     4.303|     Small|    Mid|
|43015|      0.0|     3.493|     Small|    Mid|
|43016|      0.0|     2.851|     Small|    Low|
|43017|      0.0|     4.047|     Small|    Mid|
|43018|      0.0|     2.661|     Small|    Low|
|43019|      0.0|     3.225|     Small| 

## Deleting and Renaming Columns

In [65]:
# Deleting Columns
columns_to_drop = ['budget_cat']
df_with_newcols = df_with_newcols.drop(*columns_to_drop)
df_with_newcols.show()

+-----+---------+----------+-------+
|   id|   budget|popularity|ratings|
+-----+---------+----------+-------+
|43000|      0.0|     2.503|    Low|
|43001|      0.0|      5.51|   High|
|43002|      0.0|      5.62|   High|
|43003|      0.0|     7.159|   High|
|43004| 500000.0|     3.988|    Mid|
|43006|      0.0|     3.194|    Mid|
|43007|      0.0|     2.689|    Low|
|43008|      0.0|     6.537|   High|
|43010|      0.0|     4.297|    Mid|
|43011|      0.0|     4.417|    Mid|
|43012|7000000.0|     4.722|    Mid|
|43013|      0.0|     2.543|    Low|
|43014|      0.0|     4.303|    Mid|
|43015|      0.0|     3.493|    Mid|
|43016|      0.0|     2.851|    Low|
|43017|      0.0|     4.047|    Mid|
|43018|      0.0|     2.661|    Low|
|43019|      0.0|     3.225|    Mid|
|43020|      0.0|      5.72|   High|
|43021|      0.0|     3.292|    Mid|
+-----+---------+----------+-------+
only showing top 20 rows



In [67]:
# Renaming Columns with withColumnRenamed
(df_with_newcols.withColumnRenamed('id', 'film_id')
                .withColumnRenamed('ratings', 'film_ratings')).show()

+-------+---------+----------+------------+
|film_id|   budget|popularity|film_ratings|
+-------+---------+----------+------------+
|  43000|      0.0|     2.503|         Low|
|  43001|      0.0|      5.51|        High|
|  43002|      0.0|      5.62|        High|
|  43003|      0.0|     7.159|        High|
|  43004| 500000.0|     3.988|         Mid|
|  43006|      0.0|     3.194|         Mid|
|  43007|      0.0|     2.689|         Low|
|  43008|      0.0|     6.537|        High|
|  43010|      0.0|     4.297|         Mid|
|  43011|      0.0|     4.417|         Mid|
|  43012|7000000.0|     4.722|         Mid|
|  43013|      0.0|     2.543|         Low|
|  43014|      0.0|     4.303|         Mid|
|  43015|      0.0|     3.493|         Mid|
|  43016|      0.0|     2.851|         Low|
|  43017|      0.0|     4.047|         Mid|
|  43018|      0.0|     2.661|         Low|
|  43019|      0.0|     3.225|         Mid|
|  43020|      0.0|      5.72|        High|
|  43021|      0.0|     3.292|  

In [69]:
# Renaming columns with alias
new_names = [('budget', 'film_budget'), ('popularity', 'film_popularity')]

(df_with_newcols.select(list(map(lambda old,new:col(old).alias(new), *zip(*new_names))))).show()

+-----------+---------------+
|film_budget|film_popularity|
+-----------+---------------+
|        0.0|          2.503|
|        0.0|           5.51|
|        0.0|           5.62|
|        0.0|          7.159|
|   500000.0|          3.988|
|        0.0|          3.194|
|        0.0|          2.689|
|        0.0|          6.537|
|        0.0|          4.297|
|        0.0|          4.417|
|  7000000.0|          4.722|
|        0.0|          2.543|
|        0.0|          4.303|
|        0.0|          3.493|
|        0.0|          2.851|
|        0.0|          4.047|
|        0.0|          2.661|
|        0.0|          3.225|
|        0.0|           5.72|
|        0.0|          3.292|
+-----------+---------------+
only showing top 20 rows



## Excercise 2.1