In [1]:
from pyspark.sql import SparkSession
import findspark

findspark.init()
spark = SparkSession.builder.appName('challenge').config('spark.sql.legacy.timeParserPolicy','LEGACY').getOrCreate()

In [2]:
data_3 = spark.read.csv('./salary_data/data_3.csv', header=True, multiLine=True)

In [3]:
data_3.printSchema()

root
 |-- Timestamp: string (nullable = true)
 |-- Employer: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Years at Employer: string (nullable = true)
 |-- Years of Experience: string (nullable = true)
 |-- Annual Base Pay: string (nullable = true)
 |-- Signing Bonus: string (nullable = true)
 |-- Annual Bonus: string (nullable = true)
 |-- Annual Stock Value/Bonus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Additional Comments: string (nullable = true)



In [4]:
from functools import reduce
better_names = ['timestamp','employer','location','job_title','years_at_employer','years_of_experience','annual_base_pay','signing_bonus','annual_bonus','annual_stock_value','gender','additional_comments']
data_3 = reduce(lambda df, params: df.withColumnRenamed(*params), zip(data_3.columns, better_names), data_3)

In [5]:
data_3.createOrReplaceTempView('data_3')
data_3.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- employer: string (nullable = true)
 |-- location: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- years_at_employer: string (nullable = true)
 |-- years_of_experience: string (nullable = true)
 |-- annual_base_pay: string (nullable = true)
 |-- signing_bonus: string (nullable = true)
 |-- annual_bonus: string (nullable = true)
 |-- annual_stock_value: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- additional_comments: string (nullable = true)



In [6]:
spark.sql('''
    SELECT to_timestamp(timestamp, 'M/d/yyyy HH:mm:ss'), timestamp
    FROM data_3
''').show()

+------------------------------------------+------------------+
|to_timestamp(timestamp, M/d/yyyy HH:mm:ss)|         timestamp|
+------------------------------------------+------------------+
|                                      null|              null|
|                       2016-03-21 12:54:49|3/21/2016 12:54:49|
|                       2016-03-21 12:58:52|3/21/2016 12:58:52|
|                       2016-03-21 12:58:57|3/21/2016 12:58:57|
|                       2016-03-21 12:58:58|3/21/2016 12:58:58|
|                       2016-03-21 12:59:11|3/21/2016 12:59:11|
|                       2016-03-21 12:59:30|3/21/2016 12:59:30|
|                       2016-03-21 13:00:17|3/21/2016 13:00:17|
|                       2016-03-21 13:00:20|3/21/2016 13:00:20|
|                       2016-03-21 13:01:19|3/21/2016 13:01:19|
|                       2016-03-21 13:01:19|3/21/2016 13:01:19|
|                       2016-03-21 13:02:01|3/21/2016 13:02:01|
|                       2016-03-21 13:02

This dataset is by far the most difficult to work with because there are many data informed in a "troll" way and should be discarded for any valid analysis. Due to time constraints, this cases were not removed from the dataset.

In [7]:
spark.sql('''
    SELECT DISTINCT initcap(location)
    FROM data_3
    
''').show(100000, truncate=False)

+---------------------------------------------+
|initcap(location)                            |
+---------------------------------------------+
|94102                                        |
|Brisbane, Australia                          |
|Alexandria, Va                               |
|Bangalore                                    |
|70170                                        |
|Somerville, Ma                               |
|Utah                                         |
|Nigga                                        |
|Did                                          |
|Woburn, Ma                                   |
|Charleston                                   |
|Paraguay                                     |
|Wellington                                   |
|Russia                                       |
|Alpharetta, Ga                               |
|Radford, Va                                  |
|판교                                         |
|Remote - North America                   

In [8]:
spark.sql('''
    SELECT initcap(job_title), count(1)
    FROM data_3
    GROUP BY 1
    ORDER BY 2 DESC
''').show(truncate=False)

+-------------------------+--------+
|initcap(job_title)       |count(1)|
+-------------------------+--------+
|Software Engineer        |642     |
|Senior Software Engineer |222     |
|Software Developer       |218     |
|Developer                |80      |
|Web Developer            |77      |
|Senior Developer         |56      |
|Data Scientist           |40      |
|Engineer                 |39      |
|Senior Software Developer|39      |
|Software Engineer Ii     |35      |
|Product Manager          |33      |
|Cto                      |33      |
|Devops Engineer          |25      |
|Senior Engineer          |24      |
|Consultant               |24      |
|Analyst                  |19      |
|Software Architect       |18      |
|Full Stack Developer     |17      |
|Sr Software Engineer     |16      |
|Front End Developer      |15      |
+-------------------------+--------+
only showing top 20 rows



In [9]:
spark.sql('''
    SELECT DISTINCT annual_base_pay
    FROM data_3
''').show(100000, truncate=False)

+------------------------------------+
|annual_base_pay                     |
+------------------------------------+
|57670 USD                           |
|50,000 EUR                          |
|$128,000                            |
|77,000                              |
|28500                               |
|175000                              |
|$33,000                             |
|57500                               |
|66000                               |
|46444                               |
|$74,500                             |
|125                                 |
|30953                               |
|44500                               |
|33500                               |
|185,000                             |
|75000 CAD                           |
|14000                               |
|30 000€                             |
|42100 USD                           |
|53000                               |
|112,000                             |
|7445745                 

In [10]:
spark.sql('''
    SELECT DISTINCT explode(regexp_extract_all(upper(annual_base_pay), '([A-Z]{3,})'))
    FROM data_3
''').show(10000, truncate=False)

+-------+
|col    |
+-------+
|AWAY   |
|DKK    |
|KUSD   |
|REALLY |
|NZD    |
|HUF    |
|GBP    |
|CHF    |
|RUB    |
|BRL    |
|TODAY  |
|BENNIES|
|TESTING|
|TRAP   |
|CAD    |
|MILLION|
|KBRL   |
|CNY    |
|EUR    |
|RRR    |
|BONER  |
|RUBLES |
|TAX    |
|ZAR    |
|FOOD   |
|MONTH  |
|PKR    |
|EURO   |
|ABOUT  |
|NET    |
|NOK    |
|AUD    |
|POUNDS |
|CDN    |
|PER    |
|SECRET |
|JPY    |
|PEPE   |
|HKD    |
|YEAR   |
|INR    |
|RARE   |
|RMB    |
|CAN    |
|HOUR   |
|DAY    |
|CIRCA  |
|PLN    |
|YEN    |
|NOW    |
|THE    |
|CORP   |
|USD    |
|SEK    |
|EUROS  |
|HAVE   |
|AFTER  |
|MAKES  |
|RATE   |
|THIS   |
|SGD    |
+-------+



In an attempt to remove at least the currencies that were not fake. A list of valid ones were extracted from the `annual_base_pay` column and used to eliminate (turn the column to NULL) invalid information. 

In [11]:
identified_currencies = ['AUD','BRL','CAD','CAN','CDN','CHF','CNY','DKK','EUR','GBP','HKD','HUF','INR','JPY','NOK','NZD','PKR','PLN','RMB','RUB','SEK','SGD','USD','YEN','ZAR']
currencies_regex = '|'.join(identified_currencies)

spark.sql(f'''
    SELECT annual_base_pay, regexp_extract(annual_base_pay, '({currencies_regex})') AS currency
    FROM data_3
    WHERE length(regexp_extract(annual_base_pay, '({currencies_regex})')) > 0
''').show(100000)

+--------------------+--------+
|     annual_base_pay|currency|
+--------------------+--------+
|    36.5K USD / YEAR|     USD|
|           62500 USD|     USD|
|          SEK 380000|     SEK|
|60000PLN (circa 1...|     PLN|
|           48000 USD|     USD|
|          120000 USD|     USD|
|         $23,000 USD|     USD|
|         ZAR 675,000|     ZAR|
|          NOK 450000|     NOK|
|          75,000 CAD|     CAD|
|           58000 EUR|     EUR|
|       1,000,000 USD|     USD|
|           45000 USD|     USD|
|           57670 USD|     USD|
|          50,000 EUR|     EUR|
|90000 BRL / 22500...|     BRL|
|             110 GBP|     GBP|
|           124000USD|     USD|
|            EUR 5000|     EUR|
|         DKK 1000000|     DKK|
|          PKR 100000|     PKR|
|          82,500 CAD|     CAD|
|          CAN$100000|     CAN|
|           51000 USD|     USD|
|~66 000 (250 000 ...|     PLN|
|           50000 EUR|     EUR|
|            110k CAD|     CAD|
|          USD$120000|     USD|
|       

In [12]:
currency_symbols_regex = '[¥€£]'

spark.sql(f'''
    SELECT DISTINCT regexp_extract(annual_base_pay, '({currency_symbols_regex})') AS currency
    FROM data_3
''').show(100000)

+--------+
|currency|
+--------+
|    null|
|       ¥|
|        |
|       £|
|       €|
+--------+



In [13]:
spark.sql(f'''
    SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                ELSE NULL 
                END AS final_salary,
                currency,
                salary,
                annual_base_pay
    FROM (
        SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~ ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary,
               regexp_extract(upper(annual_base_pay), '({currencies_regex}|{currency_symbols_regex})') AS currency,
               annual_base_pay 
        FROM data_3
    )
    ORDER BY 1
''').show(truncate=False) 

+------------+--------+-----------------+-----------------------+
|final_salary|currency|salary           |annual_base_pay        |
+------------+--------+-----------------+-----------------------+
|null        |NOK     | 450000          |NOK 450000             |
|null        |        |-                |-                      |
|null        |        |70000?           |~70000?                |
|null        |null    |null             |null                   |
|null        |EUR     |30000OS          |30000 euros            |
|null        |USD     |36.5000/YEAR     |36.5K USD / YEAR       |
|null        |BRL     |90000/22500      |90000 BRL / 22500 USD  |
|null        |null    |null             |null                   |
|null        |        |G23              |g23                    |
|null        |        |15000(NOW)       |15k (now)              |
|null        |        |ABOUT2           |about $2               |
|null        |        |110000-120000    |110000-120000          |
|null     

One valid case of `annual_base_pay` value was found on `annual_base_pay = 'NOK 450000'`. The problem was that the RegEx used could not remove the whitespace from the string due to the fact that it is actually a different symbol on Unicode notation (`u'\xa0'`). This symbol was added manually to the RegEx, eliminating 16 of false-negative cases on the dataset, but further "whitespace" symbols could be present.

In [14]:
spark.sql(f'''
    SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                ELSE NULL 
                END AS final_salary,
                currency,
                salary,
                annual_base_pay
    FROM (
        SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~ ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary,
               regexp_extract(upper(annual_base_pay), '({currencies_regex}|{currency_symbols_regex})') AS currency,
               annual_base_pay 
        FROM data_3
    )
    WHERE currency = 'NOK'
    ORDER BY 1
''').show(truncate=False) 

+------------+--------+-------+---------------+
|final_salary|currency|salary |annual_base_pay|
+------------+--------+-------+---------------+
|null        |NOK     | 450000|NOK 450000     |
+------------+--------+-------+---------------+



In [15]:
spark.sql(f'''
    SELECT final_salary IS NULL, count(1)
    FROM (
        SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                    THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                    ELSE NULL 
                    END AS final_salary
        FROM (
            SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~ ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary
            FROM data_3
            WHERE annual_base_pay IS NOT NULL
        )
    )
    GROUP BY 1
''').show(truncate=False) 

+----------------------+--------+
|(final_salary IS NULL)|count(1)|
+----------------------+--------+
|true                  |87      |
|false                 |3689    |
+----------------------+--------+



In [16]:
spark.sql(f'''
    SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                ELSE NULL 
                END AS final_salary,
                currency,
                salary,
                annual_base_pay
    FROM (
        SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~ ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary,
               regexp_extract(upper(annual_base_pay), '({currencies_regex}|{currency_symbols_regex})') AS currency,
               annual_base_pay 
        FROM data_3
    )
    WHERE currency = 'NOK'
''').collect()

[Row(final_salary=None, currency='NOK', salary='\xa0450000', annual_base_pay='NOK\xa0450000')]

In [17]:
unicode_whitespace = u'\xa0'

In [18]:
spark.sql(f'''
    SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                ELSE NULL 
                END AS final_salary,
                currency,
                salary,
                annual_base_pay
    FROM (
        SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary,
               regexp_extract(upper(annual_base_pay), '({currencies_regex}|{currency_symbols_regex})') AS currency,
               annual_base_pay 
        FROM data_3
    )
    WHERE currency = 'NOK'
''').show(truncate=False) 

+------------+--------+------+---------------+
|final_salary|currency|salary|annual_base_pay|
+------------+--------+------+---------------+
|45000000    |NOK     |450000|NOK 450000     |
+------------+--------+------+---------------+



In [19]:
spark.sql(f'''
    SELECT final_salary IS NULL, count(1)
    FROM (
        SELECT CASE WHEN regexp_like(salary, '^[0-9.,]+$')
                    THEN regexp_replace(salary, '[.,]', '') || if(regexp_like(salary, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                    ELSE NULL 
                    END AS final_salary
        FROM (
            SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary
            FROM data_3
            WHERE annual_base_pay IS NOT NULL
        )
    )
    GROUP BY 1
''').show(truncate=False) 

+----------------------+--------+
|(final_salary IS NULL)|count(1)|
+----------------------+--------+
|true                  |71      |
|false                 |3705    |
+----------------------+--------+



In [20]:
spark.sql(f'''
    SELECT *
    FROM (
        SELECT annual_base_pay, regexp_replace(regexp_replace(regexp_replace(upper(annual_base_pay), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS salary
        FROM data_3
    )
    WHERE regexp_like(salary, '^[0-9.,]+$') = FALSE
''').show(100, truncate=False) 

+------------------------------------+----------------------------+
|annual_base_pay                     |salary                      |
+------------------------------------+----------------------------+
|$24/hr                              |24/HR                       |
|36.5K USD / YEAR                    |36.5000/YEAR                |
|55k - 60k                           |55000-60000                 |
|j                                   |J                           |
|15k (now)                           |15000(NOW)                  |
|60000PLN (circa 16000$)             |60000(CIRCA16000)           |
|110000-120000                       |110000-120000               |
|1 rare pepe                         |1RAREPEPE                   |
|~70000?                             |70000?                      |
|30000 euros                         |30000OS                     |
|90000 BRL / 22500 USD               |90000/22500                 |
|g23                                 |G23       

In [21]:
spark.sql(f'''
    SELECT final_bonus IS NULL, count(1)
    FROM (
        SELECT CASE WHEN regexp_like(bonus, '^[0-9.,]+$')
                    THEN regexp_replace(bonus, '[.,]', '') || if(regexp_like(bonus, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                    ELSE NULL 
                    END AS final_bonus
        FROM (
            SELECT regexp_replace(regexp_replace(regexp_replace(upper(annual_bonus), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS bonus
            FROM data_3
            WHERE annual_bonus IS NOT NULL
        )
    )
    GROUP BY 1
''').show(truncate=False) 

+---------------------+--------+
|(final_bonus IS NULL)|count(1)|
+---------------------+--------+
|true                 |184     |
|false                |2485    |
+---------------------+--------+



In [22]:
spark.sql(f'''
    SELECT CASE WHEN regexp_like(bonus, '^[0-9.,]+$')
                THEN regexp_replace(bonus, '[.,]', '') || if(regexp_like(bonus, '^[0-9.,]+?[.,][0-9]{2}$'), '', '00')
                ELSE NULL 
                END AS final_bonus,
                annual_bonus
    FROM (
        SELECT annual_bonus, regexp_replace(regexp_replace(regexp_replace(upper(annual_bonus), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(k|K)', '000'), '(?<=[0-9])(m|M)', '000000') AS bonus
        FROM data_3
        WHERE annual_bonus IS NOT NULL
    )
    WHERE regexp_like(bonus, '^[0-9.,]+$') = FALSE
''').show(200, truncate=False) 

+-----------+--------------------------------------------------------+
|final_bonus|annual_bonus                                            |
+-----------+--------------------------------------------------------+
|null       |10%                                                     |
|null       |15000 (Just for first 4 months as a hire)               |
|null       |10%                                                     |
|null       |2 - 3%                                                  |
|null       |10%                                                     |
|null       |2%                                                      |
|null       |25%                                                     |
|null       |$0-$50,000                                              |
|null       |1%                                                      |
|null       |variable, around 10% of base salary (6000$)             |
|null       |15%                                                     |
|null 

`years_of_experience` column is not a range-based value, but a free-form text value, so it could not be directly converted to a numerical representation. 

In [23]:
spark.sql(f'''
    SELECT DISTINCT years_of_experience
    FROM data_3
    ORDER BY 1
''').show(truncate=False)

+-------------------+
|years_of_experience|
+-------------------+
|null               |
|-1                 |
|-10                |
|-114               |
|-13                |
|-16                |
|-3                 |
|-6                 |
|-9001              |
|0                  |
|0 (Just graduated) |
|0.2                |
|0.5                |
|0.6                |
|0.75               |
|0.9                |
|1                  |
|1 of employment    |
|1.2                |
|1.25               |
+-------------------+
only showing top 20 rows



Also, many values where clearly invalid due to the fact of being negatives, and were removed (turned to NULL).

In [24]:
spark.sql(f'''
    SELECT CAST(years_of_experience AS DOUBLE) > 0, count(1)
    FROM data_3
    WHERE years_of_experience IS NOT NULL
    GROUP BY 1
''').show(truncate=False)

+-----------------------------------------+--------+
|(CAST(years_of_experience AS DOUBLE) > 0)|count(1)|
+-----------------------------------------+--------+
|null                                     |5       |
|true                                     |3317    |
|false                                    |120     |
+-----------------------------------------+--------+



In [25]:
spark.sql(f'''
    SELECT years_of_experience
    FROM data_3
    WHERE years_of_experience IS NOT NULL
    AND CAST(years_of_experience AS DOUBLE) IS NULL
''').show(truncate=False)

+-------------------+
|years_of_experience|
+-------------------+
|1 of employment    |
|<1                 |
|0 (Just graduated) |
|10+ years          |
|5 years            |
+-------------------+



In [26]:
def outer_monetary_query_expression(column: str):
    return f'''
        CAST(
            CAST(CASE WHEN regexp_like({column}, '^[0-9.,]+$')
                      THEN regexp_replace({column}, '[.,]', '') || if(regexp_like({column}, '^[0-9.,]+?[.,][0-9]{{2}}$'), '', '00')
                      ELSE NULL 
                 END
                 AS LONG
            ) / 100
            AS DECIMAL(15,2)
        )
    '''

def inner_monetary_query_expression(column: str):
    return f"regexp_replace(regexp_replace(regexp_replace(upper({column}), '{currencies_regex}|{currency_symbols_regex}|[$~{unicode_whitespace} ]', ''), '(?<=[0-9])(K)', '000'), '(?<=[0-9])(M)', '000000') AS {column}"


inner_annual_bonus_expr = inner_monetary_query_expression('annual_bonus')
outer_annual_bonus_expr = outer_monetary_query_expression('annual_bonus')

inner_annual_base_pay_expr = inner_monetary_query_expression('annual_base_pay')
outer_annual_base_pay_expr = outer_monetary_query_expression('annual_base_pay')

inner_years_of_experience_expr = "CAST(years_of_experience AS DOUBLE) AS years_of_experience"
outer_years_of_experience_expr = 'if(years_of_experience >= 0, years_of_experience, NULL)'

timestamp_expr = "to_timestamp(timestamp, 'M/d/yyyy HH:mm:ss') AS timestamp" 
location_expr = 'trim(location) AS location'
job_title_expr = 'trim(job_title) AS job_title'
currency_expr = f"regexp_extract(upper(annual_base_pay), '({currencies_regex}|{currency_symbols_regex})') AS currency"

final_df = spark.sql(f'''
    SELECT timestamp,
           {outer_annual_base_pay_expr} AS annual_salary,
           {outer_annual_bonus_expr} AS annual_bonus,
           currency,
           {outer_years_of_experience_expr} years_experience,
           location,
           job_title
    FROM (
        SELECT {inner_annual_bonus_expr},
               {inner_annual_base_pay_expr},
               {inner_years_of_experience_expr},
               {timestamp_expr},
               {location_expr},
               {job_title_expr},
               {currency_expr}
        FROM data_3
    )
''')

final_df.printSchema()

final_df.show()

root
 |-- timestamp: timestamp (nullable = true)
 |-- annual_salary: decimal(15,2) (nullable = true)
 |-- annual_bonus: decimal(15,2) (nullable = true)
 |-- currency: string (nullable = true)
 |-- years_experience: double (nullable = true)
 |-- location: string (nullable = true)
 |-- job_title: string (nullable = true)

+-------------------+-------------+------------+--------+----------------+-------------------+--------------------+
|          timestamp|annual_salary|annual_bonus|currency|years_experience|           location|           job_title|
+-------------------+-------------+------------+--------+----------------+-------------------+--------------------+
|               null|         null|        null|    null|            null|               null|                null|
|2016-03-21 12:54:49|    122000.00|        null|        |            18.0|        Raleigh, NC|  Software Developer|
|2016-03-21 12:58:52|    125000.00|        0.00|        |            13.0|  San Francisco, CA|    

In [27]:
final_df.write.parquet('./optimized_data/data_3')