# Public_copy.ipynb
A public copy of "Private_notebook.ipynb" (may not yet be fully updated). If any fellow groupmates happen to be viewing this notebook, feel free to replace it with a copy of the most up-to-date version of "Private_notebook.ipynb" (if it is different) and make any edits you like.

In [1]:
import pyspark.sql as ps

In [2]:
sc = ps.SparkSession.builder \
    .config("spark.driver.memory", "2g") \
	.config("spark.executor.memory", "2g") \
    .config('spark.executor.instances', 4) \
	.appName("Data Exploration") \
	.getOrCreate()

# Data Exploration

## Preliminary Exploration

In [3]:
# Inspect schema of an initial load of the data.

rawload = sc.read.csv('merged_data.csv', header = True)
rawload.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- title: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- date: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- chart: string (nullable = true)
 |-- trend: string (nullable = true)
 |-- streams: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- album: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- available_markets: string (nullable = true)
 |-- af_danceability: string (nullable = true)
 |-- af_energy: string (nullable = true)
 |-- af_key: string (nullable = true)
 |-- af_loudness: string (nullable = true)
 |-- af_mode: string (nullable = true)
 |-- af_speechiness: string (nullable = true)
 |-- af_acousticness: string (nullable = true)
 |-- af_instrumentalness: strin

In [4]:
# Show the first 5 rows of raw data
rawload.head(5)

[Row(_c0='0', title='Chantaje (feat. Maluma)', rank='1', date='2017-01-01', artist='Shakira', url='https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg', region='Argentina', chart='top200', trend='SAME_POSITION', streams='253019.0', track_id='6mICuAdrwEjh6Y6lroV2Kg', album='El Dorado', popularity='78.0', duration_ms='195840.0', explicit='False', release_date='2017-05-26', available_markets="['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM'

In [5]:
# Rename the first column, which refers to a primary key that was blandly named '_c0' by default.

df = rawload.withColumnRenamed('_c0', 'ID')

In [6]:
# Count observations in the dataset.

df_ct = df.count()
print(f'There are {df_ct} observations in this dataset.')

There are 26174269 observations in this dataset.


## Getting Summary Statistics for Numeric Attributes

In [7]:
# As the above schema shows, all attributes assume the string type, even though many of them are numeric. View a
# single observation to determine which attributes are numeric.

print(f'SAMPLE OBSERVATION:\n\n{df.take(1)}')

SAMPLE OBSERVATION:

[Row(ID='0', title='Chantaje (feat. Maluma)', rank='1', date='2017-01-01', artist='Shakira', url='https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg', region='Argentina', chart='top200', trend='SAME_POSITION', streams='253019.0', track_id='6mICuAdrwEjh6Y6lroV2Kg', album='El Dorado', popularity='78.0', duration_ms='195840.0', explicit='False', release_date='2017-05-26', available_markets="['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'T

In [8]:
# Evaluating the above observation allows us to establish which attributes are numeric.

numeric_attributes = ['ID', 'rank', 'streams', 'popularity', 'duration_ms', 'af_danceability', 'af_energy',
                      'af_key', 'af_loudness', 'af_mode', 'af_speechiness', 'af_acousticness',
                      'af_instrumentalness', 'af_liveness', 'af_valence', 'af_tempo',  'af_time_signature']
df[numeric_attributes].show(5)

+---+----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
| ID|rank| streams|popularity|duration_ms|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+---+----+--------+----------+-----------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|  0|   1|253019.0|      78.0|   195840.0|          0.852|    0.773|   8.0|     -2.921|    0.0|        0.0776|          0.187|           3.05e-05|      0.159|     0.907| 102.034|              4.0|
|  1|   2|223988.0|      72.0|   259195.0|          0.663|     0.92|  11.0|      -4.07|    0.0|         0.226|        0.00431|           1.69e-05|      0.101|     0.533|  99.935|              4.0|
|  2|   3|21094

In [9]:
# We'll have to cast numeric attributes to floats to explore these attributes.

df = df.withColumns(dict(zip(numeric_attributes, [df[attribute].cast('float') for attribute in
                                                  numeric_attributes])))

In [10]:
# Confirm that type casting was correctly implemented.

print('UPDATED SCHEMA:\n')
df.printSchema()
print(f'\n\nSAMPLE OBSERVATION:\n\n{df.take(1)}')

UPDATED SCHEMA:

root
 |-- ID: float (nullable = true)
 |-- title: string (nullable = true)
 |-- rank: float (nullable = true)
 |-- date: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- chart: string (nullable = true)
 |-- trend: string (nullable = true)
 |-- streams: float (nullable = true)
 |-- track_id: string (nullable = true)
 |-- album: string (nullable = true)
 |-- popularity: float (nullable = true)
 |-- duration_ms: float (nullable = true)
 |-- explicit: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- available_markets: string (nullable = true)
 |-- af_danceability: float (nullable = true)
 |-- af_energy: float (nullable = true)
 |-- af_key: float (nullable = true)
 |-- af_loudness: float (nullable = true)
 |-- af_mode: float (nullable = true)
 |-- af_speechiness: float (nullable = true)
 |-- af_acousticness: float (nullable = true)
 |-- af_instrumentalness: f

In [11]:
# Now that we've casted numeric attributes to floats, we can generate summary statistics for them.

nas = df[numeric_attributes].describe()

In [12]:
# We can display summary statistics in a visually appealing format by implementing the following.

for attribute in numeric_attributes:
    nas[['summary', attribute]].show()

+-------+-----------------+
|summary|               ID|
+-------+-----------------+
|  count|         26174269|
|   mean|      1.3087134E7|
| stddev|7555860.770833806|
|    min|              0.0|
|    max|      2.6174268E7|
+-------+-----------------+

+-------+------------------+
|summary|              rank|
+-------+------------------+
|  count|          26172127|
|   mean|  80.9233979339929|
| stddev|59.185732716119134|
|    min|               1.0|
|    max|             200.0|
+-------+------------------+

+-------+------------------+
|summary|           streams|
+-------+------------------+
|  count|          20318240|
|   mean|55266.480036705936|
| stddev|209601.75006362287|
|    min|            1001.0|
|    max|       1.9749704E7|
+-------+------------------+

+-------+-----------------+
|summary|       popularity|
+-------+-----------------+
|  count|         25853279|
|   mean|46.71695659958646|
| stddev|32.81488206233617|
|    min|              0.0|
|    max|          19176.0|

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Py4JError: An error occurred while calling o151.showString

## Missing Data

In [None]:
# Determine how many null values there are per attribute.

nulls_dict = {}
for attribute in df.columns:
    null_IDs = df[['ID']].where(df[attribute].isNull())
    nulls_dict[attribute] = (null_IDs, null_IDs.count())
    print(f'Attribute "{attribute}" contains {nulls_dict[attribute][1]} nulls. {nulls_dict[}')

In [None]:
#Exploring categorical data

categorical_columns = ['region', 'chart', 'trend', 'explicit']
for column in categorical_columns:
    df.groupBy(column).count().show()