### Spuštění PySpark

`export PYSPARK_PYTHON=python3`

`pyspark --master yarn --num-executors 2 --executor-memory 4G --conf spark.ui.port=1<ddmm>`, kde `<ddmm>` je váš den a měsíc narození, např. `spark.ui.port=10811`

In [None]:
# helpful import

from pyspark.sql import functions as F

# Load the GTFS trips table; the first line of the file supplies the column names.
# (The copy under hive_02/PID_GTFS/... can be used instead.)
df = spark.read.option('header', True).csv('/your_user/spark_05/PID_GTFS/trips.txt')


In [None]:
# 1.2 Cache the file in memory.
# cache() only marks the DataFrame for caching; the data is materialised by the
# first action that runs on it (e.g. the show()/count() calls in the next cells).

df.cache()

In [None]:
# 1.3 Print a sample of the data.
# show() without arguments prints the first 20 rows.

df.show()

In [None]:
# 1.4 Print a basic exploration (summary statistics) of the data.
# describe() only builds a new DataFrame with count/mean/stddev/min/max per column;
# it prints nothing by itself, so show() is needed to display the statistics.

df.describe().show()

In [None]:
# 1.5 Find the total number of records (rows) in the DataFrame. (72 530)
# count() is an action: it triggers the actual read of the file.

df.count()

In [None]:
# 1.6 Find out how many unique routes there are. (816)

# Keep one row per route_id; the number of remaining rows is the number of routes.
df_2 = df.select('route_id').dropDuplicates()
df_2.count()

In [None]:
# 1.7 Find the lowest and the highest route number. (1, 3360)

# Route ids look like 'L1': split on 'L' and read the part after it as an integer.
route_number = F.split('route_id', 'L')[1].cast('integer')
df_2 = df.withColumn('route_id_number', route_number)
df_2.agg(F.min('route_id_number'), F.max('route_id_number')).show()

+--------------------+--------------------+
|min(route_id_number)|max(route_id_number)|
+--------------------+--------------------+
|                   1|                3360|
+--------------------+--------------------+

In [None]:
# 1.8 Find the number of trips in each direction for route L1. (331, 313)

# Restrict to route L1, then count the trips per direction_id value.
df_l1 = df.where(df.route_id == 'L1')
trips_per_direction = df_l1.groupBy('direction_id').count()
trips_per_direction.show()
+------------+-----+
|direction_id|count|
+------------+-----+
|           0|  331|
|           1|  313|
+------------+-----+

In [None]:
# 1.9 Find the count of night trips. (1258)

# The is_night flag is taken from the joined result, so every trip is first
# matched with its route via route_id (presumably is_night comes from routes.txt).
# NOTE(review): this relative path differs from the absolute trips.txt location
# used when loading df — confirm both point at the same GTFS export.
df_routes = spark.read.option('header', True).csv('data/trips/routes.txt')
df_all = df.join(df_routes, df['route_id'] == df_routes['route_id'])
df_all.where(df_all['is_night'] == '1').count()

1258

In [None]:
# 1.10 Additionally: create a temporary view from the DataFrame and redo 1.5–1.8 using SQL.

# Example: register the DataFrame under the name 'Trips' and query it.
df.createOrReplaceTempView('Trips')
spark.sql('select * from Trips limit 10').show()

# 1.5 using SQL and the temp view.
# NOTE: count(route_id) skips rows whose route_id is NULL; count(*) would count every row.
spark.sql('select count(route_id) from Trips').show()
+---------------+
|count(route_id)|
+---------------+
|          93580|
+---------------+

In [None]:
# 2 Keep only the records whose year lies within the 1950--2018 range and
# determine how many records remain in the DataFrame. (362 221)

# The file is read without a header row, so Spark names the columns _c0, _c1, ...
songs_df = (
    spark.read
    .format('csv')
    .option('header', 'false')
    .option('delimiter', ',')
    .option('inferSchema', 'true')
    .load('data/lyrics.csv')
)

# Give the first six positional columns meaningful names.
for position, column_name in enumerate(['id', 'name', 'year', 'singer', 'genre', 'text']):
    songs_df = songs_df.withColumnRenamed('_c%d' % position, column_name)

# Drop the out-of-range years; cache the result because the following cells reuse it.
songs_df2 = songs_df.filter('year >= 1950 and year <= 2018')
songs_df2.cache()
songs_df2.count()

362221

In [None]:
# 2.2 Edit the lyrics

# Replace missing values in the lyrics column with empty strings.
songs_df2 = songs_df2.fillna('', 'text')

# Convert the text to lower case.
songs_df2 = songs_df2.withColumn('text', F.lower(F.col('text')))

# Replace all non-alphanumeric characters with a space.
# The pattern is a raw string: '\W' is not a valid Python escape sequence and
# triggers a warning in a normal string literal. In the regex, \W means
# "non-word character", so letters, digits and underscores are kept.
songs_df2 = songs_df2.withColumn('text', F.regexp_replace(F.col('text'), r'[\W ]', ' '))

# Replace multiple space sequences with a single space.
songs_df2 = songs_df2.withColumn('text', F.regexp_replace(F.col('text'), '[ ]+', ' '))

# Omit spaces on both edges of the text (trim function).
songs_df2 = songs_df2.withColumn('text', F.trim(F.col('text')))


In [None]:
# 2.3 Add a word-count column (named slova_poc in this notebook) holding the
# number of all words in the song.

# After the clean-up in 2.2 the words are separated by single spaces.
word_count = F.size(F.split(F.col('text'), ' '))
songs_df2 = songs_df2.withColumn('slova_poc', word_count)
songs_df2.where('text=""').show()

# Splitting an empty string still yields one (empty) element, so songs without
# lyrics are forced to a word count of 0.
corrected_count = F.when(F.col('text') == '', 0).otherwise(F.col('slova_poc'))
songs_df2 = songs_df2.withColumn('slova_poc', corrected_count)
songs_df2.show()

In [None]:
# 2.4 Cache the resulting DataFrame again.
# songs_df2 was reassigned by the cleaning and word-count steps after the
# earlier cache() call, so the current version has to be marked for caching anew.

songs_df2.cache()

In [None]:
# 3.1 Find out how many artists have at least 500 songs and who they are.
# The result is kept in its own DataFrame (singers) for reuse in Assignment 4.3.
# (19; Bob Dylan 614, Chris Brown 655, etc.)

singers = (
    songs_df2
    .groupBy('singer')
    .count()
    .withColumnRenamed('count', 'pocet')  # pocet = number of songs per artist
    .filter("pocet >= 500")
)
singers.show()
singers.count()
19


In [None]:
# 3.2 Considering only songs with non-empty lyrics (i.e., word counts greater than 0),
# which artist with at least 100 such songs has the highest average number of words
# per song? (eightball-mjg 627.9)

# The aggregates are named explicitly with alias(). A dict-based agg() followed by a
# positional toDF() depends on the order in which the dict entries come back as
# columns, which is not guaranteed, so 'prumer' and 'pocet' could end up swapped.
songs_df2.filter('slova_poc > 0') \
    .groupBy('singer') \
    .agg(F.avg('slova_poc').alias('prumer'),   # prumer = average words per song
         F.count('*').alias('pocet')) \
    .filter('pocet>=100') \
    .orderBy('prumer', ascending=False) \
    .show()


In [None]:
# 4.1 Find the 20 most frequently occurring words of at least two characters in the
# song lyrics. Each word counts as many times as it appears in the text; the
# DataFrame is processed through RDD transformations.

word_counts = (
    songs_df2.rdd
    .flatMap(lambda row: row['text'].split(" "))      # one record per word
    .filter(lambda word: len(word) > 1)               # drop one-letter and empty tokens
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)    # total occurrences per word
)
words_top = word_counts.sortBy(lambda pair: pair[1], ascending=False)

words_top.take(20)

[(u'the', 2031323), (u'you', 1988108), (u'to', 1181198), 
(u'and', 1153519), (u'it', 910863), (u'me', 859221), 
(u'my', 717654), (u'in', 685757), (u'of', 567866), 
(u'that', 555589), (u'your', 499245), (u'on', 467524), 
(u'we', 454191), (u'all', 405964), (u'is', 389578), 
(u'for', 368171), (u'be', 363155), (u'can', 354989), 
(u'so', 324009), (u'no', 313001)]


# Every line of the stop-word file is treated as one stop word; a set gives fast
# membership tests inside the filter below.
stopw = set(sc.textFile("/your_user/spark_05/stopwords.txt").collect())

# Same word count as above, additionally skipping the stop words.
word_counts_no_stop = (
    songs_df2.rdd
    .flatMap(lambda row: row['text'].split(" "))
    .filter(lambda word: len(word) > 1 and word not in stopw)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)
words_top2 = word_counts_no_stop.sortBy(lambda pair: pair[1], ascending=False)

words_top2.take(20)

[(u'love', 310137), (u'don', 308914), (u'll', 222791), 
(u'time', 174700), (u've', 164238), (u'baby', 141879), 
(u'yeah', 131658), (u'life', 113901), (u'feel', 104905), 
(u'la', 103182), (u'gonna', 96730), (u'heart', 95538), 
(u'day', 90366), (u'night', 89188), (u'man', 88579), 
(u'ain', 86083), (u'wanna', 80072), (u'girl', 79097), 
(u'de', 77752), (u'good', 70872)]

In [None]:
# 4.2 Choose three words from the set of most frequent non-stop-words and add one
# column per word with a 1/0 flag telling whether the word occurs at least once
# in the song.

for word in ('love', 'like', 'know'):
    # regexp_extract returns the captured word on a whole-word match, '' otherwise.
    first_match = F.regexp_extract(songs_df2['text'], r'\b(%s)\b' % word, 1)
    songs_df2 = songs_df2.withColumn('is_' + word, F.when(first_match == word, 1).otherwise(0))
songs_df2.show()


In [None]:
# 4.3 For performers with at least 500 songs (see 3.1), find out what proportion of
# their songs contains each of the three words selected in Assignment 4.2.

flag_columns = ['is_love', 'is_like', 'is_know']

# The inner join with `singers` keeps only the songs of the selected performers.
songs_of_top_singers = singers.join(songs_df2, 'singer').select('singer', *flag_columns)

# The average of a 0/1 flag is the share of songs in which the word occurs.
singers_words = songs_of_top_singers \
    .groupBy('singer') \
    .agg({flag: 'avg' for flag in flag_columns})
singers_words.show()

