In [4]:
# Import and create a new SQLContext 
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [6]:
# Read the country CSV file into an RDD.
country_lines = sc.textFile('file:///home/cloudera/Downloads/big-data-3/final-project/country-list.csv')

In [8]:
# Convert each line into a pair of words
country_words = country_lines.map(lambda line: line.split(","))

In [10]:
# Convert each pair of words into a tuple
country_tuples = country_words.map(lambda word:(word))

In [11]:
# Create the DataFrame, look at schema and contents
countryDF = sqlContext.createDataFrame(country_tuples, ["country", "code"])
countryDF.printSchema()
countryDF.take(3)

root
 |-- country: string (nullable = true)
 |-- code: string (nullable = true)



[Row(country='Afghanistan', code=' AFG'),
 Row(country='Albania', code=' ALB'),
 Row(country='Algeria', code=' ALG')]

In [15]:
# Read tweets CSV file into RDD of lines
tweet_lines = sc.textFile('file:///home/cloudera/Downloads/big-data-3/final-project/imported_soccer_tweets.csv')
tweet_lines.take(3)

['{"_id":{"$oid":"578ffa8e7eb9513f4f55a935"},"user_name":"koteras","retweet_count":0,"tweet_followers_count":461,"source":"\\u003ca href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\"\\u003eTwitter for iPhone\\u003c/a\\u003e","coordinates":null,"tweet_mentioned_count":1,"tweet_ID":"755891629932675072","tweet_text":"RT @ochocinco: I beat them all for 10 straight hours #FIFA16KING  https://t.co/BFnV6jfkBL","user":{"CreatedAt":{"$date":"2011-12-27T09:04:01.000Z"},"FavouritesCount":5223,"FollowersCount":461,"FriendsCount":619,"UserId":447818090,"Location":"501"}}',
 '{"_id":{"$oid":"578ffa8f7eb9513f4f55a937"},"user_name":"AllieLovesR5_1D","retweet_count":0,"tweet_followers_count":4601,"source":"\\u003ca href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\"\\u003eTwitter for iPhone\\u003c/a\\u003e","coordinates":null,"tweet_mentioned_count":3,"tweet_ID":"755891632759681024","tweet_text":"RT @NiallOfficial: @Louis_Tomlinson @socceraid when I retired from playing becau

In [16]:
# Clean the data: some tweets are empty. Remove the empty tweets using filter() 
tweet_lines = tweet_lines.filter(lambda x:len(x) > 0)

In [17]:
# Perform WordCount on the cleaned tweet texts. (note: this is several lines.)
tweet_words = tweet_lines.flatMap(lambda line: line.split(" "))
tweet_tuples = tweet_words.map(lambda w : (w, 1))
word_count = tweet_tuples.reduceByKey(lambda a, b : (a+b))
word_count.take(1)

[('', 2870)]

In [19]:
# Create the DataFrame of tweet word counts
tweetDF = sqlContext.createDataFrame(word_count, ["word", "count"])
tweetDF.printSchema()
tweetDF.take(1)

root
 |-- word: string (nullable = true)
 |-- count: long (nullable = true)



[Row(word='', count=2870)]

In [20]:
# Join the country and tweet data frames (on the appropriate column)
joinDF = countryDF.join(tweetDF, countryDF.country == tweetDF.word)
joinDF.take(5)

[Row(country='Thailand', code=' THA', word='Thailand', count=1),
 Row(country='Iceland', code=' ISL', word='Iceland', count=2),
 Row(country='Mexico', code=' MEX', word='Mexico', count=2),
 Row(country='Wales', code=' WAL', word='Wales', count=20),
 Row(country='Denmark', code=' DEN', word='Denmark', count=1)]

In [23]:
# Question 1: number of distinct countries mentioned
joinDF.select('code').distinct().count()

49

In [24]:
# Question 2: number of countries mentioned in tweets.
from pyspark.sql.functions import sum
joinDF.agg(sum("count")).first()

Row(sum(count)=384)

In [25]:
# Table 1: top three countries and their counts.
from pyspark.sql.functions import desc
sortDF = joinDF.sort(desc("count"))
sortDF.show(3)

+-------+----+-------+-----+
|country|code|   word|count|
+-------+----+-------+-----+
|Nigeria| NGA|Nigeria|   54|
| France| FRA| France|   39|
|England| ENG|England|   31|
+-------+----+-------+-----+
only showing top 3 rows



In [30]:
# Table 2: counts for Wales, Iceland, and Japan.
from pyspark.sql.functions import col
table2 = joinDF.where((col("country") == "Wales") | (col("country") == "Iceland") | (col("country") == "England"))
table2.show()

+-------+----+-------+-----+
|country|code|   word|count|
+-------+----+-------+-----+
|Iceland| ISL|Iceland|    2|
|  Wales| WAL|  Wales|   20|
|England| ENG|England|   31|
+-------+----+-------+-----+



In [31]:
table3 = joinDF.where((col("country") == "Wales") | (col("country") == "Kenya") | (col("country") == "Netherlands"))
table3.show()

+-----------+----+-----------+-----+
|    country|code|       word|count|
+-----------+----+-----------+-----+
|      Wales| WAL|      Wales|   20|
|Netherlands| NED|Netherlands|   13|
|      Kenya| KEN|      Kenya|    3|
+-----------+----+-----------+-----+



In [32]:
from pyspark.sql.functions import avg 

joinDF.agg(avg("count")).first()

Row(avg(count)=7.836734693877551)