In [None]:
from pyspark import SparkContext, SparkConf
import os, shutil

In [None]:
#os.environ["PYSPARK_PYTHON"] = "/usr/bin/env python3"
# Spark settings for this notebook: a local master with 4 worker threads,
# registered under the application name "task1".
master = "local[4]"
appName = "task1"
conf = SparkConf()
conf.setMaster(master)
conf.setAppName(appName)

In [None]:
# getOrCreate reuses the already-running context when this cell is executed
# again; constructing SparkContext(conf=conf) a second time in the same kernel
# fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Limit Spark's log output to warnings and errors so cell output stays readable.
sc.setLogLevel("WARN")

## Creating RDD

In [None]:
# Read the tweet dump as an RDD of text lines (one element per line of the file).
rdd = sc.textFile("data/geotweets.tsv")

### Creating a sample RDD for testing

In [None]:
# Roughly 10% of the lines, drawn without replacement; the fixed seed makes
# the sample the same on every run.
sampled_rdd = rdd.sample(withReplacement=False, fraction=0.1, seed=5)

### Splitting each line on tabs into a list of fields

In [None]:
# Turn every raw line into the list of its tab-separated fields.
rdd_list = rdd.map(lambda line: line.split('\t'))

In [None]:
# Sanity check: show the fields of the first parsed row.
print(rdd_list.first())

In [None]:
# Same tab split as for the full data set, applied to the sample.
sampled_rdd_list = sampled_rdd.map(lambda line: line.split('\t'))

In [None]:
# Pair every sampled row with its field at index 1 as the key: (key, row).
new_list = sampled_rdd_list.map(lambda fields: (fields[1], fields))
print(new_list.first())

# Task 1

In [None]:
# Number of lines in the input file (assumes one tweet per line — TODO confirm).
number_of_tweets = rdd.count()
print(number_of_tweets)

In [None]:
# Count the distinct values of field 6 (treated as the user of each tweet).
number_of_users = (
    rdd_list
    .map(lambda fields: fields[6])
    .distinct()
    .count()
)
print(number_of_users)

In [None]:
# Count the distinct values of field 1 (treated as the country of each tweet).
number_of_countries = (
    rdd_list
    .map(lambda fields: fields[1])
    .distinct()
    .count()
)
print(number_of_countries)

In [None]:
# Count the distinct values of field 4 (treated as the place of each tweet).
number_of_places = (
    rdd_list
    .map(lambda fields: fields[4])
    .distinct()
    .count()
)
print(number_of_places)

In [None]:
# Count the distinct values of field 5 (treated as the language of each tweet).
number_of_languages = (
    rdd_list
    .map(lambda fields: fields[5])
    .distinct()
    .count()
)
print(number_of_languages)

In [None]:
# Smallest value of field 11 (latitude), parsed as float.
minimum_latitude = rdd_list.map(lambda fields: float(fields[11])).min()
print(minimum_latitude)

In [None]:
# Smallest value of field 12 (longitude), parsed as float.
minimum_longitude = rdd_list.map(lambda fields: float(fields[12])).min()
print(minimum_longitude)

In [None]:
# Largest value of field 11 (latitude), parsed as float.
maximum_latitude = rdd_list.map(lambda fields: float(fields[11])).max()
print(maximum_latitude)

In [None]:
# Largest value of field 12 (longitude), parsed as float.
maximum_longitude = rdd_list.map(lambda fields: float(fields[12])).max()
print(maximum_longitude)

In [None]:
# Field 10 is used as the tweet text by the length statistics that follow.
tweet_text = rdd_list.map(lambda fields: fields[10])
# Printing the RDD object itself only shows its repr (e.g. "PythonRDD[...]"),
# never any data, so show the first element as a sanity check instead.
print(tweet_text.first())

In [None]:
# Length of every tweet text in characters, then the mean over all tweets.
tweet_in_characters = tweet_text.map(len)
average_tweet_in_characters = tweet_in_characters.mean()
print(average_tweet_in_characters)

In [None]:
# Split on any run of whitespace: split(' ') yields an empty "word" for every
# extra consecutive space (and one word for an empty text), which inflates the
# average word count.
tweet_in_words = tweet_text.map(lambda text: len(text.split()))
average_tweet_in_words = tweet_in_words.mean()
print(average_tweet_in_words)

### Combining the results into an RDD and writing them to file

In [None]:
resultsPath = 'results/result_1.tsv'
# saveAsTextFile refuses to write into an existing directory, so remove the
# output of a previous run first.
if os.path.isdir(resultsPath):
    shutil.rmtree(resultsPath)

# All Task 1 statistics in a fixed order, squeezed into a single partition so
# that exactly one part file is written, one value per line.
results = sc.parallelize([
    number_of_tweets,
    number_of_users,
    number_of_countries,
    number_of_places,
    number_of_languages,
    minimum_latitude,
    minimum_longitude,
    maximum_latitude,
    maximum_longitude,
    average_tweet_in_characters,
    average_tweet_in_words,
]).coalesce(1)
results_tsv = results.saveAsTextFile(resultsPath)

# Task 2

### Counting the number of tweets per country (the counts are collected to the driver as (country, count) pairs, not kept as an RDD)

In [None]:
# Tweets per country (field 1), counted and returned to the driver. Despite the
# variable name this is a view of (country, count) pairs from a local dict,
# not an RDD.
new_rdd = rdd_list.map(lambda fields: str(fields[1])).countByValue().items()

### Sort twice: first alphabetically ascending on country name, then numerically descending on number of tweets. This works because Python's sort is stable, so countries with the same number of tweets stay in alphabetical order.

In [None]:
# Two stable in-place sorts: alphabetical by country first, then descending by
# tweet count, so countries with equal counts stay in alphabetical order.
sorted_dict = list(new_rdd)
sorted_dict.sort(key=lambda pair: pair[0])
sorted_dict.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
#from operator import add, itemgetter
#sorted_dict_2 = sorted(new_rdd, key=itemgetter(1,0))
#new_rdd2 = sampled_rdd_list.map(lambda x: (x[5], 1)).reduceByKey(add)
#sorted_dict_2 = sorted(new_rdd2.collect(), key=lambda x: (-x[1],) + (x[0]))

In [None]:
# Inspect the sorted (country, count) pairs.
print(sorted_dict)

### Saving result as RDD

In [None]:
# Back to an RDD of (country, count) pairs, plus a tab-separated text form of
# each pair for writing to file.
result_task2_rdd = sc.parallelize(sorted_dict)
result_task2 = result_task2_rdd.map(lambda pair: '{}\t{}'.format(*pair))

### Writing results to text file

In [None]:
resultsPath = 'results/result_2.tsv'
# Remove the output directory of a previous run; saveAsTextFile will not
# overwrite an existing directory.
if os.path.isdir(resultsPath):
    shutil.rmtree(resultsPath)
# coalesce(1) puts all lines into one partition so a single part file is written.
result_task2.coalesce(1).saveAsTextFile(resultsPath)

# Task 3

In [None]:
# Countries with at most 10 tweets (the name says "under 10", but a count of
# exactly 10 is included as well).
countries_under_10 = result_task2_rdd.filter(lambda pair: pair[1] <= 10)
print(countries_under_10.collect())

In [None]:
# Three pair RDDs keyed by country (field 1): one carrying (lat, lon) tuples,
# one carrying only the latitude (field 11), one only the longitude (field 12).
countries_with_latlon = rdd_list.map(
    lambda fields: (str(fields[1]), (float(fields[11]), float(fields[12])))
)
countries_with_lat = (
    rdd_list
    .keyBy(lambda fields: str(fields[1]))
    .mapValues(lambda fields: float(fields[11]))
)
countries_with_lon = (
    rdd_list
    .keyBy(lambda fields: str(fields[1]))
    .mapValues(lambda fields: float(fields[12]))
)

In [None]:
# Drop every record whose country key also appears in countries_under_10, so
# only countries with more than 10 tweets remain.
countries_over_10_with_lat = countries_with_lat.subtractByKey(countries_under_10)
countries_over_10_with_lon = countries_with_lon.subtractByKey(countries_under_10)
countries_over_10_with_latlon = countries_with_latlon.subtractByKey(countries_under_10)

In [None]:
#print(countries_with_latlon.count())
#print(countries_over_10_with_latlon.count())

In [None]:
#countries_over_10_with_lat.take(1)
#countries_over_10_with_lon.take(1)
#countries_over_10_with_lat.join(countries_over_10_with_lon).take(1)
#countries_over_10_with_lat.groupByKey().mapValues(list).take(1)

In [None]:
def calculateCentroid(latlon):
    """Return the centroid ``(mean latitude, mean longitude)`` of a set of points.

    Args:
        latlon: a pair ``(latitudes, longitudes)``; each element is a non-empty
            sequence of numbers.

    Returns:
        A tuple ``(mean of latitudes, mean of longitudes)``.

    Raises:
        ZeroDivisionError: if either sequence is empty.
    """
    # The previous body read an undefined name `lon`, so every call raised
    # NameError; the longitudes are the second element of the pair.
    lat, lon = latlon[0], latlon[1]
    return (sum(lat) / len(lat), sum(lon) / len(lon))
    

In [None]:
def calculateCenter(listWithCoord):
    """Return the arithmetic mean of the numbers in ``listWithCoord``.

    Args:
        listWithCoord: a non-empty sized collection of numbers.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: if ``listWithCoord`` is empty.
    """
    total = sum(listWithCoord)
    count = len(listWithCoord)
    return total / count

In [None]:
# Average the latitudes and the longitudes per country separately, then join
# the two on country so each element is (country, (mean_lat, mean_lon)).
country_centroid_lat = (
    countries_over_10_with_lat
    .groupByKey()
    .mapValues(lambda coords: calculateCenter(list(coords)))
)
country_centroid_lon = (
    countries_over_10_with_lon
    .groupByKey()
    .mapValues(lambda coords: calculateCenter(list(coords)))
)
country_centroid_rdd = country_centroid_lat.join(country_centroid_lon)
print(country_centroid_rdd.first())

In [None]:
# One tab-separated line per country: name, mean latitude, mean longitude.
result_task3 = country_centroid_rdd.map(
    lambda entry: '{}\t{}\t{}'.format(entry[0], *entry[1])
)

In [None]:
# NOTE(review): the lines written here are tab-separated although the path ends
# in .csv, while the other result paths use .tsv — confirm which is intended.
resultsPath = 'results/result_3.csv'
# Remove the output directory of a previous run; saveAsTextFile will not
# overwrite an existing directory.
if os.path.isdir(resultsPath):
    shutil.rmtree(resultsPath)
# coalesce(1) puts all lines into one partition so a single part file is written.
result_task3.coalesce(1).saveAsTextFile(resultsPath)

In [None]:
import cartoframes
import pandas as pd
# Disabled one-off upload of the Task 3 result table to Carto. The API key is
# deliberately not spelled out here: secrets must not be stored in the
# notebook, not even inside comments.
#BASEURL = 'https://larshbj.carto.com'
#APIKEY = os.environ['CARTO_API_KEY']
#df = pd.read_csv('result_task3_carto.tsv', sep='\t')
#cc = cartoframes.CartoContext(base_url=BASEURL,
#                              api_key=APIKEY)
#cc.write(df, 'task3')

In [None]:
from cartoframes import Layer, BaseMap, styling
# Credentials are read from the environment so the API key never lives in the
# notebook or its version history. The key that used to be hardcoded here
# should be treated as compromised and rotated.
BASEURL = os.environ.get('CARTO_BASE_URL', 'https://larshbj.carto.com')
APIKEY = os.environ.get('CARTO_API_KEY')
if not APIKEY:
    raise RuntimeError(
        'Set the CARTO_API_KEY environment variable before running this cell'
    )
cc = cartoframes.CartoContext(base_url=BASEURL,
                              api_key=APIKEY)
# Static (non-interactive) map of the uploaded centroid table.
cc.map(layers=Layer('result_task3_carto_4',
                   size=7),
       interactive=False)