In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
master = "local[4]"
appName = "task1"
conf = SparkConf().setAppName(appName).setMaster(master)

In [3]:
sc = SparkContext(conf=conf)

In [4]:
sc.setLogLevel("WARN")

## Creating RDD

In [5]:
rdd = sc.textFile("data/geotweets.tsv")

### Creating a sample RDD for testing

In [6]:
sampled_rdd = rdd.sample(False, 0.1, 5)

### Creating array splitting on tabs

In [9]:
sampled_rdd_list = sampled_rdd.map(lambda x: x.split('\t'))

In [53]:
number_of_tweets = sampled_rdd.count()
print number_of_tweets

271230


In [20]:
number_of_users = sampled_rdd_list.map(lambda x: x[6]).distinct().count()
print(number_of_users)

122714


In [11]:
number_of_countries = sampled_rdd_list.map(lambda x: x[1]).distinct().count()
print(number_of_countries)

In [12]:
number_of_places = sampled_rdd_list.map(lambda x: x[4]).distinct().count()
print(number_of_places)

In [13]:
number_of_languages = sampled_rdd_list.map(lambda x: x[5]).distinct().count()
print(number_of_languages)

In [29]:
minimum_latitude = sampled_rdd_list.map(lambda x: float(x[11])).reduce(lambda a, b: min(a,b))
print(minimum_latitude)

-54.80753659


In [28]:
minimum_longitude = sampled_rdd_list.map(lambda x: float(x[12])).reduce(lambda a, b: min(a,b))
print(minimum_longitude)

-159.80555556


In [32]:
maximum_latitude = sampled_rdd_list.map(lambda x: float(x[11])).reduce(lambda a, b: max(a,b))
print(maximum_latitude)

54.47604295


In [34]:
maximum_longitude = sampled_rdd_list.map(lambda x: float(x[12])).reduce(lambda a, b: max(a,b))
print(maximum_longitude)

153.03508445


In [46]:
tweet_text = sampled_rdd_list.map(lambda x: x[10])
print tweet_text

Hoy no ests y tengo mil palabras, todas para vos


In [49]:
tweet_in_characters = tweet_text.map(lambda x: (len(x)))
average_tweet_in_characters = tweet_in_characters.mean()
print(average_tweet_in_characters)

87.1632636508
48


In [52]:
tweet_in_words = tweet_text.map(lambda x: len(x.split(' ')))
average_tweet_in_words = tweet_in_words.mean()
print(average_tweet_in_words)

12.2201010213


### Combining results to RDD and writes to file

In [65]:
results = sc.parallelize([number_of_tweets, number_of_users,\
                        number_of_countries, number_of_places,\
                        number_of_languages, minimum_latitude,\
                        minimum_longitude, maximum_latitude, maximum_longitude,\
                        average_tweet_in_characters, average_tweet_in_words])
results = results.coalesce(1)
results_tsv = results.saveAsTextFile('results/result_1.tsv')