In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
# Spark master URL ("local[4]" = local mode with 4 worker threads) and the
# application name used for this notebook.
master = "local[4]"
appName = "task1"
conf = (
    SparkConf()
    .setMaster(master)
    .setAppName(appName)
)

In [3]:
# getOrCreate reuses an already-running SparkContext instead of raising when
# this cell is executed more than once in the same kernel; conf is only
# applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Limit Spark's log output to messages of level WARN and above.
sc.setLogLevel("WARN")

## Creating RDD

In [5]:
# Load the input file as an RDD of raw text lines (one element per line).
rdd = sc.textFile("data/geotweets.tsv")

### Creating a sample RDD for testing

In [6]:
#sampled_rdd = rdd.sample(False, 0.1, 5)

### Splitting each line on tabs into a list of fields

In [7]:
# Split every line into its tab-separated fields. The result is cached because
# the cells below run many separate actions on rdd_list; without caching, each
# action would re-read and re-split the whole input file.
rdd_list = rdd.map(lambda line: line.split('\t')).cache()

In [8]:
#sampled_rdd_list = sampled_rdd.map(lambda x: x.split('\t'))

In [9]:
# Every line of the input file is counted as one tweet.
number_of_tweets = rdd.count()
# print() call form, consistent with the other cells and valid on Python 3.
print(number_of_tweets)

2715066


In [10]:
# Number of distinct values in field 6 (treated here as the user).
user_column = rdd_list.map(lambda fields: fields[6])
number_of_users = user_column.distinct().count()
print(number_of_users)

499822


In [11]:
# Number of distinct values in field 1 (treated here as the country).
country_column = rdd_list.map(lambda fields: fields[1])
number_of_countries = country_column.distinct().count()
print(number_of_countries)

70


In [12]:
# Number of distinct values in field 4 (treated here as the place).
place_column = rdd_list.map(lambda fields: fields[4])
number_of_places = place_column.distinct().count()
print(number_of_places)

23121


In [13]:
# Number of distinct values in field 5 (treated here as the language).
language_column = rdd_list.map(lambda fields: fields[5])
number_of_languages = language_column.distinct().count()
print(number_of_languages)

46


In [14]:
# Smallest value of field 11, parsed as a float (treated here as latitude).
minimum_latitude = rdd_list.map(lambda fields: float(fields[11])).min()
print(minimum_latitude)

-54.87555556


In [15]:
# Smallest value of field 12, parsed as a float (treated here as longitude).
minimum_longitude = rdd_list.map(lambda fields: float(fields[12])).min()
print(minimum_longitude)

-159.83019441


In [16]:
# Largest value of field 11, parsed as a float (treated here as latitude).
maximum_latitude = rdd_list.map(lambda fields: float(fields[11])).max()
print(maximum_latitude)

69.83186826


In [17]:
# Largest value of field 12, parsed as a float (treated here as longitude).
maximum_longitude = rdd_list.map(lambda fields: float(fields[12])).max()
print(maximum_longitude)

153.03508445


In [18]:
# Field 10 of every record (treated as the tweet text by the cells below).
tweet_text = rdd_list.map(lambda fields: fields[10])
# print() call form, consistent with the other cells and valid on Python 3.
# Printing an RDD only shows its description, not its contents.
print(tweet_text)

PythonRDD[27] at RDD at PythonRDD.scala:48


In [19]:
# Length of every tweet text, then the mean of those lengths.
tweet_in_characters = tweet_text.map(len)
average_tweet_in_characters = tweet_in_characters.mean()
print(average_tweet_in_characters)

87.2014098368


In [20]:
# Number of space-separated tokens per tweet, then the mean of those counts.
# NOTE: split(' ') splits on single spaces only, so runs of consecutive spaces
# yield empty strings that are counted as words too.
tweet_in_words = tweet_text.map(lambda text: len(text.split(' ')))
average_tweet_in_words = tweet_in_words.mean()
print(average_tweet_in_words)

12.2284228081


### Combining the results into an RDD and writing them to a file

In [21]:
# The order of this list is the order of the lines in the written output.
summary_values = [
    number_of_tweets,
    number_of_users,
    number_of_countries,
    number_of_places,
    number_of_languages,
    minimum_latitude,
    minimum_longitude,
    maximum_latitude,
    maximum_longitude,
    average_tweet_in_characters,
    average_tweet_in_words,
]
# Collapse to a single partition before saving.
results = sc.parallelize(summary_values).coalesce(1)
# NOTE(review): saveAsTextFile appears to return nothing, so results_tsv is
# presumably None; the name is kept only so existing references still work.
results_tsv = results.saveAsTextFile('results/result_1.tsv')