In [1]:
def parse_line(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (stationID, entryType, temperature)



In [2]:
import findspark
findspark.init()

In [3]:
from pyspark import SparkContext
sc = SparkContext("local", "Simple App")

In [None]:
lines = sc.textFile('./1800.csv')

In [None]:
parsed_lines = lines.map(parse_line)

In [None]:
parsed_lines

In [None]:
min_temps = parsed_lines.filter(lambda x: "TMIN" in x[1])


In [None]:
stations_temps = min_temps.map(lambda x: (x[0], x[2]))

In [None]:
min_temp = stations_temps.reduceByKey(lambda x, y: min(x, y))

In [None]:
results = min_temp.collect()

In [None]:
for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

# Book


In [None]:
book_lines = sc.textFile('./book.txt')
words = book_lines.flatMap(lambda x: x.split())
word_counts = words.countByValue()

for word, count in word_counts.items():
    clean_word = word.encode('ascii', 'ignore')
    if (clean_word):
        print(clean_word, count)

# Improving word count

In [5]:
import re

def normalize_words(text):
    return re.compile(r'\w+', re.UNICODE).split(text.lower())

data = sc.textFile('./book.txt')
words = data.flatMap(normalize_words)
word_counts = words.countByValue()

In [None]:


for word, count in word_counts.items():
    clean_word = word.encode('ascii', 'ignore')
    if (clean_word):
        print(clean_word, count)

# Sorting the count word

In [9]:
sort_data = sc.textFile('./book.txt')
words = sort_data.flatMap(normalize_words)

word_counts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)


In [12]:
word_count_sorted = word_counts.map(lambda x: (x[1], x[0])).sortByKey()
results = word_count_sorted.collect()


In [15]:
for result in results:
    count = str(result[0])
    word = result[1].encode('ascii', 'ignore')
    if (word):
        print(word.decode() + ":\t\t" + count)

  :		1
  :		1
%. :		1
.:		1
 (:		1
,) :		1
.  (:		1
".) :		1
.  :		1
 (:		1
."  :		1
!) :		1
 .:		1
* :		1
&:		1
.":		1
"?:		1
***:		1
* * *:		1
@:		1
++ :		1
# :		1
.   :		1
: :		1
..." :		1
: ":		1
).  :		1
 (":		1
 #:		1
> :		1
>, :		1
); :		1
...:		1
?, :		1
% ":		2
%? :		2
(:		2
. ":		2
:  :		2
.):		2
":		2
., :		2
", :		2
"  :		2
 & :		2
? :		2
: $:		2
?) :		2
.? :		2
) -- :		2
):		2
  :		2
" :		2
") :		2
 <:		2
>, <:		2
---------------:		2
------------------------------------------------------------:		2
. :		3
.) :		3
". :		3
. (:		4
," :		4
, :		4
?" :		5
, :		5
, ":		6
,) :		6
). :		7
' :		8
   :		9
://:		9
), :		10
." :		10
!:		14
::		15
* :		18
 :		18
) :		19
?:		23
,:		25
! :		29
/:		29
 $:		33
: :		36
% :		41
.  :		43
" :		47
 (:		50
; :		56
 ":		68
 :		101
  :		162
? :		189
-:		377
.:		580
':		988
. :		1459
, :		2039
 :		40722


# Movie Data

# Use Broadcast Vars

In [None]:
def load_movie_names():
    movie_names = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movie_names[int(fields[0])] = fields[1]
    return movie_names

sc = SparkContext("local", "Simple App2")

name_dict = sc.broadcast(load_movie_names())

lines = sc.textFile('./ml-100k/u.data')
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movie_counts = movies.reduceByKey(lambda x, y: x + y)

flipped = movie_counts.map(lambda x: (x[0], x[1]))
sorted_movies = flipped.sortByKey()

# Hero Relationship

In [17]:
def count_cooccurences(line):
    elements = line.split()
    return (int(elements[0]), len(elements) - 1)

def parse_names(line):
    fields = line.split('\"')
    return (int(fields[0]), fields[1].encode("utf8"))

names = sc.textFile('./marvel-names.txt')
names_rdd = names.map(parse_names)

lines = sc.textFile('./marvel-graph.txt')

pairings = lines.map(count_cooccurences)
total_friends_by_character = pairings.reduceByKey(lambda x, y: x + y)
flipped = total_friends_by_character.map(lambda x: (x[0], x[1]))

most_popular = flipped.max()

most_popular_name = str(names_rdd.lookup(most_popular[1])[0])


print(most_popular_name + " is the most popular superhero, with " + \
     str(most_popular[0]) + " co-appearances.")

b'4-D MAN/MERCURIO' is the most popular superhero, with 6486 co-appearances.


In [18]:
most_popular

(6486, 3)

# Superhero degrees of separation

In [None]:
def convet_to_bfs(line):
    fields = line.split()
    hero_id = int(fields[0])
    connections = []
    for connection in fields[1:]:
        connections.append(int(connection))
        
    coler = 'WHITE'
    distance = 9999
    
    if (hero_id == start_character_id):
        color = 'GRAY'
    distance = 0
    
    return (hero_id, (connections, distance, color))

# Item-Based 

In [None]:
conf = SparkConf().setMaster("local[*]").etAppName("MovieSimilarities")

In [None]:
import sys


- Discard bad ratings - only recommend good movies
- Try differen similarity metrics (Pearson Correlation Coefficient, Jaccard Coefficient, Conditional Probability)
- Adjust the thresholds for minimum co-raters of minimum score
- Invent a new similarity metric that takes the number of co-raters into account
- Use genre information in u.items to boost scores from movies in the same genre

# Elastic MapReduce