# Step 1 (Basic instrucitons)

## Environment Setup

### Install requirements

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

### Set environment variables

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

### Import libraries

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [4]:
import re

### Get spark context

In [5]:
sc = spark.sparkContext

## Part 1

### Read input file


In [7]:
input_file_path = "/content/input.txt"
input_file = sc.textFile(input_file_path)
print(input_file.first())

Games are a fun way to get people involved and learning in a happy environment and get them to work on concepts and tactics without them knowing it a lot of the time. Because of this, these games were perfect in a class on negotiation and persuasion because it loosened people up and allowed them to learn in a fun environment. The games used in this class were reinforcing the concepts we talked about and got familiar with in the lectures, but the games were a safe place where we could give them a spin, test drive these persuasion tactics on our peers. With this we were able to make the connection between theory and application of concepts which have no use on paper. This class was about learning how to use language and framing to the extent where you make people think they want what you want, get to identify with your subject and get them on your level before you persuade them to act, and such concepts are great in theory but the application of them takes some practice and with mastery,

### Count number of words

In [8]:
wordsRDD = input_file.flatMap(lambda line: line.split(" "))
print("First 5 words:")
print(wordsRDD.take(5))
words_count = wordsRDD.count()
print("Words count: {}".format(words_count))

First 5 words:
['Games', 'are', 'a', 'fun', 'way']
Words count: 5043


### Count each word repeat and save output in `words_count.txt`

In [None]:
print("Words repeat count:")
words_repeation = wordsRDD.countByValue()
print(words_repeation)
output_file_name = "words_count.txt"
with open(output_file_name, "w") as output_file:
  for word, count in words_repeation.items():
    new_line = f"{word}: {count}\n"
    output_file.write(new_line)
  output_file.close()

Words repeat count:
defaultdict(<class 'int'>, {'Games': 1, 'are': 31, 'a': 106, 'fun': 2, 'way': 16, 'to': 151, 'get': 18, 'people': 51, 'involved': 1, 'and': 237, 'learning': 3, 'in': 115, 'happy': 1, 'environment': 1, 'them': 31, 'work': 2, 'on': 38, 'concepts': 4, 'tactics': 5, 'without': 3, 'knowing': 5, 'it': 44, 'lot': 2, 'of': 162, 'the': 312, 'time.': 5, 'Because': 2, 'this,': 1, 'these': 14, 'games': 9, 'were': 28, 'perfect': 2, 'class': 5, 'negotiation': 2, 'persuasion': 4, 'because': 17, 'loosened': 1, 'up': 18, 'allowed': 3, 'learn': 1, 'environment.': 1, 'The': 23, 'used': 6, 'this': 43, 'reinforcing': 1, 'we': 6, 'talked': 1, 'about': 17, 'got': 2, 'familiar': 1, 'with': 36, 'lectures,': 1, 'but': 19, 'safe': 4, 'place': 11, 'where': 12, 'could': 15, 'give': 4, 'spin,': 1, 'test': 1, 'drive': 2, 'our': 1, 'peers.': 1, 'With': 1, 'able': 1, 'make': 6, 'connection': 1, 'between': 2, 'theory': 3, 'application': 2, 'which': 19, 'have': 38, 'no': 5, 'use': 11, 'paper.': 1, 'T

### Remove punctuation marks and repeat actions

In [9]:
clean_wordsRDD = (input_file
                  .flatMap(lambda line: re.split('[ ?!.,:;\t()]', line))
                  .filter(lambda word: word != ''))
print("First 5 words:")
print(clean_wordsRDD.take(5))
clean_words_count = clean_wordsRDD.count()
print("Words count: {}".format(clean_words_count))

First 5 words:
['Games', 'are', 'a', 'fun', 'way']
Words count: 5007


In [None]:
print("Words repeat count:")
clean_words_repeation = clean_wordsRDD.countByValue()
print(clean_words_repeation)
output_file_name = "clean_words_count.txt"
with open(output_file_name, "w") as output_file:
  for word, count in clean_words_repeation.items():
    new_line = f"{word}: {count}\n"
    output_file.write(new_line)
  output_file.close()

Words repeat count:
defaultdict(<class 'int'>, {'Games': 1, 'are': 32, 'a': 106, 'fun': 2, 'way': 17, 'to': 152, 'get': 18, 'people': 57, 'involved': 1, 'and': 237, 'learning': 3, 'in': 116, 'happy': 1, 'environment': 2, 'them': 33, 'work': 3, 'on': 40, 'concepts': 4, 'tactics': 5, 'without': 3, 'knowing': 5, 'it': 52, 'lot': 2, 'of': 162, 'the': 312, 'time': 23, 'Because': 2, 'this': 44, 'these': 14, 'games': 15, 'were': 29, 'perfect': 2, 'class': 7, 'negotiation': 2, 'persuasion': 4, 'because': 18, 'loosened': 1, 'up': 23, 'allowed': 3, 'learn': 1, 'The': 24, 'used': 6, 'reinforcing': 1, 'we': 6, 'talked': 1, 'about': 20, 'got': 2, 'familiar': 1, 'with': 38, 'lectures': 1, 'but': 19, 'safe': 4, 'place': 11, 'where': 12, 'could': 15, 'give': 4, 'spin': 1, 'test': 1, 'drive': 2, 'our': 1, 'peers': 1, 'With': 1, 'able': 1, 'make': 6, 'connection': 1, 'between': 2, 'theory': 3, 'application': 2, 'which': 19, 'have': 39, 'no': 5, 'use': 11, 'paper': 2, 'This': 30, 'was': 41, 'how': 8, 'la

## Part 2

### Find words count that start with "m/M"

In [None]:
words_with_m = clean_wordsRDD.filter(lambda word: word[0] in ["m", "M"])
print("First 5 words:")
print(words_with_m.take(5))
words_with_m_count = words_with_m.count()
print("Words starts with m count: {}".format(words_with_m_count))

First 5 words:
['make', 'make', 'mastery', 'many', 'me']
Words starts with m count: 150


## Part 3

### Count words with 5 chars

In [11]:
five_char_words = clean_wordsRDD.filter(lambda word: len(word) == 5)
print("First 5 words:")
print(five_char_words.take(5))
five_char_words_count = five_char_words.count()
print("Five char words: {}".format(five_char_words_count))

First 5 words:
['Games', 'happy', 'these', 'games', 'class']
Five char words: 581


### Remove words starts with vowels

In [12]:
vowels = ["a", "u", "i", "o", "u"]
reduced_five_words = five_char_words.filter(lambda word: word.lower()[0] not in vowels)
print("First 5 words:")
print(reduced_five_words.take(5))
reduced_five_words_count = reduced_five_words.count()
print("Five char words without vowel start: {}".format(reduced_five_words_count))
print("Sorted result:")
print(reduced_five_words.takeOrdered(reduced_five_words_count))

First 5 words:
['Games', 'happy', 'these', 'games', 'class']
Five char words without vowel start: 507
Sorted result:
['Buddy', 'Buddy', 'Buddy', 'Buddy', 'Buddy', 'Court', 'Cross', 'Final', 'Games', 'Given', 'Greek', 'Hooks', 'Lying', 'Lying', 'Lying', 'Lying', 'Magog', 'Monet', 'Named', 'Pablo', 'Pauls', 'Roman', 'Roman', 'Since', 'There', 'There', 'There', 'There', 'There', 'There', 'There', 'These', 'These', 'These', 'These', 'These', 'These', 'Tower', 'WebGL', 'based', 'based', 'based', 'based', 'basis', 'began', 'began', 'began', 'being', 'being', 'being', 'being', 'being', 'being', 'being', 'bench', 'blend', 'bonus', 'break', 'break', 'brief', 'build', 'build', 'built', 'built', 'built', 'built', 'cafés', 'cafés', 'cafés', 'cafés', 'calss', 'can’t', 'cards', 'catch', 'class', 'class', 'class', 'class', 'class', 'class', 'class', 'comes', 'comes', 'comic', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 'could', 

## Part 4

### Find stop words

In [None]:
sorted_repeation = sorted(clean_words_repeation.items(), key=lambda item: item[1], reverse=True)
ten_percent_count = len(sorted_repeation) // 10
stop_words_with_count = sorted_repeation[:ten_percent_count]
stop_words = [stop_word for stop_word, _ in stop_words_with_count]
print("First 10 stop words:")
print(stop_words[:10])

First 10 stop words:
['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'people', 'it']


### Remove stopwords and non alpha numberic chars

In [None]:
!rm -r removed_stop_words
def clean_line(line):
  cleaned_line = ""
  for word in line.split():
    if word not in stop_words:
      cleaned_line += word
      cleaned_line += " "
  cleaned_line = re.sub("[^0-9a-zA-Z]+", " ", cleaned_line)
  return cleaned_line

clean_lines = (input_file.flatMap(lambda line: line.split("."))
               .map(clean_line))
clean_lines.collect()
removed_stop_words_file_name = "removed_stop_words"
clean_lines.coalesce(1).saveAsTextFile(removed_stop_words_file_name)

## Part 5

### Find bigrams

In [None]:
bigrams = input_file.flatMap(lambda line: line.split(".")) \
                   .map(lambda line: line.strip().split(" ")) \
                   .flatMap(lambda xs: (tuple(x) for x in zip(xs, xs[1:])))
print("First five bigrams:")
print(bigrams.take(5))

First five bigrams:
[('Games', 'are'), ('are', 'a'), ('a', 'fun'), ('fun', 'way'), ('way', 'to')]


### Calculate frequencies and sort them

In [None]:
bigrams_frequencies = bigrams.countByValue()
sorted_bigram_frequencies = sorted(bigrams_frequencies.items(), key=lambda item: item[1], reverse=True)
print("Bigrams: (Sorted by frequency)")
for word, repeat in sorted_bigram_frequencies:
  print(f"{word}: {repeat}")

Bigrams: (Sorted by frequency)
('of', 'the'): 56
('in', 'the'): 48
('and', 'the'): 15
('all', 'of'): 12
('on', 'the'): 12
('to', 'the'): 11
('people', 'to'): 10
('of', 'a'): 10
('to', 'get'): 9
('in', 'a'): 9
('of', 'this'): 9
('with', 'the'): 9
('it', 'is'): 9
('Soho', 'Square'): 9
('This', 'is'): 8
('the', 'square'): 8
('them', 'to'): 7
('is', 'the'): 7
('this', 'is'): 7
('one', 'of'): 7
('the', 'way'): 7
('for', 'the'): 7
('have', 'been'): 7
('in', 'this'): 6
('will', 'be'): 6
('have', 'a'): 6
('is', 'a'): 6
('that', 'they'): 6
('the', 'game'): 6
('and', 'then'): 6
('it', 'was'): 6
('as', 'a'): 6
('gender', 'and'): 6
('in', 'London'): 6
('the', 'games'): 5
('of', 'them'): 5
('some', 'of'): 5
('want', 'to'): 5
('me', 'to'): 5
('at', 'the'): 5
('that', 'the'): 5
('In', 'the'): 5
('to', 'think'): 5
('you', 'have'): 5
('is', 'also'): 5
('I', 'was'): 5
('they', 'were'): 5
('can', 'be'): 5
('to', 'be'): 5
('the', 'film'): 5
('way', 'that'): 5
('on', 'a'): 5
('and', 'this'): 5
('the', 'peo