In [1]:
from pyspark import SparkContext
from pyspark import SparkConf


In [3]:
# create Spark context with Spark configuration
conf = SparkConf().setAppName("read text file in pysparkf")
sc = SparkContext(conf=conf)


### Reading text File

In [5]:
# Read file into RDD
lines = sc.textFile("/home/hasan/DATA SET/about bangladesh.txt")


In [6]:
lines.take(2)

["Bangladesh is a country in South Asia. It is the eighth-most populous country in the world, with a population exceeding 161 million people. In terms of land mass, Bangladesh ranks 92nd, spanning 147,570 square kilometres (56,980 sq mi), making it one of the most densely-populated countries in the world. Bangladesh shares land borders with India to the west, north, and east, Myanmar to the southeast, and the Bay of Bengal to the south. It is narrowly separated from Nepal and Bhutan by India's Siliguri Corridor, and from China by the Indian state of Sikkim, in the north, respectively. Dhaka, the capital and largest city, is the nation's economic, political and cultural hub. Chittagong, the largest sea port, is the second largest city. With numerous criss-crossing rivers and inland waterways, the dominant geographic feature of Bangladesh is the Ganges delta, which empties into the Bay of Bengal with the combined waters of several river systems, including the Brahmaputra river and the Ga

In [7]:
lines.take(4)

["Bangladesh is a country in South Asia. It is the eighth-most populous country in the world, with a population exceeding 161 million people. In terms of land mass, Bangladesh ranks 92nd, spanning 147,570 square kilometres (56,980 sq mi), making it one of the most densely-populated countries in the world. Bangladesh shares land borders with India to the west, north, and east, Myanmar to the southeast, and the Bay of Bengal to the south. It is narrowly separated from Nepal and Bhutan by India's Siliguri Corridor, and from China by the Indian state of Sikkim, in the north, respectively. Dhaka, the capital and largest city, is the nation's economic, political and cultural hub. Chittagong, the largest sea port, is the second largest city. With numerous criss-crossing rivers and inland waterways, the dominant geographic feature of Bangladesh is the Ganges delta, which empties into the Bay of Bengal with the combined waters of several river systems, including the Brahmaputra river and the Ga

### tokenizing to word

In [8]:
def lowe_and_word(text):
    lines = text.lower()
    lines = lines.split()
    return lines



In [9]:
#use of map
word_text = lines.map(lowe_and_word)
word_text.take(3)

[['bangladesh',
  'is',
  'a',
  'country',
  'in',
  'south',
  'asia.',
  'it',
  'is',
  'the',
  'eighth-most',
  'populous',
  'country',
  'in',
  'the',
  'world,',
  'with',
  'a',
  'population',
  'exceeding',
  '161',
  'million',
  'people.',
  'in',
  'terms',
  'of',
  'land',
  'mass,',
  'bangladesh',
  'ranks',
  '92nd,',
  'spanning',
  '147,570',
  'square',
  'kilometres',
  '(56,980',
  'sq',
  'mi),',
  'making',
  'it',
  'one',
  'of',
  'the',
  'most',
  'densely-populated',
  'countries',
  'in',
  'the',
  'world.',
  'bangladesh',
  'shares',
  'land',
  'borders',
  'with',
  'india',
  'to',
  'the',
  'west,',
  'north,',
  'and',
  'east,',
  'myanmar',
  'to',
  'the',
  'southeast,',
  'and',
  'the',
  'bay',
  'of',
  'bengal',
  'to',
  'the',
  'south.',
  'it',
  'is',
  'narrowly',
  'separated',
  'from',
  'nepal',
  'and',
  'bhutan',
  'by',
  "india's",
  'siliguri',
  'corridor,',
  'and',
  'from',
  'china',
  'by',
  'the',
  'indian',


In [10]:
# flatting every word
word_text = lines.flatMap(lowe_and_word)
word_text.take(10)

['bangladesh', 'is', 'a', 'country', 'in', 'south', 'asia.', 'it', 'is', 'the']

### Removing stopword

In [11]:
#stopwords
stopwords = ['how', 'a', 'in', 'is', 'he', 'who', 'I', 'there', 'where', 'with', 'the', 'of', 'whos']

In [12]:
without_stopwords = word_text.filter(lambda x: x not in stopwords)
without_stopwords.take(10)


['bangladesh',
 'country',
 'south',
 'asia.',
 'it',
 'eighth-most',
 'populous',
 'country',
 'world,',
 'population']

### Those words starts with b character

In [13]:
# starts with b character
starts_with_b = word_text.filter(lambda x: x.startswith('b'))
starts_with_b.take(10)

['bangladesh',
 'bangladesh',
 'bangladesh',
 'borders',
 'bay',
 'bengal',
 'bhutan',
 'by',
 'by',
 'bangladesh']

In [14]:
# unique words
starts_with_b.distinct().take(10)

['bangladesh',
 'bhutan',
 'brahmaputra',
 'biodiversity',
 'beach',
 'bazar',
 'bengal,',
 'but',
 'briefly',
 'boundary']

### Counting every word how many times appear

In [26]:
word_mapp = without_stopwords.map(lambda x: (x,1))
word_grouped = word_mapp.groupByKey()
word_frequency = word_grouped.mapValues(sum).map(lambda x: (x[1], x[0])).sortByKey(False)
word_frequency.take(10)


[(24, 'and'),
 (8, 'to'),
 (8, 'bengal'),
 (7, 'bangladesh'),
 (7, 'was'),
 (5, 'by'),
 (4, 'as'),
 (4, 'it'),
 (4, 'largest'),
 (3, 'state')]

In [30]:
# total number of words, before removing stopwords
word_text.count()


521

In [31]:
# after removing stopwords, total number of unique word
word_frequency.count()


276

In [34]:
#toal unique word, with stopwords
rdd2 = word_text.distinct()
rdd2.count()


282

### Those words have same first two character

In [39]:
rdd3 = rdd2.groupBy(lambda w: w[0:2])
print([(k, list(v)) for (k,v) in rdd3.take(5)])

[('ba', ['bangladesh', 'bazar', 'bay', 'bakhtiyar', 'battle']), ('is', ['is', 'islam']), ('co', ['country', 'countries', "country's", "cox's", 'contacts', 'conquered', 'corridor,', 'combined', 'cover', 'comprises', 'conquest', 'company']), ('in', ['in', "india's", 'inland', 'into', 'introduced', 'india', 'indian', 'including', 'interrupted', 'india.', 'independent']), ('as', ['asia.', 'as', 'ashoka.'])]


### Taking 10% data

In [47]:
#not stopwords
without_stopwords.count()

381

In [48]:
ample_rdd = without_stopwords.sample(False, 0.1)
ample_rdd.count()

43

In [50]:
ample_rdd.take(5)

['terms', '92nd,', '147,570', 'kilometres', '(56,980']

### Create RDD

In [51]:
#create RDD
a = sc.parallelize([('a', 2), ('b', 3)])
b = sc.parallelize([('a', 8), ('b', 6), ('x', 4)])

In [58]:
# joining RDD
c = a.join(b)
c.collect()

[('a', (2, 8)), ('b', (3, 6))]

### another RDD

In [59]:
num_rdd = sc.parallelize(range(1, 50000))
num_rdd.take(20)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [61]:
#adding every number
add_rdd = num_rdd.reduce(lambda x,y: x+y)
add_rdd


1249975000

### creating RDD

In [67]:
data_keydata = sc.parallelize([('a', 3), ('b', 7), ('a', 4), ('e', 9), ('e', 7)])
data_keydata.reduceByKey(lambda x, y: x+y).collect()


[('b', 7), ('a', 7), ('e', 16)]

### Saving text file in RDD

### creating new RDD

In [79]:
test = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]

In [83]:
# sorting 
sc.parallelize(test).sortByKey(True, 1).collect()

[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]

##### union operation

In [84]:
a = sc.parallelize([1,2,3,4,5])
b = sc.parallelize([2,4,6,8])

c = a.union(b)
c.collect()

[1, 2, 3, 4, 5, 2, 4, 6, 8]

##### use of mapPartitionsWithIndex

In [93]:
#map partition with index
mpwi = sc.parallelize([1,2,3,4],6)
def function_name(split_index, iterator): yield split_index

mpwi.mapPartitionsWithIndex(function_name).sum()

15

##### use of intersection

In [95]:
rdd_a = sc.parallelize([2,4,6,1,3,5])
rdd_b = sc.parallelize([1,3,5,2,4,6])
rdd_a.intersection(rdd_b).collect()


[1, 2, 3, 4, 5, 6]

##### use of subtract

In [101]:
rdd_a = sc.parallelize([1,2,3,4])
rdd_b = sc.parallelize([3,4,5,6])
rdd_a.subtract(rdd_b).collect()


[1, 2]

##### use of cartesian

In [102]:
rdd_a = sc.parallelize([1,2,3,4])
rdd_b = sc.parallelize([3,4,5,6])
rdd_a.cartesian(rdd_b).collect()

[(1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6),
 (4, 3),
 (4, 4),
 (4, 5),
 (4, 6)]

### Page Rank Algorithm

In [128]:
page_links = [['a', ['b','c','d']],
              ['c', ['b']],
              ['b', ['d','c']],
              ['d', ['a','c']]
             ]

page_rank = [['a', 1], ['c', 1], ['b', 1], ['d', 1]]


In [129]:
def rankContribution(uris, rank):
    numberOUris = len(uris)
    rankContribution = float(rank) / numberOfUris
    newrank = []
    for uri in uris:
        newrank.append((uri, rankContribution))
        return newrank
    

In [130]:
pageLinkRDD = sc.parallelize(page_links, 2)
pageLinkRDD.collect()


[['a', ['b', 'c', 'd']], ['c', ['b']], ['b', ['d', 'c']], ['d', ['a', 'c']]]

In [131]:
pageRankRDD = sc.parallelize(page_rank, 2)
pageRankRDD.collect()


[['a', 1], ['c', 1], ['b', 1], ['d', 1]]

In [132]:
numIter = 20
s = 0.85

In [133]:
for i in range(numIter):
    linksRank = pageLinkRDD.join(pageRankRDD)
    contributedRDD = linksRank.flatMap(lambda x: rankContribution(x[1][0], x[1][1]))
    sumRanks = contributedRDD.reduceByKey(lambda v1,v2: v1+v2)
    pageRankRDD = sumRanks.map(lambda x: (x[0],(1-s)+s*x[1]))
    

In [None]:
pageRankRDD.collect()


### Reading csv File

In [10]:
fifa = (sc.textFile('/home/hasan/DATA SET/fifa-world-cup/WorldCupPlayers.csv', minPartitions=4).map(lambda element: element.split('\t')))


In [11]:
fifa.take(3)

[['RoundID,MatchID,Team Initials,Coach Name,Line-up,Shirt Number,Player Name,Position,Event'],
 ['201,1096,FRA,CAUDRON Raoul (FRA),S,0,Alex THEPOT,GK,'],
 ['201,1096,MEX,LUQUE Juan (MEX),S,0,Oscar BONFIGLIO,GK,']]

In [12]:
#printing number of partitions
fifa.getNumPartitions()

4

In [13]:
fifa.count()

37785

[['RoundID,MatchID,Team Initials,Coach Name,Line-up,Shirt Number,Player Name,Position,Event'],
 ['201,1096,FRA,CAUDRON Raoul (FRA),S,0,Alex THEPOT,GK,'],
 ['201,1096,MEX,LUQUE Juan (MEX),S,0,Oscar BONFIGLIO,GK,'],
 ["201,1096,FRA,CAUDRON Raoul (FRA),S,0,Marcel LANGILLER,,G40'"]]