In [1]:
#initial imports
import csv
import pandas as pd
import numpy as np

import pyspark as ps    # for the pyspark suite

In [2]:
spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("df JulesVerne") \
            .getOrCreate()

sc = spark.sparkContext

In [3]:
df = pd.read_csv('data/reviews.csv')

In [4]:
df.reviews = df.reviews.astype(str)
df.attractions = df.attractions.astype(str)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001171 entries, 0 to 1001170
Data columns (total 2 columns):
attractions    1001171 non-null object
reviews        1001171 non-null object
dtypes: object(2)
memory usage: 15.3+ MB


In [6]:
ls_attr = df.attractions.unique()

In [7]:
len(ls_attr)

1879

In [8]:
reviews = []
for x in ls_attr:
    _df = df[df.attractions == x]
    reviews.append(' '.join(_df.reviews))

In [9]:
len(reviews)

1879

In [10]:
reviews[0]

'Apart from the mass tourist population, this is one of the great attractions of Australia. Well worth going to see even if you are a domestic tourist! Iconic sightseeing of the harbour area including harbour bridge and opera house. If in Sydney you have to see it. Beautiful harbour. Good to go at daytime or night. Next to the Rocks, Circular Quay, CBD. Good spot to start a small trip in the Rocks area. We took a tour with Grayline which included lunch while sailing around the harbour. We saw so many different sailing ships and they all were very busy. There was even a boat that had two front ends so that they did not have to back...More Visit the Harbor multiple times to get great and different views and photos of the Opera House and Bridge at varying times of day and evening. I suggest booking accommodations in/nearby to Sydney Harbor designated as Circular Quay to maximize your time, logistics and access...More This area, called Circular Quay, was absolutely the BEST way to be greet

In [10]:
r_test = reviews[0:len(reviews):180]

In [11]:
a_test = ls_attr[0:len(ls_attr):180]

In [12]:
df_test = pd.DataFrame({'attractions': a_test, 'reviews': r_test})

In [13]:
df_test

Unnamed: 0,attractions,reviews
0,Sydney_Harbour-Sydney_New_South_Wales,"Apart from the mass tourist population, this i..."
1,Batu_Eco_Green_Park_Fun_Study-Batu_East_Java_Java,Jawa Timur Park 2 has 3 theme parks with diff...
2,Changdeokgung_Palace-Seoul,"If you are going to this palace, make sure you..."
3,Gede_Ruins-Gede_Coast_Province,"Our tour guide Andrew did an exemplary job, g..."
4,Lake_Chala-Moshi_Kilimanjaro_Region,"Its a 27000Tsh each, paying at newly built go..."
5,Fortress_of_Louisbourg_National_Historic_Site-...,This site is full of things to do and really ...
6,Little_French_Key-Roatan_Bay_Islands,LFK is truly a must visit if you are ever in ...
7,Borovets_Ski_Resort-Borovets_Sofia_Region,As a novice I was keen to learn to ski somewhe...
8,Le_Grand_Defi-Saint_Julien_des_Landes_Vendee_P...,"Well set up park, nice and helpful staff memb..."
9,Nidarosdomen-Trondheim_Trondheim_Municipality_...,"We have been to Notre Dame, the Blue Mosque a..."


In [32]:
a_test[9]

'Nidarosdomen-Trondheim_Trondheim_Municipality_Sor_Trondelag_Central_Norway'

In [33]:
r_test[9]

' We have been to Notre Dame, the Blue Mosque and have never been asked not to take photographs! We... read more   The front of the building is stunning, much more interesting than on the inside. Definitely worth... read more  We have been to Notre Dame, the Blue Mosque and have never been asked not to take photographs! We were not allowed to climb the tower or go down into the crypt! So, to walk into the Cathedral it cost us K90 each plus K50...More The front of the building is stunning, much more interesting than on the inside. Definitely worth checking out the outside of the building (and that\'s free) but I wouldn\'t be rushing to buy a ticket for the inside. You can apparently have a tour of...More Considering that Oslo is the capital of Norway, it might be somewhat unusual then to find that the country\'s most significant cathedral is found almost five hundred kilometres away to the north. But when one considers that the history of Trondheim extends back at least...More Together 

In [14]:
spark_df_test = spark.createDataFrame(df_test)

In [15]:
spark_df_test.show()

+--------------------+--------------------+
|         attractions|             reviews|
+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|
|Batu_Eco_Green_Pa...| Jawa Timur Park ...|
|Changdeokgung_Pal...|If you are going ...|
|Gede_Ruins-Gede_C...| Our tour guide A...|
|Lake_Chala-Moshi_...| Its a 27000Tsh e...|
|Fortress_of_Louis...| This site is ful...|
|Little_French_Key...| LFK is truly a m...|
|Borovets_Ski_Reso...|As a novice I was...|
|Le_Grand_Defi-Sai...| Well set up park...|
|Nidarosdomen-Tron...| We have been to ...|
|Lisebergs_Nojespa...| Perfect for kids...|
+--------------------+--------------------+



## Loading Reviews into a Spark Dataframe

In [16]:
spark_df = spark.createDataFrame(df)

In [17]:
spark_df.show()

+--------------------+--------------------+
|         attractions|             reviews|
+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|
|Sydney_Harbour-Sy...|Iconic sightseein...|
|Sydney_Harbour-Sy...|Beautiful harbour...|
|Sydney_Harbour-Sy...|We took a tour wi...|
|Sydney_Harbour-Sy...|Visit the Harbor ...|
|Sydney_Harbour-Sy...|This area, called...|
|Sydney_Harbour-Sy...|enjoy a champagne...|
|Sydney_Harbour-Sy...|went on a holiday...|
|Sydney_Harbour-Sy...|The Sydney Harbou...|
|Sydney_Harbour-Sy...|I have always wan...|
|Sydney_Harbour-Sy...|Always special, l...|
|Sydney_Harbour-Sy...|A must see while ...|
|Sydney_Harbour-Sy...|Don't really know...|
|Sydney_Harbour-Sy...|Loved our visit t...|
|Sydney_Harbour-Sy...|Recommend catchin...|
|Sydney_Harbour-Sy...|I have seen a num...|
|Sydney_Harbour-Sy...|Located by many g...|
|Sydney_Harbour-Sy...|Sydney Harbour is...|
|Sydney_Harbour-Sy...|Fantastic trip. B...|
|Sydney_Harbour-Sy...|So love Sy

## Let's try Sydney, Australia

In [8]:
df_sydney = spark_df.filter(spark_df.attractions == 'Sydney_Harbour-Sydney_New_South_Wales')

In [9]:
from src.nltk_pipe import indexing_pipeline
df_sydney_, ls_sydney = indexing_pipeline(df_sydney)

In [11]:
len(ls_sydney)

781

In [14]:
ls_sydney[:15]

[u'harbour',
 u'sydney',
 u'more',
 u'ferri',
 u'opera',
 u'hous',
 u'bridg',
 u'view',
 u'beauti',
 u'great',
 u'place',
 u'day',
 u'time',
 u'harbor',
 u'cruis']

In [16]:
df_sydney_.printSchema()

root
 |-- attractions: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- bow: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector_tf: vector (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
df_sydney_.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         attractions|             reviews|                 bow|           vector_tf|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|[mass, tourist, p...|(781,[9,51,55,62,...|(781,[9,51,55,62,...|
|Sydney_Harbour-Sy...|Iconic sightseein...|[icon, sightse, h...|(781,[0,1,4,5,6,2...|(781,[0,1,4,5,6,2...|
|Sydney_Harbour-Sy...|Beautiful harbour...|[beauti, harbour,...|(781,[0,8,17,18,2...|(781,[0,8,17,18,2...|
|Sydney_Harbour-Sy...|We took a tour wi...|[tour, graylin, l...|(781,[0,2,15,24,3...|(781,[0,2,15,24,3...|
|Sydney_Harbour-Sy...|Visit the Harbor ...|[harbor, multipl,...|(781,[1,2,4,5,6,7...|(781,[1,2,4,5,6,7...|
|Sydney_Harbour-Sy...|This area, called...|[area, circular, ...|(781,[1,2,11,14,1...|(781,[1,2,11,14,1...|
|Sydney_Harbour-Sy...|enjoy a champag

In [25]:
df_sydney_.select("bow").show(1, truncate=False)

+-------------------------------------------------------------------------+
|bow                                                                      |
+-------------------------------------------------------------------------+
|[mass, tourist, popul, great, attract, australia, worth, domest, tourist]|
+-------------------------------------------------------------------------+
only showing top 1 row



In [26]:
df_sydney_.select("vector_tf").show(1, truncate=False)

+--------------------------------------------+
|vector_tf                                   |
+--------------------------------------------+
|(781,[9,51,55,62,111],[1.0,2.0,1.0,1.0,1.0])|
+--------------------------------------------+
only showing top 1 row



In [27]:
df_sydney_.select("reviews").show(1, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviews                                                                                                                                               |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|Apart from the mass tourist population, this is one of the great attractions of Australia. Well worth going to see even if you are a domestic tourist!|
+------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



## Let's try Angkor Wat, Cambodia

In [17]:
df_angkor = spark_df.filter(spark_df.attractions == 'Angkor_Thom-Siem_Reap_Siem_Reap_Province')

In [18]:
df_angkor_, ls_angkor = indexing_pipeline(df_angkor)

In [20]:
ls_angkor[:15]

[u'templ',
 u'angkor',
 u'more',
 u'thom',
 u'wat',
 u'place',
 u'mani',
 u'time',
 u'day',
 u'guid',
 u'great',
 u'beauti',
 u'bayon',
 u'amaz',
 u'citi']

## Now on the entire dataframe, fingers crossed!

In [31]:
#df_, ls_ = indexing_pipeline(spark_df)
#doing this on AWS, need more cores

In [18]:
from src.nltk_pipe import indexing_pipeline
df_test, ls_test = indexing_pipeline(spark_df_test)

In [20]:
len(ls_test)

2598

In [21]:
df_test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         attractions|             reviews|                 bow|           vector_tf|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|[mass, tourist, p...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Batu_Eco_Green_Pa...| Jawa Timur Park ...|[jawa, timur, par...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Changdeokgung_Pal...|If you are going ...|[palac, sure, int...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Gede_Ruins-Gede_C...| Our tour guide A...|[tour, guid, andr...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Lake_Chala-Moshi_...| Its a 27000Tsh e...|[govt, post, grea...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Fortress_of_Louis...| This site is ful...|[site, full, thin...|(2598,[0,1,2,3,4,...|(2598,[0,1,2,3,4,...|
|Little_French_Key...| LFK is truly a

In [42]:
ls_test[:30]

[u'more',
 u'place',
 u'great',
 u'time',
 u'day',
 u'beauti',
 u'park',
 u'good',
 u'palac',
 u'lot',
 u'tour',
 u'harbour',
 u'nice',
 u'littl',
 u'visit',
 u'sydney',
 u'mani',
 u'garden',
 u'ride',
 u'histori',
 u'year',
 u'cruis',
 u'ski',
 u'guid',
 u'best',
 u'food',
 u'amaz',
 u'famili',
 u'french',
 u'peopl']

In [44]:
df_test.select("vector_tf").show(1, truncate=False).collect()[0]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

AttributeError: 'NoneType' object has no attribute 'collect'

In [45]:
df_test.printSchema()

root
 |-- attractions: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- bow: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector_tf: vector (nullable = true)
 |-- features: vector (nullable = true)



# LDA

In [19]:
from pyspark.ml.clustering import LDA


In [34]:
lda = LDA(k=6, optimizer="em")

In [35]:
model = lda.fit(df_test)

In [36]:
topics = model.describeTopics().collect()

In [37]:
for topic in topics:
    print("- TOPIC {} -".format(topic[0]))
    topickeys = topic[1]
    topicvalues = topic[2]
    for i in range(len(topickeys)):
        print("  - word '{}': {}".format(ls_test[topickeys[i]],topicvalues[i]))
#print("--- [time elapsed: {}]".format(time.time()-t_begin))



- TOPIC 0 -
  - word 'harbour': 0.160317069907
  - word 'sydney': 0.159216800286
  - word 'opera': 0.0787991650295
  - word 'ferri': 0.0568503041956
  - word 'bridg': 0.0436315055284
  - word 'harbor': 0.0311846156121
  - word 'cruis': 0.0156593833621
  - word 'boat': 0.0127254955286
  - word 'icon': 0.0101562058734
  - word 'vivid': 0.0100777851985
- TOPIC 1 -
  - word 'key': 0.0516945932839
  - word 'cruis': 0.0465553044277
  - word 'island': 0.0402645594101
  - word 'french': 0.0316494252223
  - word 'beach': 0.0265121753398
  - word 'ship': 0.0213405040921
  - word 'excurs': 0.0183129351068
  - word 'port': 0.014280921498
  - word 'carniv': 0.0132060302765
  - word 'comment': 0.0126413898878
- TOPIC 2 -
  - word 'amus': 0.0650259477546
  - word 'ride': 0.0389615898323
  - word 'christma': 0.0285273626227
  - word 'sweden': 0.0274373205323
  - word 'market': 0.0240293362138
  - word 'queue': 0.0182361753045
  - word 'theme': 0.0139961868383
  - word 'flower': 0.0112154960632
  - wor

In [33]:
topics

[Row(topic=0, termIndices=[8, 47, 21, 72, 17, 38, 28, 84, 91, 214], termWeights=[0.09025806632271004, 0.03325006785035672, 0.029294040258265066, 0.025022047578001694, 0.022988676841297, 0.02287429649427522, 0.01976742820283245, 0.017072564782357164, 0.013333228364142647, 0.01218049312555555]),
 Row(topic=1, termIndices=[80, 18, 97, 98, 185, 87, 248, 200, 213, 218], termWeights=[0.03966836555428482, 0.023498053688581017, 0.023110791287546038, 0.01820705816781233, 0.017755978874606282, 0.017526142838800676, 0.016823713644994544, 0.014575154300227626, 0.011005500525655593, 0.010180457903229566]),
 Row(topic=2, termIndices=[11, 15, 54, 42, 61, 155, 21, 113, 355, 273], termWeights=[0.14878733232758246, 0.14426565975729713, 0.07315869432666405, 0.053354602878010714, 0.04088390054751191, 0.02907097424591972, 0.014492074421107537, 0.011883842381465115, 0.011235836953864277, 0.009486876664344489]),
 Row(topic=3, termIndices=[88, 19, 58, 123, 30, 186, 10, 287, 108, 23], termWeights=[0.0383415876