In [1]:
#initial imports
import csv

import numpy as np
import pandas as pd
import pyspark as ps    # for the pyspark suite

from src.nltk_pipe import indexing_pipeline

# Build (or reuse) a Spark session running in local mode.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("df JulesVerne")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

In [2]:
# Load the raw reviews into pandas (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/reviews.csv")

In [3]:
# Force both columns to plain strings before handing the frame to Spark.
# Missing values are replaced with '' first: astype(str) on its own turns
# NaN into the literal text 'nan', which would then flow into the text
# pipeline as if it were a real word.
# Bracket assignment is used because attribute-style assignment only works
# for columns that already exist and silently creates a plain attribute otherwise.
df['reviews'] = df['reviews'].fillna('').astype(str)
df['attractions'] = df['attractions'].fillna('').astype(str)

In [5]:
# Sanity check after the cast: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001171 entries, 0 to 1001170
Data columns (total 2 columns):
attractions    1001171 non-null object
reviews        1001171 non-null object
dtypes: object(2)
memory usage: 15.3+ MB


## Loading Reviews into a Spark Dataframe

In [6]:
# Convert the whole pandas frame into a Spark DataFrame (schema is inferred).
spark_df = spark.createDataFrame(data=df)

In [7]:
# Preview the first rows; long cell values are truncated for display.
spark_df.show(n=20, truncate=True)

+--------------------+--------------------+
|         attractions|             reviews|
+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|
|Sydney_Harbour-Sy...|Iconic sightseein...|
|Sydney_Harbour-Sy...|Beautiful harbour...|
|Sydney_Harbour-Sy...|We took a tour wi...|
|Sydney_Harbour-Sy...|Visit the Harbor ...|
|Sydney_Harbour-Sy...|This area, called...|
|Sydney_Harbour-Sy...|enjoy a champagne...|
|Sydney_Harbour-Sy...|went on a holiday...|
|Sydney_Harbour-Sy...|The Sydney Harbou...|
|Sydney_Harbour-Sy...|I have always wan...|
|Sydney_Harbour-Sy...|Always special, l...|
|Sydney_Harbour-Sy...|A must see while ...|
|Sydney_Harbour-Sy...|Don't really know...|
|Sydney_Harbour-Sy...|Loved our visit t...|
|Sydney_Harbour-Sy...|Recommend catchin...|
|Sydney_Harbour-Sy...|I have seen a num...|
|Sydney_Harbour-Sy...|Located by many g...|
|Sydney_Harbour-Sy...|Sydney Harbour is...|
|Sydney_Harbour-Sy...|Fantastic trip. B...|
|Sydney_Harbour-Sy...|So love Sy

## Let's try Sydney, Australia

In [8]:
# Keep only the reviews written for the Sydney Harbour attraction.
df_sydney = spark_df.where(
    spark_df["attractions"] == "Sydney_Harbour-Sydney_New_South_Wales"
)

In [9]:
# Run the project's text-indexing pipeline on the Sydney reviews.
# `indexing_pipeline` is imported in the notebook's top import cell.
# Judging by the outputs below, it returns the input DataFrame extended with
# `bow`, `vector_tf` and `features` columns, plus the vocabulary as a list of
# stemmed tokens — confirm against src/nltk_pipe.py.
df_sydney_, ls_sydney = indexing_pipeline(df_sydney)

In [11]:
# Vocabulary size for Sydney; the vector columns shown further down have the same length.
len(ls_sydney)

781

In [14]:
# First 15 vocabulary entries (stemmed tokens).
ls_sydney[:15]

[u'harbour',
 u'sydney',
 u'more',
 u'ferri',
 u'opera',
 u'hous',
 u'bridg',
 u'view',
 u'beauti',
 u'great',
 u'place',
 u'day',
 u'time',
 u'harbor',
 u'cruis']

In [16]:
# Inspect which columns the pipeline added to the original two.
df_sydney_.printSchema()

root
 |-- attractions: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- bow: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vector_tf: vector (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
# Peek at the first ten processed rows.
df_sydney_.show(n=10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         attractions|             reviews|                 bow|           vector_tf|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|[mass, tourist, p...|(781,[9,51,55,62,...|(781,[9,51,55,62,...|
|Sydney_Harbour-Sy...|Iconic sightseein...|[icon, sightse, h...|(781,[0,1,4,5,6,2...|(781,[0,1,4,5,6,2...|
|Sydney_Harbour-Sy...|Beautiful harbour...|[beauti, harbour,...|(781,[0,8,17,18,2...|(781,[0,8,17,18,2...|
|Sydney_Harbour-Sy...|We took a tour wi...|[tour, graylin, l...|(781,[0,2,15,24,3...|(781,[0,2,15,24,3...|
|Sydney_Harbour-Sy...|Visit the Harbor ...|[harbor, multipl,...|(781,[1,2,4,5,6,7...|(781,[1,2,4,5,6,7...|
|Sydney_Harbour-Sy...|This area, called...|[area, circular, ...|(781,[1,2,11,14,1...|(781,[1,2,11,14,1...|
|Sydney_Harbour-Sy...|enjoy a champag

In [25]:
# Full (untruncated) bag-of-words for the first review.
bow_only = df_sydney_.select("bow")
bow_only.show(n=1, truncate=False)

+-------------------------------------------------------------------------+
|bow                                                                      |
+-------------------------------------------------------------------------+
|[mass, tourist, popul, great, attract, australia, worth, domest, tourist]|
+-------------------------------------------------------------------------+
only showing top 1 row



In [26]:
# Full (untruncated) term-count vector for the same first review.
tf_only = df_sydney_.select("vector_tf")
tf_only.show(n=1, truncate=False)

+--------------------------------------------+
|vector_tf                                   |
+--------------------------------------------+
|(781,[9,51,55,62,111],[1.0,2.0,1.0,1.0,1.0])|
+--------------------------------------------+
only showing top 1 row



In [27]:
# Original text of the first review, for comparison with its bag-of-words.
text_only = df_sydney_.select("reviews")
text_only.show(n=1, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------+
|reviews                                                                                                                                               |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|Apart from the mass tourist population, this is one of the great attractions of Australia. Well worth going to see even if you are a domestic tourist!|
+------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



## Let's try Angkor Thom, Cambodia

In [17]:
# Keep only the reviews written for the Angkor Thom attraction.
df_angkor = spark_df.where(
    spark_df["attractions"] == "Angkor_Thom-Siem_Reap_Siem_Reap_Province"
)

In [18]:
# Same indexing pipeline as for Sydney, applied to the Angkor Thom reviews.
df_angkor_, ls_angkor = indexing_pipeline(df_angkor)

In [20]:
# First 15 vocabulary entries for Angkor Thom (stemmed tokens).
ls_angkor[:15]

[u'templ',
 u'angkor',
 u'more',
 u'thom',
 u'wat',
 u'place',
 u'mani',
 u'time',
 u'day',
 u'guid',
 u'great',
 u'beauti',
 u'bayon',
 u'amaz',
 u'citi']

## Now on the entire dataframe, fingers crossed!

In [28]:
# Run the pipeline over every attraction at once (about 1M reviews).
df_, ls_ = indexing_pipeline(spark_df)
# NOTE: this run was interrupted on the local[4] session (see the
# KeyboardInterrupt output); doing this on AWS instead.

KeyboardInterrupt: 