In [1]:
#initial imports
import csv
import pandas as pd
import numpy as np

import pyspark as ps
from src.nltk_pipe import indexing_pipeline
from pyspark.ml.clustering import LDA

In [2]:
df_total = pd.read_pickle('data/reviews_unique.pkl')

In [3]:
spark_df = spark.createDataFrame(df_total)

In [6]:
spark_df.show(2)

+--------------------+--------------------+
|         attractions|             reviews|
+--------------------+--------------------+
|Sydney_Harbour-Sy...|Apart from the ma...|
|Australian_War_Me...|This is a must vi...|
+--------------------+--------------------+
only showing top 2 rows



In [5]:
df_total_nltk, ls_nltk = indexing_pipeline(spark_df)

In [6]:
df_total_nltk.persist()

DataFrame[attractions: string, reviews: string, bow: array<string>, vector_tf: vector, features: vector]

In [4]:
nltk_df = pd.read_pickle('data/df_total_nltk.pkl')

In [2]:
nltk_ls = pd.read_pickle('data/ls_nltk.pkl')

In [3]:
nltk_ls

[u'more',
 u'place',
 u'great',
 u'time',
 u'beauti',
 u'day',
 u'tour',
 u'park',
 u'good',
 u'visit',
 u'museum',
 u'view',
 u'lot',
 u'nice',
 u'mani',
 u'guid',
 u'amaz',
 u'garden',
 u'histori',
 u'experi',
 u'hour',
 u'citi',
 u'area',
 u'peopl',
 u'interest',
 u'trip',
 u'build',
 u'walk',
 u'best',
 u'old',
 u'way',
 u'other',
 u'water',
 u'wonder',
 u'much',
 u'anim',
 u'year',
 u'templ',
 u'worth',
 u'littl',
 u'top',
 u'thank',
 u'small',
 u'site',
 u'famili',
 u'differ',
 u'part',
 u'ride',
 u'kid',
 u'world',
 u'beach',
 u'thing',
 u'first',
 u'zoo',
 u'shop',
 u'ticket',
 u'few',
 u'restaur',
 u'free',
 u'fun',
 u'art',
 u'church',
 u'food',
 u'show',
 u'island',
 u'friend',
 u'inform',
 u'exhibit',
 u'tourist',
 u'hous',
 u'easi',
 u'town',
 u'big',
 u'bit',
 u'impress',
 u'attract',
 u'sure',
 u'excel',
 u'architectur',
 u'fantast',
 u'love',
 u'palac',
 u'boat',
 u'minut',
 u'review',
 u'local',
 u'full',
 u'night',
 u'huge',
 u'photo',
 u'car',
 u'life',
 u'staff',
 u

In [8]:
spark_nltk_df = spark.createDataFrame(nltk_df)

In [9]:
spark_nltk_df.persist()

DataFrame[attractions: string, reviews: string, bow: array<string>, vector_tf: vector, features: vector]

In [19]:
import pickle

In [21]:
f = 'data/ls_nltk.pkl'
with open(f, 'wb') as myfile:
    pickle.dump(ls_nltk, myfile)

## Testing spark LDA

In [14]:
sample_df = df_total.sample(frac=0.01, replace=False)

In [15]:
spark_sample_df = spark.createDataFrame(sample_df)

In [16]:
sample_nltk_df, sample_ls = indexing_pipeline(spark_sample_df)

In [16]:
lda = LDA(k=3, optimizer="em")

In [17]:
model = lda.fit(sample_nltk_df)

In [18]:
topics = model.describeTopics().collect()

In [19]:
for topic in topics:
    print("- TOPIC {} -".format(topic[0]))
    topickeys = topic[1]
    topicvalues = topic[2]
    for i in range(len(topickeys)):
        print("  - word '{}': {}".format(sample_ls[topickeys[i]],topicvalues[i]))

- TOPIC 0 -
  - word 'waterfal': 0.107076734088
  - word 'fall': 0.0225443648848
  - word 'island': 0.0176960065402
  - word 'wet': 0.0172834616148
  - word 'fort': 0.0160332361729
  - word 'el': 0.0139192917059
  - word 'ferri': 0.0121036043839
  - word 'waterproof': 0.0092313432661
  - word 'fortress': 0.00919330767031
  - word 'lighthous': 0.00745761548216
- TOPIC 1 -
  - word 'museum': 0.0263856076058
  - word 'palac': 0.0224564929425
  - word 'pragu': 0.018527278134
  - word 'garden': 0.0174302952708
  - word 'port': 0.0170167094791
  - word 'collect': 0.0127449514114
  - word 'mosaic': 0.0122871777836
  - word 'histori': 0.0105498433234
  - word 'josi': 0.00991788532738
  - word 'concert': 0.0097557943973
- TOPIC 2 -
  - word 'bridg': 0.0394331900272
  - word 'pragu': 0.0268651564562
  - word 'disney': 0.0242966701864
  - word 'beach': 0.0174013199385
  - word 'charl': 0.0151300901016
  - word 'ride': 0.00961449603691
  - word 'fort': 0.009322728664
  - word 'restaur': 0.00931595

In [34]:
m = model.topicsMatrix().toArray()

In [35]:
m.shape

(3096, 3)

In [36]:
len(sample_ls)

3096

In [38]:
sample_ls[:5]

[u'more', u'place', u'great', u'time', u'beauti']

In [40]:
for i in m:
    print i

[  55.3903943   101.9993294    73.07104798]
[  28.35065615   78.60658886  114.93200096]
[ 14.90557432  36.70174817  37.69430304]
[ 0.  0.  0.]
[ 46.62959495  69.87793315  48.27631839]
[ 30.35290458  58.87363925  74.92513957]
[  1.28108485e+00   8.88182421e+02   7.15788172e-01]
[  46.77667469  233.95496797   16.71871124]
[  10.92532999  355.12486675   73.73000913]
[ 29.33871245  19.60558233  78.22584762]
[  1.80417902e+03   4.85724223e-04   3.90784478e-04]
[  1.12870234e+00   4.39175795e-02   9.94835406e+02]
[ 10.003448     8.57858445  36.55825901]
[ 17.85821472  30.58391453  58.49879414]
[  7.2624596   13.55421994  31.24601426]
[ 28.41723245  66.59187154  68.80997695]
[ 24.9230352   96.26412584  36.45620057]
[ 14.49034632  33.22215938  45.1101086 ]
[  33.61210135  258.66728501    5.54419182]
[  43.02419855    0.85529805  191.44443867]
[ 21.90957221  76.86081951  34.00757368]
[ 25.06243035  25.11428976  74.15026096]
[   0.94635799  173.97324683   45.1571806 ]
[  14.36511076  195.1499825

[ 1.24160591  7.81762915  9.84131301]
[  1.17138391   3.7312636   33.9407711 ]
[  3.50779634e+01   1.24023218e-03   2.44699991e+01]
[  0.29316033  41.98660502   2.26722841]
[  0.07548531  18.14849826   6.72931494]
[ 15.66704194   0.55228047   5.30280962]
[  2.49744535  30.00383952   5.29231161]
[  0.77517549  12.9399835    7.80697304]
[ 15.38051799   8.54479037   1.02799014]
[ 7.88232954  8.43499612  2.07239679]
[ 0.63486677  6.92279931  5.2826319 ]
[  3.13550573  16.20556465   5.61222812]
[  6.43437678  12.97035508   9.3415452 ]
[  0.58259173  28.53502001   3.8688546 ]
[  3.06955148  53.8362812    1.03393217]
[ 3.0546425   5.68726016  6.76628232]
[  6.17379589   1.92121321  23.97516651]
[ 0.9912108   9.48655864  7.40112738]
[  1.91725366e-02   3.43675156e+01   1.41336145e+01]
[  3.61477108e-02   1.99113729e-03   5.62921881e+01]
[  0.83517787   3.2670682   10.97515599]
[  0.08202002   7.33304257  20.53270678]
[  2.16759495   0.54485513  53.61787685]
[  2.28811565  22.15160579   3.50804

[ 6.50111093  3.50265433  1.90801426]
[  0.01278446   3.36639732  12.27246468]
[  7.78910762e-03   4.73067757e+00   1.09131798e+01]
[  5.31997370e-03   1.44977297e+01   1.14859678e+00]
[ 0.22197593  9.09310043  4.33261126]
[  2.30454727e+01   1.29068965e-03   1.61579638e+00]
[ 5.00620359  3.57163667  0.43307308]
[  1.80158011e+01   2.86320778e-03   3.16237098e-03]
[  8.38442893e-01   8.38808047e-03   2.38157288e+01]
[  0.41850495  20.39256136   0.11162655]
[ 2.24467591  4.50062892  5.16647468]
[ 3.02100132  3.78565174  2.20426028]
[  0.17473395   3.42481226  12.05210025]
[ 2.02974757  3.0436033   5.30724919]
[  0.04861066   1.21534886  16.75786717]
[ 8.15826791  1.17486326  2.57864835]
[ 4.08775482  4.89961369  2.924411  ]
[ 0.01592405  8.4900182   5.14174537]
[ 2.40518527  4.48076207  3.49465271]
[  1.65409487e-01   1.78516922e+01   4.72498238e-03]
[  0.42447107  20.38750999   0.1107118 ]
[  0.78398754  12.67763301   2.19002591]
[ 11.1781253    4.10748998   2.73621141]
[ 2.84523957  2

[  1.35131657e-03   1.13799905e+01   1.37812635e-03]
[ 0.27857365  4.99475108  1.9505121 ]
[ 2.73975179  1.23419338  4.34382099]
[  2.09154463e+00   7.56063855e+00   4.44429485e-03]
[  2.27886709e-03   1.13780920e+01   2.34906814e-03]
[ 1.97091595  2.55643897  2.6964819 ]
[ 0.315952    4.65085115  3.35096301]
[ 1.28169627  1.09306497  4.84907559]
[  3.14822823e+00   6.50311754e+00   5.28170771e-03]
[ 0.02967644  6.41269079  3.21426024]
[ 3.59827329  1.27958615  3.43990673]
[ 0.8032816   8.59570705  0.25763883]
[ 1.92219548  0.72534517  5.67022552]
[  2.82314357e-03   9.65199328e+00   1.81104680e-03]
[ 0.03034493  6.3964665   3.22981605]
[ 2.42096771  5.96321513  1.27244463]
[  5.76778788e+00   7.15793145e-04   5.61421623e+00]
[  7.03415682e-03   3.83447026e+00   7.54121550e+00]
[ 2.48943307  1.13509322  3.59931054]
[  1.66372343e-03   1.75500584e+00   9.62605035e+00]
[ 1.94952642  3.65861477  2.70962498]
[ 0.01249674  4.1147296   4.19053982]
[ 0.2230285   3.66472592  3.3360824 ]
[ 1.99

[ 0.03046779  3.20127065  1.5965753 ]
[  3.77015184e+00   1.28028247e-03   1.91992783e+00]
[ 1.09303362  3.72887206  0.00640806]
[ 0.14283241  3.67921112  1.86931642]
[ 0.51337547  0.00800682  5.16997766]
[  1.07579359e-03   5.68865600e+00   1.62815779e-03]
[  2.03170068e-03   3.24535245e+00   1.58092959e+00]
[  3.45957950e-03   6.21856814e-04   5.68727852e+00]
[  1.41551293e-02   5.67581158e+00   1.39324842e-03]
[ 0.04383028  3.74719206  1.90033762]
[  3.45957949e-03   6.21856814e-04   5.68727852e+00]
[  1.41551192e-02   5.67581159e+00   1.39324840e-03]
[ 5.65670062  0.01317789  0.02148144]
[  1.40203887e-01   4.20036929e-03   5.54695570e+00]
[ 0.03978067  1.84088881  3.81069047]
[ 2.40796766  3.21453152  0.06886077]
[  1.67150811e+00   7.97915136e-04   3.15600771e+00]
[ 2.27406802  0.02900189  2.52524383]
[  6.96336980e-03   2.04880804e-03   5.68234778e+00]
[ 2.37006856  0.8673677   1.59087748]
[ 0.01288857  1.8536085   3.82486289]
[ 0.02818972  3.196369    1.60375502]
[ 0.12700585  

[ 2.48878415  1.23059931  0.07485651]
[  3.08745386e-03   3.78989859e+00   1.25392975e-03]
[ 1.27340345  2.02306498  0.49777154]
[  4.34118478e-03   6.49528151e-04   3.78924926e+00]
[ 0.01260647  1.87931863  1.90231486]
[ 0.23770352  0.00800638  3.54853006]
[  1.67399228e-02   3.77590282e+00   1.59722617e-03]
[  3.08745386e-03   3.78989859e+00   1.25392975e-03]
[  2.67775713e-03   3.78991887e+00   1.64333849e-03]
[ 0.00888201  1.88879553  1.89656243]
[ 1.58096979  2.00133982  0.21193036]
[ 0.01262037  1.87931428  1.90230531]
[  6.71834555e-03   2.68458139e-03   3.78483704e+00]
[  1.19693859e-03   3.79155909e+00   1.48393946e-03]
[ 0.10431539  1.80002727  1.88989731]
[ 1.67957791  2.10922792  0.00543414]
[ 0.05035642  1.91912973  1.82475381]
[ 0.11996511  1.78678278  1.88749208]
[  3.78578940e+00   6.01143356e-03   2.43913387e-03]
[  2.67775713e-03   3.78991887e+00   1.64333849e-03]
[ 1.65353805  2.10342882  0.0372731 ]
[  1.19693859e-03   3.79155909e+00   1.48393946e-03]
[  4.34118479e

In [43]:
transformed = model.transform(sample_nltk_df)

In [51]:
a = transformed.select('topicDistribution')

In [52]:
type(a)

pyspark.sql.dataframe.DataFrame

In [56]:
a.toPandas()

Unnamed: 0,topicDistribution
0,"[0.0115302494764, 0.0103710479144, 0.978098702..."
1,"[0.749023191888, 0.18992226399, 0.0610545441217]"
2,"[0.306404484301, 0.31001866227, 0.383576853429]"
3,"[0.989166343067, 0.00382556298841, 0.007008093..."
4,"[0.0144522292225, 0.971510984714, 0.0140367860..."
5,"[0.0122320523366, 0.00769150042494, 0.98007644..."
6,"[0.629540431277, 0.0828334456175, 0.287626123105]"
7,"[0.186149866002, 0.2129429922, 0.600907141799]"
8,"[0.931372291134, 0.0191126200627, 0.0495150888..."
9,"[0.00649468183311, 0.0206698656633, 0.97283545..."


In [57]:
b = _

In [70]:
b['max'] = b.topicDistribution.apply(np.argmax)

In [71]:
b['max']

0     2
1     0
2     2
3     0
4     1
5     2
6     0
7     2
8     0
9     2
10    1
11    0
12    1
13    2
14    2
15    0
16    1
17    1
18    1
Name: max, dtype: int64

## Testing the pipeline

In [5]:
from src.lda_pipe import save_to_s3, lda_model_score, print_topics, to_pickle

In [17]:
type(spark_sample_df)

pyspark.sql.dataframe.DataFrame

In [18]:
spark_sample_df.persist()

DataFrame[attractions: string, reviews: string]

In [19]:
#sample_nltk_df, sample_ls
sample_nltk_df.persist()

DataFrame[attractions: string, reviews: string, bow: array<string>, vector_tf: vector, features: vector]

In [20]:
lda, model, topics, transformed = lda_model_score(sample_nltk_df, 4)

In [21]:
print_topics(topics, sample_ls)

- TOPIC 0 -
  - word 'zoo': 0.0932548620658
  - word 'rickshaw': 0.0693146603084
  - word 'anim': 0.0433619749504
  - word 'cathedr': 0.0385900403969
  - word 'dear': 0.0250117964688
  - word 'kyoto': 0.0211140409596
  - word 'church': 0.0155085150807
  - word 'lima': 0.0143645463891
  - word 'thank': 0.0139644389732
  - word 'bamboo': 0.0132319329591
- TOPIC 1 -
  - word 'templ': 0.0497711832453
  - word 'miyajima': 0.0389182912482
  - word 'island': 0.0341893726521
  - word 'deer': 0.0267501927996
  - word 'ferri': 0.0267026642301
  - word 'shrine': 0.023790289556
  - word 'jr': 0.0214090369797
  - word 'hong': 0.0186842555288
  - word 'kong': 0.0185061761436
  - word 'garden': 0.0180525199746
- TOPIC 2 -
  - word 'beach': 0.083004488557
  - word 'arch': 0.0413793829113
  - word 'pragu': 0.0206591748725
  - word 'concert': 0.0201020018016
  - word 'municip': 0.0149893821865
  - word 'hall': 0.0144158909909
  - word 'art': 0.0119458405125
  - word 'hike': 0.0115028756586
  - word 'san

In [85]:
to_pickle(transformed, 'test.pkl')

Unnamed: 0,attractions,reviews,bow,vector_tf,features,topicDistribution
0,Tokyo_Disneyland-Urayasu_Chiba_Prefecture_Kanto,We love Tokyo Disneyland. We had a 3-day pass ...,"[tokyo, disneyland, 3-day, pass, time, round, ...","(635.0, 201.0, 145.0, 304.0, 33.0, 317.0, 0.0,...","(32.5712419361, 21.1774636472, 7.43752768619, ...","[0.00722041109051, 0.106651777385, 0.016021987..."
1,Bet_She_an_National_Park-Beit_She_an_Northern_...,Interesting piece of history close to Jordan ...,"[piec, histori, close, jordan, border, hour, h...","(57.0, 22.0, 14.0, 15.0, 2.0, 11.0, 0.0, 11.0,...","(2.92371778009, 2.31793134447, 0.718106121426,...","[0.0420580089135, 0.0413411018979, 0.222497497..."
2,Molhe_Barra_Sul-Balneario_Camboriu_State_of_Sa...,Nice view _Ù÷ great for walking..at day and n...,"[nice, view, _u, great, walking..at, day, nigh...","(0.0, 0.0, 3.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.153879883163, 0.0, 0.0, 0.1053605...","[0.224604622584, 0.24496575215, 0.266211375942..."
3,Seljalandsfoss-South_Region,I have to echo what other reviewers have said ...,"[other, review, anywher, truli, waterfal, comp...","(459.0, 96.0, 115.0, 89.0, 278.0, 101.0, 0.0, ...","(23.5436221239, 10.1146095032, 5.89872885457, ...","[0.00283150792066, 0.989770015213, 0.003448263..."
4,The_Giant_s_House-Akaroa_Canterbury_Region_Sou...,Absolutely incredible. Her work is out of this...,"[incred, work, world, make, sure, time, great,...","(370.0, 230.0, 86.0, 109.0, 152.0, 64.0, 2.0, ...","(18.9785189234, 24.2329186013, 4.41122331733, ...","[0.979210070855, 0.0054740134963, 0.0085136748..."
5,The_Walk_At_JBR-Dubai_Emirate_of_Dubai,As the weather in September is still hot I re...,"[weather, septemb, hot, beach, even, mani, mor...","(296.0, 479.0, 207.0, 123.0, 75.0, 96.0, 0.0, ...","(15.1828151387, 50.4676870001, 10.6177119382, ...","[0.00530330268404, 0.0113053991565, 0.83967113..."
6,El_Morro-Havana_Ciudad_de_la_Habana_Province_Cuba,A great view from the cruise ship on the way ...,"[great, view, cruis, ship, way, havana, great,...","(381.0, 206.0, 252.0, 118.0, 93.0, 108.0, 52.0...","(19.5427451617, 21.7042662255, 12.9259101857, ...","[0.00897887243706, 0.882010682535, 0.091321050..."
7,Djura_Jaksic_Monument-Novi_Sad_Vojvodina,Dunavski park is in a city center and it's de...,"[dunavski, park, citi, center, natur, relax, r...","(5.0, 1.0, 3.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, ...","(0.256466471938, 0.105360515658, 0.15387988316...","[0.168042981316, 0.142194944816, 0.16445670749..."
8,Udo-Jeju_Jeju_Island,"This is a perfect one-day trip from Jeju, I ca...","[perfect, one-day, trip, jeju, season, midweek...","(174.0, 65.0, 29.0, 37.0, 66.0, 64.0, 1.0, 41....","(8.92503322343, 6.84843351776, 1.48750553724, ...","[0.0115707033225, 0.0720802933086, 0.899410486..."
9,Charles_Bridge-Prague_Bohemia,Wonderful views over to the castle area. Best ...,"[wonder, view, castl, area, best, earli, day, ...","(282.0, 176.0, 184.0, 206.0, 282.0, 199.0, 7.0...","(14.4647090173, 18.5434507558, 9.43796616731, ...","[0.0127849394977, 0.00455967748244, 0.00524998..."


## the entire dataframe!

In [6]:
from src.lda_pipe import save_to_s3, lda_model_score, print_topics, to_pickle

In [None]:
#nltk_df = pd.read_pickle('data/df_total_nltk.pkl')
#nltk_ls = pd.read_pickle('data/ls_nltk.pkl')

In [7]:
spark_nltk_df = spark.createDataFrame(nltk_df)

In [38]:
lda, model, topics, transformed = lda_model_score(spark_nltk_df, 20)

In [39]:
print_topics(topics, nltk_ls)

- TOPIC 0 -
  - word 'palac': 0.0391437436093
  - word 'castl': 0.0347835681738
  - word 'fort': 0.0181264415785
  - word 'seoul': 0.0176783234263
  - word 'copenhagen': 0.0137147294718
  - word 'korean': 0.0124891402121
  - word 'pragu': 0.0112232395745
  - word 'korea': 0.009791533048
  - word 'parliament': 0.00889395480336
  - word 'garden': 0.00885978923237
- TOPIC 1 -
  - word 'dolphin': 0.0414207108434
  - word 'aquarium': 0.0375584337744
  - word 'shark': 0.0149105264004
  - word 'dubai': 0.0134925454508
  - word 'abu': 0.0128715646645
  - word 'bucharest': 0.0111309079173
  - word 'dhabi': 0.0110462810369
  - word 'klcc': 0.0100862084927
  - word 'potter': 0.00916688159209
  - word 'harri': 0.00892475488562
- TOPIC 2 -
  - word 'glacier': 0.0384854759749
  - word 'kyoto': 0.0152374337302
  - word 'rickshaw': 0.0132583297236
  - word 'london': 0.0126904986694
  - word 'agora': 0.0118448010036
  - word 'milford': 0.0111236976827
  - word 'ebisuya': 0.0101567616223
  - word 'gaudi

In [37]:
p = to_pickle(transformed, 'data/transformed_20.pkl')

In [46]:
topics = model.describeTopics(maxTermsPerTopic=30).collect()

In [47]:
print_topics(topics, nltk_ls)

- TOPIC 0 -
  - word 'palac': 0.0391437436093
  - word 'castl': 0.0347835681738
  - word 'fort': 0.0181264415785
  - word 'seoul': 0.0176783234263
  - word 'copenhagen': 0.0137147294718
  - word 'korean': 0.0124891402121
  - word 'pragu': 0.0112232395745
  - word 'korea': 0.009791533048
  - word 'parliament': 0.00889395480336
  - word 'garden': 0.00885978923237
  - word 'edmonton': 0.00833871621775
  - word 'sculptur': 0.0071207502416
  - word 'museum': 0.00694328825027
  - word 'statu': 0.00642846998394
  - word 'tower': 0.00580757914838
  - word 'audio': 0.00544559765411
  - word 'histori': 0.0052701682644
  - word 'royal': 0.00490155006673
  - word 'danish': 0.00437242321843
  - word 'build': 0.00393276059434
  - word 'lobkowicz': 0.00350768540713
  - word 'art': 0.00344623791097
  - word 'klimt': 0.0033727854594
  - word 'dmz': 0.00336788829287
  - word 'fortress': 0.00332170691339
  - word 'belveder': 0.00320798471762
  - word 'turku': 0.00312348519642
  - word 'architectur': 0.00

In [49]:
import pickle

In [52]:
model.save("lda_20.model")

In [59]:
model2 = LDA.DistributedLDAModel.load('lda_20.model')

AttributeError: type object 'LDA' has no attribute 'DistributedLDAModel'