# Tourist satisfaction with transport systems to Mount Etna, Sicily. Part 3. Predict.

# Optional Colab bootstrap: mounts Google Drive (this prompts you to connect
# the notebook with your Google account) and points base_dir at the project
# folder stored there.
# Outside Colab the google.colab package cannot be imported, so the cell turns
# into a no-op instead of aborting a "Restart & Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab

if drive is not None:
    drive.mount('/content/gdrive', force_remount=True)
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'Academy/+ Papers/en_proceso/mount_etna/'


In [1]:
# Local fallback: project files live one directory above this notebook.
# Skipped when running on Colab so that a Drive-based base_dir is not
# overwritten when the notebook is executed top to bottom there.
import sys

if 'google.colab' not in sys.modules:
    base_dir = '../'  # To run locally

# 1. Data pre-processing

In [2]:
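# Uncomment on a fresh environment. pycaret.nlp (used below) is only shipped with PyCaret 2.x, so keep the version below 3.
#%pip install "pycaret[full]<3"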

## Importing libraries

In [3]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import imblearn
from pycaret.classification import *
import spacy
#spacy.load("en_core_web_sm")


# NLTK
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet, stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

import warnings

# Silence numpy's floating-point warnings for log(0) and division by zero.
np.seterr(divide='ignore')

# Hide FutureWarning and DeprecationWarning messages raised by the libraries.
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

print('Libraries read!')

Libraries read!


[nltk_data] Downloading package stopwords to /home/juan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/juan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/juan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/juan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/juan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Reading models

In [4]:
# Load the three persisted models in a fixed order: the title topic model,
# the review topic model and the tuned classifier. Paths are relative to
# base_dir.
model_title, model_review, model_classification = (
    load_model(base_dir + relative_path)
    for relative_path in (
        'models/model_title_jupyter',
        'models/model_review_jupyter',
        'models/tuned_nb',
    )
)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


## Reading data

In [5]:
# Read the reviews to score and discard the rows that have no review text.
reviews_csv = base_dir + 'predict/data_to_predict.csv'
data = pd.read_csv(reviews_csv).dropna(subset=['content'])

print('Data shape: ', data.shape)
data.head()

Data shape:  (17, 2)


Unnamed: 0,content,title
0,"East side of the sicilian island, in the Catan...",Etna🇮🇹
1,August 2008. Arrived at Refugio Sapienza just ...,"A hard climb, but worth it"
2,Mount Etna is extraordinary. It is an active v...,You must see an active volcano at least once i...
3,How much does the cable car cost from Rifugio ...,Cost of cable car to the top???
4,It's always a great and sensational emotion to...,Wonderful Etna


## Filtering comments according to keywords

In [6]:
# Regular expression used to decide whether a review talks about transport.
# Every keyword is anchored on word boundaries so that it only matches whole
# words (plus plural / derived forms). A bare alternation such as 'car|bus|rail'
# also fires on unrelated words like "care", "scary", "business" or "trail".
# The pattern uses only non-capturing groups, so pandas' str.contains does not
# warn about match groups.
# NOTE(review): if the training notebooks filtered reviews with a plain
# substring pattern, apply the same pattern there too — TODO confirm.
TRANSPORT_KEYWORDS = (
    r'\b(?:'
    r'transport\w*'              # transport, transportation, transported, ...
    r'|rail\w*'                  # rail, rails, railway, railroad
    r'|cable\w*'                 # cable, cables, cableway, cablecar
    r'|cars?'                    # car, cars
    r'|(?:mini)?bus(?:es|ses)?'  # bus, buses, busses, minibus
    r'|mobility'
    r'|(?:motor)?bik(?:e|es|ing)'  # bike, bikes, biking, motorbike
    r')\b'
)

In [7]:
# Filtering: keep only the reviews whose text matches a transport keyword
# (case-insensitive regex search; rows with missing text count as no match).
# The boolean mask lives in its own variable instead of being written into
# `data` as a temporary column, so the frame is never mutated in place.
is_transport_related = data['content'].str.contains(TRANSPORT_KEYWORDS, case=False, na=False)
data = data[is_transport_related].reset_index(drop=True)

# New dataset
print('Data shape: ', data.shape)
data.head(10)

Data shape:  (10, 2)


Unnamed: 0,content,title
0,August 2008. Arrived at Refugio Sapienza just ...,"A hard climb, but worth it"
1,Mount Etna is extraordinary. It is an active v...,You must see an active volcano at least once i...
2,How much does the cable car cost from Rifugio ...,Cost of cable car to the top???
3,We visited Etna with two small children in Aug...,"Well orgainised, easy access"
4,We were able to experience Mt Etna without spe...,Cheap and easy way to do Mt Etna
5,We would like to visit Etna. We have a 3 year ...,Question about Etna Cable Cars and children
6,Mount Etna is a stunning day out from Catania....,"Mount Etna - AST bus from Catania, cable car a..."
7,"If you are in good condition, you can climb th...",walk mount etna on your own
8,We stayed at Nicolosi 25 k's from Mt Etna. It ...,Stunning
9,"Mt Etna is of course a ""must visit"", but be wa...",Watch out for the clouds...


In [8]:
# Custom stop-word list passed to the NLP setup calls below: NLTK's English
# stop words, copied into a plain list.
stopwords_corpus = nltk.corpus.stopwords
eng_stop_words = stopwords_corpus.words('english')
noise_words = [*eng_stop_words]

In [9]:
# Bounded preview of the filtered reviews (avoids dumping the whole frame
# if the input file grows).
data.head(10)

Unnamed: 0,content,title
0,August 2008. Arrived at Refugio Sapienza just ...,"A hard climb, but worth it"
1,Mount Etna is extraordinary. It is an active v...,You must see an active volcano at least once i...
2,How much does the cable car cost from Rifugio ...,Cost of cable car to the top???
3,We visited Etna with two small children in Aug...,"Well orgainised, easy access"
4,We were able to experience Mt Etna without spe...,Cheap and easy way to do Mt Etna
5,We would like to visit Etna. We have a 3 year ...,Question about Etna Cable Cars and children
6,Mount Etna is a stunning day out from Catania....,"Mount Etna - AST bus from Catania, cable car a..."
7,"If you are in good condition, you can climb th...",walk mount etna on your own
8,We stayed at Nicolosi 25 k's from Mt Etna. It ...,Stunning
9,"Mt Etna is of course a ""must visit"", but be wa...",Watch out for the clouds...


# 3. NLP for title

In [10]:
# PyCaret NLP: preprocess the review titles.
# Only the two NLP helpers this notebook calls are imported. A wildcard import
# at this point would silently replace any same-named functions already
# brought in by `from pycaret.classification import *`.
from pycaret.nlp import setup, assign_model

exp_name = setup(
    data=data[['title']],
    target='title',
    session_id=42,
    custom_stopwords=noise_words,
)

Description,Value
session_id,42
Documents,10
Vocab Size,20
Custom Stopwords,1


In [11]:
# Print a summary of the title topic model that was loaded from disk.
print(model_title)

LdaModel(num_terms=782, num_topics=5, decay=0.5, chunksize=100)


In [12]:
# Score the titles with the loaded title topic model. The result holds the
# processed title, one weight column per topic, the dominant topic and its share.
# NOTE(review): the loaded model reports num_terms=782, while setup() on this
# data reported a vocabulary of only 20 terms. If assign_model scores the
# corpus that setup() encoded, its token ids may not line up with the
# dictionary the model was trained with — TODO confirm (and, if so, re-encode
# the documents with the model's own dictionary before scoring).
predictions_title = assign_model(model_title)  
predictions_title

Unnamed: 0,title,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Dominant_Topic,Perc_Dominant_Topic
0,hard,0.100006,0.599971,0.100008,0.100008,0.100007,Topic 1,0.6
1,must see active volcano least life,0.885385,0.028851,0.028612,0.028576,0.028576,Topic 0,0.89
2,cost cable car top,0.04,0.839998,0.040001,0.040001,0.040001,Topic 1,0.84
3,orgainise easy access,0.050296,0.799664,0.050015,0.050013,0.050013,Topic 1,0.8
4,cheap easy way,0.374993,0.474998,0.050003,0.050003,0.050003,Topic 1,0.47
5,question cable car child,0.040009,0.839689,0.040012,0.040279,0.040011,Topic 1,0.84
6,cable car,0.066667,0.733331,0.066667,0.066667,0.066667,Topic 1,0.73
7,,0.2,0.2,0.2,0.2,0.2,Topic 0,0.2
8,,0.2,0.2,0.2,0.2,0.2,Topic 0,0.2
9,watch cloud,0.066671,0.396501,0.403333,0.066672,0.066822,Topic 2,0.4


In [13]:
# Namespace every column with 'Title_' so the title topics stay
# distinguishable from the review topics once both frames are merged.
# Columns that already carry the prefix are left untouched, which makes the
# cell safe to re-run: a plain add_prefix would produce 'Title_Title_...'
# names on a second run and break the column lookup below.
predictions_title = predictions_title.rename(
    columns=lambda name: name if str(name).startswith('Title_') else f'Title_{name}'
)

# 'Topic 1' -> 'Topic_1'
predictions_title['Title_Dominant_Topic'] = (
    predictions_title['Title_Dominant_Topic'].replace(' ', '_', regex=True)
)
predictions_title

Unnamed: 0,Title_title,Title_Topic_0,Title_Topic_1,Title_Topic_2,Title_Topic_3,Title_Topic_4,Title_Dominant_Topic,Title_Perc_Dominant_Topic
0,hard,0.100006,0.599971,0.100008,0.100008,0.100007,Topic_1,0.6
1,must see active volcano least life,0.885385,0.028851,0.028612,0.028576,0.028576,Topic_0,0.89
2,cost cable car top,0.04,0.839998,0.040001,0.040001,0.040001,Topic_1,0.84
3,orgainise easy access,0.050296,0.799664,0.050015,0.050013,0.050013,Topic_1,0.8
4,cheap easy way,0.374993,0.474998,0.050003,0.050003,0.050003,Topic_1,0.47
5,question cable car child,0.040009,0.839689,0.040012,0.040279,0.040011,Topic_1,0.84
6,cable car,0.066667,0.733331,0.066667,0.066667,0.066667,Topic_1,0.73
7,,0.2,0.2,0.2,0.2,0.2,Topic_0,0.2
8,,0.2,0.2,0.2,0.2,0.2,Topic_0,0.2
9,watch cloud,0.066671,0.396501,0.403333,0.066672,0.066822,Topic_2,0.4


# 4. NLP for review

In [14]:
# Run the NLP setup again, this time on the review text, with the same
# session id and custom stop words as the title setup.
review_setup_args = {
    'data': data[['content']],
    'target': 'content',
    'session_id': 42,
    'custom_stopwords': noise_words,
}
exp_name = setup(**review_setup_args)

Description,Value
session_id,42
Documents,10
Vocab Size,433
Custom Stopwords,1


In [15]:
# Print a summary of the review topic model that was loaded from disk.
print(model_review)

LdaModel(num_terms=5748, num_topics=2, decay=0.5, chunksize=100)


In [16]:
# Score the reviews with the loaded review topic model. The result holds the
# processed text, one weight column per topic, the dominant topic and its share.
# NOTE(review): the loaded model reports num_terms=5748, while setup() on this
# data reported a vocabulary of 433 terms. If assign_model scores the corpus
# that setup() encoded, its token ids may not line up with the dictionary the
# model was trained with — TODO confirm (and, if so, re-encode the documents
# with the model's own dictionary before scoring).
predictions_review = assign_model(model_review)  
predictions_review

Unnamed: 0,content,Topic_0,Topic_1,Dominant_Topic,Perc_Dominant_Topic
0,arrive make sure allow plenty time mountain ro...,0.46845,0.53155,Topic 1,0.53
1,extraordinary active volcano beautiful landsca...,0.055577,0.944423,Topic 1,0.94
2,much cable car cost charge really look receipt...,0.222892,0.777108,Topic 1,0.78
3,small child august use company get see sight h...,0.629173,0.370827,Topic 0,0.63
4,able experience etna spend money tour research...,0.526225,0.473775,Topic 0,0.53
5,would visit old month old know take child youn...,0.843984,0.156016,Topic 0,0.84
6,take follow cable car star deduction bus trip ...,0.499221,0.500779,Topic 1,0.5
7,good etna adult child age feel spend money cab...,0.563484,0.436517,Topic 0,0.56
8,stay take half hour parking place choose go ea...,0.600172,0.399828,Topic 0,0.6
9,course must visit warn day even private tour v...,0.75746,0.24254,Topic 0,0.76


In [17]:
# Namespace every column with 'Review_' so the review topics stay
# distinguishable from the title topics once both frames are merged.
# Columns that already carry the prefix are left untouched, which makes the
# cell safe to re-run: a plain add_prefix would produce 'Review_Review_...'
# names on a second run and break the column lookup below.
predictions_review = predictions_review.rename(
    columns=lambda name: name if str(name).startswith('Review_') else f'Review_{name}'
)

# 'Topic 1' -> 'Topic_1'
predictions_review['Review_Dominant_Topic'] = (
    predictions_review['Review_Dominant_Topic'].replace(' ', '_', regex=True)
)
predictions_review

Unnamed: 0,Review_content,Review_Topic_0,Review_Topic_1,Review_Dominant_Topic,Review_Perc_Dominant_Topic
0,arrive make sure allow plenty time mountain ro...,0.46845,0.53155,Topic_1,0.53
1,extraordinary active volcano beautiful landsca...,0.055577,0.944423,Topic_1,0.94
2,much cable car cost charge really look receipt...,0.222892,0.777108,Topic_1,0.78
3,small child august use company get see sight h...,0.629173,0.370827,Topic_0,0.63
4,able experience etna spend money tour research...,0.526225,0.473775,Topic_0,0.53
5,would visit old month old know take child youn...,0.843984,0.156016,Topic_0,0.84
6,take follow cable car star deduction bus trip ...,0.499221,0.500779,Topic_1,0.5
7,good etna adult child age feel spend money cab...,0.563484,0.436517,Topic_0,0.56
8,stay take half hour parking place choose go ea...,0.600172,0.399828,Topic_0,0.6
9,course must visit warn day even private tour v...,0.75746,0.24254,Topic_0,0.76


# 5. Merging data

In [18]:
# Put the title and review topic features side by side (rows are aligned on
# the index), then keep only the rows where both processed texts are present
# and non-empty.
merged = pd.concat([predictions_title, predictions_review], axis=1)
merged = merged.dropna(subset=['Title_title', 'Review_content'])

has_title = merged['Title_title'].ne('')
has_review = merged['Review_content'].ne('')
data = merged.loc[has_title & has_review].reset_index(drop=True)
data.head()

Unnamed: 0,Title_title,Title_Topic_0,Title_Topic_1,Title_Topic_2,Title_Topic_3,Title_Topic_4,Title_Dominant_Topic,Title_Perc_Dominant_Topic,Review_content,Review_Topic_0,Review_Topic_1,Review_Dominant_Topic,Review_Perc_Dominant_Topic
0,hard,0.100006,0.599971,0.100008,0.100008,0.100007,Topic_1,0.6,arrive make sure allow plenty time mountain ro...,0.46845,0.53155,Topic_1,0.53
1,must see active volcano least life,0.885385,0.028851,0.028612,0.028576,0.028576,Topic_0,0.89,extraordinary active volcano beautiful landsca...,0.055577,0.944423,Topic_1,0.94
2,cost cable car top,0.04,0.839998,0.040001,0.040001,0.040001,Topic_1,0.84,much cable car cost charge really look receipt...,0.222892,0.777108,Topic_1,0.78
3,orgainise easy access,0.050296,0.799664,0.050015,0.050013,0.050013,Topic_1,0.8,small child august use company get see sight h...,0.629173,0.370827,Topic_0,0.63
4,cheap easy way,0.374993,0.474998,0.050003,0.050003,0.050003,Topic_1,0.47,able experience etna spend money tour research...,0.526225,0.473775,Topic_0,0.53


# 6. Classification model

In [19]:
# Apply the tuned classifier to the merged topic features; the displayed
# frame shows the input columns followed by 'Label' and 'Score'.
predictions = predict_model(model_classification, data=data)
predictions

Unnamed: 0,Title_title,Title_Topic_0,Title_Topic_1,Title_Topic_2,Title_Topic_3,Title_Topic_4,Title_Dominant_Topic,Title_Perc_Dominant_Topic,Review_content,Review_Topic_0,Review_Topic_1,Review_Dominant_Topic,Review_Perc_Dominant_Topic,Label,Score
0,hard,0.100006,0.599971,0.100008,0.100008,0.100007,Topic_1,0.6,arrive make sure allow plenty time mountain ro...,0.46845,0.53155,Topic_1,0.53,1,0.5481
1,must see active volcano least life,0.885385,0.028851,0.028612,0.028576,0.028576,Topic_0,0.89,extraordinary active volcano beautiful landsca...,0.055577,0.944423,Topic_1,0.94,0,0.7608
2,cost cable car top,0.04,0.839998,0.040001,0.040001,0.040001,Topic_1,0.84,much cable car cost charge really look receipt...,0.222892,0.777108,Topic_1,0.78,1,0.572
3,orgainise easy access,0.050296,0.799664,0.050015,0.050013,0.050013,Topic_1,0.8,small child august use company get see sight h...,0.629173,0.370827,Topic_0,0.63,1,0.6875
4,cheap easy way,0.374993,0.474998,0.050003,0.050003,0.050003,Topic_1,0.47,able experience etna spend money tour research...,0.526225,0.473775,Topic_0,0.53,1,0.5279
5,question cable car child,0.040009,0.839689,0.040012,0.040279,0.040011,Topic_1,0.84,would visit old month old know take child youn...,0.843984,0.156016,Topic_0,0.84,1,0.7488
6,cable car,0.066667,0.733331,0.066667,0.066667,0.066667,Topic_1,0.73,take follow cable car star deduction bus trip ...,0.499221,0.500779,Topic_1,0.5,1,0.6221
7,watch cloud,0.066671,0.396501,0.403333,0.066672,0.066822,Topic_2,0.4,course must visit warn day even private tour v...,0.75746,0.24254,Topic_0,0.76,1,0.6418
