In [1]:
import os

import folium
import googlemaps
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.distance import geodesic
from sklearn import preprocessing

### <a id="data_ac">Data Acquisition</a>

### Web scraping https://luas.ie/luas-red-line-stops/ to get the list of Red Line tram stops

<tr>
<td> <img src="pics/luas_Map.jpg" alt="Map" style="width:350px;"/> </td>
<td> <img src="pics/luas-red-web_scraping.PNG" alt="Web" style="width:350px;"/> </td>
</tr>

In [55]:
# Scrape the names of the Red Line stops from the stop selector on luas.ie.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
response = requests.get('https://luas.ie/luas-red-line-stops/', headers=headers, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page into an empty table
source = response.text
# The page is parsed once; the earlier, unused lxml parse has been removed.
stops_soup = BeautifulSoup(source, "html.parser")

# Position -> stop name. Entries are keyed by their position inside each
# dropdown, so if the page holds several matching dropdowns a later one
# overwrites an earlier one (same outcome as the former row-by-row version).
names_by_position = {}
space = '     '
for item in stops_soup.find_all('ul', {'class': 'dropdown-menu event-dropdown'}):
    for position, sub_item in enumerate(item.find_all('li')):
        # "\r" keeps the progress on a single, continuously overwritten line.
        print('{} Luas stop{}'.format(sub_item.text, 10*space), end="\r", flush=True)
        names_by_position[position] = '{} Luas stop'.format(sub_item.text)

# Build the frame in one go instead of enlarging it one row at a time.
red_stops = pd.DataFrame({'Name': list(names_by_position.values())}, columns=['Name'])

The Point Luas stop                                                           

The same scraping is repeated for the Green Line.

In [155]:
# Scrape the names of the Green Line stops from the stop selector on luas.ie.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
response = requests.get('https://luas.ie/luas-green-line-stops/', headers=headers, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page into an empty table
source = response.text
# The page is parsed once; the earlier, unused lxml parse has been removed.
stops_soup = BeautifulSoup(source, "html.parser")

# Position -> stop name. Entries are keyed by their position inside each
# dropdown, so if the page holds several matching dropdowns a later one
# overwrites an earlier one (same outcome as the former row-by-row version).
names_by_position = {}
space = '     '
for item in stops_soup.find_all('ul', {'class': 'dropdown-menu event-dropdown'}):
    for position, sub_item in enumerate(item.find_all('li')):
        # "\r" keeps the progress on a single, continuously overwritten line.
        print('{} Luas stop{}'.format(sub_item.text, 10*space), end="\r", flush=True)
        names_by_position[position] = '{} Luas stop'.format(sub_item.text)

# Build the frame in one go instead of enlarging it one row at a time.
green_stops = pd.DataFrame({'Name': list(names_by_position.values())}, columns=['Name'])

Brides Glen Luas stop                                                          

In [None]:
# Tag every scraped Red Line stop with its line name.
red_stops['Line'] = 'Red' 

In [128]:
# Tag every scraped Green Line stop with its line name.
green_stops['Line'] = 'Green' 

In [46]:
# Stack the red and green stop tables into a single table.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# gives the combined table unique row labels instead of two runs starting at 0.
luas_stops = pd.concat([red_stops, green_stops], ignore_index=True, sort=False)
print('There are {} stops'.format(luas_stops.shape[0]))

There are 67 stops


### Getting coordinates using Google Maps API

In [48]:
# hidden_cell
# The API key is read from the environment so it never ends up in the notebook
# or its version history. The key that used to be hard-coded in this cell has
# been published and should be revoked and replaced.
google_maps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not google_maps_api_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running this cell')
gmaps = googlemaps.Client(key=google_maps_api_key)

In [68]:
def getCoordinates(names, client=None):
    """Geocode stop names and return every match as one table row.

    Parameters
    ----------
    names : iterable of str
        Names to look up, e.g. ``luas_stops['Name']``.
    client : object, optional
        Anything exposing ``geocode(address)`` that returns a list of result
        dicts with ``formatted_address`` and ``geometry.location.lat/lng``.
        Defaults to the notebook-level ``gmaps`` client.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Address``, ``Latitude``, ``Longitude``; one row per
        geocoding result, so a name can contribute zero, one or several rows.
    """
    if client is None:
        client = gmaps

    padding = 10 * '     '
    rows = []
    for name in names:
        # "\r" keeps the progress on a single, continuously overwritten line.
        print(name + '{}'.format(padding), end="\r", flush=True)
        # Query with the plain string (the earlier version passed the 1-tuple
        # produced by zip(names) to the geocoder).
        for match in client.geocode(name):
            location = match['geometry']['location']
            rows.append((name,
                         match['formatted_address'],
                         location['lat'],
                         location['lng']))

    # Passing the column names to the constructor also works when nothing was
    # found; assigning .columns afterwards raised on an empty result.
    Luas_Coordinates = pd.DataFrame(rows, columns=['Name',
                                                   'Address',
                                                   'Latitude',
                                                   'Longitude'])
    return Luas_Coordinates

In [71]:
# Geocode every scraped stop name; getCoordinates issues one geocoding request per name.
luas_gmap_coordinates = getCoordinates(names = luas_stops['Name'])

Brides Glen Luas stop                                                          

In [76]:
#luas_gmap_coordinates.to_csv('luas_gmap_coordinates_full.csv', index=False)

In [73]:
# Preview the first geocoded rows.
luas_gmap_coordinates.head()

Unnamed: 0,Name,Address,Latitude,Longitude
0,Tallaght Luas stop,"Oldbawn, Dublin, Ireland",53.287367,-6.374592
1,Saggart Luas stop,"Saggart Luas Stop, Fortunestown, Saggart, Co. ...",53.284641,-6.437762
2,Fortunestown Luas stop,"Fortunestown, Tallaght, Dublin, Ireland",53.28421,-6.42461
3,Citywest Campus Luas stop,"Citywest Campus Luas Stop, Cooldown Commons, D...",53.2878,-6.41882
4,Cheeverstown Luas stop,"Cheeverstown Luas Stop, Tallaght, Dublin, Ireland",53.291036,-6.406877


Removing Luas stops outside the project scope boundaries

In [74]:
# Bounding box of the project scope: x = longitude, y = latitude.
xmin, xmax, ymin, ymax = -6.295547, -6.227464, 53.322556, 53.356207

# True for every geocoded stop lying outside the box on at least one side.
outside_scope = (
    (luas_gmap_coordinates.Latitude < ymin)
    | (luas_gmap_coordinates.Latitude > ymax)
    | (luas_gmap_coordinates.Longitude < xmin)
    | (luas_gmap_coordinates.Longitude > xmax)
)

# Keep the remaining stops in a fresh, renumbered frame.
luas_list = luas_gmap_coordinates.loc[~outside_scope].reset_index(drop=True)

total_stops = str(luas_gmap_coordinates.shape[0])
kept_stops = str(luas_list.shape[0])
print('Dataframe has been reduced from {} to {} Luas stops'.format(total_stops, kept_stops))

Dataframe has been reduced from 63 to 23 Luas stops


In [19]:
# Preview the first in-scope stops.
luas_list.head()

Unnamed: 0,Name,Latitude,Longitude,Classification
0,Fatima Luas stop,53.338484,-6.292567,Luas stop
1,James's Luas stop,53.341938,-6.293428,Luas stop
2,Heuston Luas stop,53.346329,-6.29413,Luas stop
3,Museum Luas stop,53.347827,-6.286734,Luas stop
4,Four Courts Luas stop,53.346799,-6.27347,Luas stop


Adding the 'Line' column with the tram line of each stop (Red runs horizontally on the map, Green vertically)

In [138]:
# Default every in-scope stop to the Red line.
# NOTE(review): luas_list is later merged with green_stops, which also carries a
# 'Line' column - confirm that having the column on both sides is intended.
luas_list['Line'] = 'Red'

In [150]:
# (rows, columns) of the in-scope stop table.
luas_list.shape

(23, 4)

In [None]:
# Abandoned draft removed: the loop over green_stops that used to live here had
# a body consisting of the undefined name `luas`, so it raised NameError on a
# fresh "Restart & Run All". Stops are matched to the Green line by the merge
# with green_stops in the following cell, so nothing needs to run here.

In [162]:
# Left-join the line of the green stops onto the in-scope stops by name.
# Any 'Line' column already on luas_list is dropped first; otherwise pandas
# would suffix the clashing columns ('Line_x' / 'Line_y') and no plain 'Line'
# column would exist for the following cells.
luas_list_color = pd.merge(luas_list.drop(columns=['Line'], errors='ignore'),
                           green_stops, how='left', on=['Name'])

In [163]:
# Show the merged table; stops without a match in green_stops have no line value yet.
luas_list_color 

Unnamed: 0,Name,Latitude,Longitude,Classification,Line
0,Fatima Luas stop,53.338484,-6.292567,Luas stop,
1,James's Luas stop,53.341938,-6.293428,Luas stop,
2,Heuston Luas stop,53.346329,-6.29413,Luas stop,
3,Museum Luas stop,53.347827,-6.286734,Luas stop,
4,Four Courts Luas stop,53.346799,-6.27347,Luas stop,
5,Jervis Luas stop,53.347696,-6.265666,Luas stop,
6,Abbey Street Luas stop,53.348645,-6.258185,Luas stop,
7,Busáras Luas stop,53.350085,-6.251451,Luas stop,
8,Busáras Luas stop,53.349921,-6.252033,Luas stop,
9,George's Dock Luas stop,53.349473,-6.247568,Luas stop,


In [200]:
# luas_list_color.to_csv('Features_csv/luas_list_color.csv', index=False)

In [161]:
# Repeats the earlier `green_stops['Line'] = 'Green'` assignment; it leaves the
# frame unchanged when that earlier cell has already run.
green_stops['Line'] = 'Green'

In [172]:
# Stops that did not match a green stop belong to the Red line.
# Only the 'Line' column is filled: a blanket fillna('Red') would also write
# 'Red' into any other column that happens to contain a missing value.
luas_list_color = luas_list_color.fillna({'Line': 'Red'})

In [20]:
# Reload the stop/line table from disk, replacing the in-memory frame built above.
# NOTE(review): the commented-out export above targets
# 'Features_csv/luas_list_color.csv', while this reads 'luas_list_color.csv'
# from the working directory - confirm which path is intended.
luas_list_color = pd.read_csv('luas_list_color.csv')

In [21]:
# Split the stops by line; the line label is compared as text so that missing
# values are handled as the string 'nan' rather than raising.
line_labels = luas_list_color['Line'].astype(str)
on_red_line = line_labels.str.contains("Red")
on_green_line = line_labels.str.contains("Green")
Red_luas = luas_list_color.loc[on_red_line].copy()
Green_luas = luas_list_color.loc[on_green_line].copy()

In [198]:
# Green_luas.to_csv('Features_csv/Green_luas_points_map.csv', index=False)
# Red_luas.to_csv('Features_csv/Red_luas_points_map.csv', index=False)

In [141]:
# The geocoded address is not needed downstream.
# errors='ignore' makes the cell safe to re-run: the in-place drop raised
# KeyError once the column had already been removed.
luas_list = luas_list.drop(columns=['Address'], errors='ignore')

In [81]:
# Constant label marking every row of this table as a Luas stop.
luas_list['Classification'] = 'Luas stop'

In [4]:
# Preview the first two stops.
luas_list.head(2)

Unnamed: 0,Name,Latitude,Longitude,Classification
0,Fatima Luas stop,53.338484,-6.292567,Luas stop
1,James's Luas stop,53.341938,-6.293428,Luas stop


In [47]:
# Load the restaurant list (presumably already restricted to the project scope,
# judging by the file name - TODO confirm).
rest_df = pd.read_csv('Restaurant_List_scope.csv')

In [9]:
# Spot-check two random restaurants (no random_state, so the rows differ between runs).
rest_df.sample(2)

Unnamed: 0,Name,Ranking,Reviews,Rating,Price,Cuisines,Address,Latitude,Longitude,Phone,Link
677,Boojum - Kevin Street,940,66,4.0,Cheap,"['Mexican', 'Fast food', 'Vegetarian Friendly']","Kevin Street Lower Unit 4 College Court, Dubli...",53.337508,-6.266823,+353 1 809 0077,/Restaurant_Review-g186605-d10415683-Reviews-B...
695,Chaplins Bar,978,46,4.5,Cheap,"['Irish', 'Bar', 'Pub']","1/2 Hawkins Street, Dublin 2, Dublin D02 K590 ...",53.3462,-6.257221,+353 1 677 5225,/Restaurant_Review-g186605-d5667749-Reviews-Ch...


In [48]:
# Keep only the coordinates and the link of each restaurant; .copy() so that the
# columns added to rest_luas later do not touch rest_df.
rest_luas = rest_df[['Latitude', 'Longitude', 'Link']].copy()

In [11]:
# Spot-check two random rows (no random_state, so the rows differ between runs).
rest_luas.sample(2)

Unnamed: 0,Latitude,Longitude,Link
723,53.332443,-6.245485,/Restaurant_Review-g186605-d10488014-Reviews-O...
481,53.347951,-6.246127,/Restaurant_Review-g186605-d2100789-Reviews-Br...


In [49]:
#rest_luas.drop(columns=['Companies weight class'], inplace=True)

### <center>[NEXT CHAPTER](./e.Price_sqr_meter.ipynb#data_ac)</center>

#### <center> [Table of Contents](./../0.Table_of_Contents_Code.ipynb) </center>

### <a id="feat_gen">Feature Generation</a>

Distance from each restaurant to the closest Luas stop

In [49]:
# Distance in metres from each restaurant to its nearest in-scope Luas stop.
# A minimum range of MIN_DISTANCE_M metres to a tram stop is enforced in order
# to have a consistent model.
MIN_DISTANCE_M = 100

# Stop coordinates are collected once instead of being looked up per restaurant.
stop_points = list(zip(luas_list['Latitude'], luas_list['Longitude']))
if not stop_points:
    raise ValueError('luas_list is empty: there is no Luas stop to measure distances to')

nearest_stop_m = []
for restaurant_point in zip(rest_luas['Latitude'], rest_luas['Longitude']):
    # Whole metres to the closest stop (geodesic distance, truncated to int).
    closest = min(int(geodesic(restaurant_point, stop_point).m)
                  for stop_point in stop_points)
    nearest_stop_m.append(float(max(closest, MIN_DISTANCE_M)))

# One positional assignment of the whole column: no dummy seed value is needed
# and the result does not depend on the row labels being 0..n-1.
rest_luas['Distances min'] = nearest_stop_m

In [50]:
# Summary statistics of the numeric columns, including 'Distances min'.
rest_luas.describe()

Unnamed: 0,Latitude,Longitude,Distances min
count,1188.0,1188.0,1188.0
mean,53.341764,-6.260744,396.907407
std,0.007607,0.012098,347.031231
min,53.32258,-6.295349,100.0
25%,53.337302,-6.265706,155.0
50%,53.343006,-6.262388,292.5
75%,53.347322,-6.25504,476.5
max,53.356205,-6.227568,1834.0


In [51]:
# Number of restaurants at each distinct distance value.
rest_luas.groupby('Distances min')['Distances min'].count()

Distances min
100.0     151
102.0       2
103.0       4
104.0       1
105.0       4
         ... 
1736.0      1
1751.0      1
1757.0      1
1784.0      1
1834.0      1
Name: Distances min, Length: 529, dtype: int64

In [52]:
def _distance_class(distance_m):
    """Return the proximity label for a distance (in metres) to the nearest stop.

    The labels (including the spelling 'Farest') are kept exactly as they are
    because the map cell below uses them as keys of its colour dictionary.
    """
    if distance_m >= 600:
        return 'Farest'
    if distance_m >= 400:
        return 'Far'
    if distance_m >= 200:
        return 'Close'
    return 'Closest'


# Label the whole column at once; unlike the former row-by-row .loc loop this
# does not assume that the row labels are exactly 0..n-1.
rest_luas['Distances min class'] = rest_luas['Distances min'].map(_distance_class)

In [51]:
# Preview the first two rows with the new distance columns.
rest_luas.head(2)

Unnamed: 0,Latitude,Longitude,Link,Distances min,Distances min class
0,53.34872,-6.258399,/Restaurant_Review-g186605-d10387074-Reviews-T...,16.0,Closest
1,53.339644,-6.263466,/Restaurant_Review-g186605-d13477650-Reviews-G...,161.0,Closest


In [18]:
# Column dtypes of the feature table.
rest_luas.dtypes

Latitude               float64
Longitude              float64
Link                    object
Distances min          float64
Distances min class     object
dtype: object

In [38]:
# Green_luas = pd.read_csv('Features_csv/Green_luas_points_map.csv')
# Red_luas = pd.read_csv('Features_csv/Red_luas_points_map.csv')

In [53]:
# Marker colour for each distance class of a restaurant.
weight = {'Closest':'green', 'Close':'blue','Far':'yellow', 'Farest':'red'}
dub_latitude = '53.341785'
dub_longitude = '-6.265288'

# Dark base map centred on Dublin.
map_dublin = folium.Map(location=[dub_latitude, dub_longitude],
                        tiles='CartoDB dark_matter', zoom_start=13)

# Restaurants: small dots coloured by their distance class.
restaurant_rows = zip(rest_luas['Latitude'],
                      rest_luas['Longitude'],
                      rest_luas['Distances min class'])
for rest_lat, rest_lng, distance_class in restaurant_rows:
    dot_colour = weight[distance_class]
    restaurant_dot = folium.CircleMarker(
        [rest_lat, rest_lng],
        radius=2,
        color=dot_colour,
        fill=True,
        fill_color=dot_colour,
        fill_opacity=0.5,
        parse_html=False)
    restaurant_dot.add_to(map_dublin)

# Options shared by the markers of both tram lines.
stop_style = dict(radius=3, fill=True, fill_opacity=0.7, parse_html=False)

# Red line stops (no popup); their coordinates are collected for the polyline.
red_line = []
for stop_lat, stop_lng in zip(Red_luas['Latitude'], Red_luas['Longitude']):
    red_stop = folium.CircleMarker([stop_lat, stop_lng],
                                   color='red', fill_color='red', **stop_style)
    red_stop.add_to(map_dublin)
    red_line.append([stop_lat, stop_lng])

# Green line stops, each with a popup showing the stop name.
green_line = []
green_rows = zip(Green_luas['Latitude'], Green_luas['Longitude'], Green_luas['Name'])
for stop_lat, stop_lng, stop_name in green_rows:
    name_popup = folium.Popup(stop_name, parse_html=True)
    green_stop = folium.CircleMarker([stop_lat, stop_lng], popup=name_popup,
                                     color='green', fill_color='green', **stop_style)
    green_stop.add_to(map_dublin)
    green_line.append([stop_lat, stop_lng])

# Join the stops of each line, in table order, with a line of the same colour.
folium.PolyLine([red_line], weight=5, color='red').add_to(map_dublin)
folium.PolyLine([green_line], weight=5, color='green').add_to(map_dublin)

map_dublin

In [192]:
# Remove the row labelled 16 from the green-line stops.
# NOTE(review): the reason is not recorded here (presumably a stop that distorts
# the drawn line - confirm). The cell sits after the map cell, so it only affects
# later uses of Green_luas, and it is not idempotent: a second run raises
# KeyError because label 16 is already gone.
Green_luas.drop(index=16, inplace=True)

Before normalizing the weights we take the inverse of the distance: being closer to a Luas stop should mean a higher weight, which keeps this feature consistent with the weights of the rest of the feature set.

In [2]:
# rest_luas = pd.read_csv('Luas_rest_weights_NOR.csv')

In [40]:
# Spot-check two random rows (no random_state, so the rows differ between runs).
rest_luas.sample(2)

Unnamed: 0,Latitude,Longitude,Link,Distances min,Distances min class
441,53.342669,-6.262463,/Restaurant_Review-g186605-d1771808-Reviews-Da...,302.0,Close
810,53.343471,-6.290487,/Restaurant_Review-g186605-d11826597-Reviews-J...,259.0,Close


In [54]:
# Inverse distance: the closer the nearest stop, the larger the value.
# 'Distances min' is floored at 100 when it is computed above, so the divisor
# is never zero and the inverse never exceeds 0.01.
rest_luas['Distances inv'] = 1/rest_luas['Distances min']

In [55]:
# Spot-check two random rows, now including 'Distances inv'.
rest_luas.sample(2)

Unnamed: 0,Latitude,Longitude,Link,Distances min,Distances min class,Distances inv
414,53.345308,-6.267112,/Restaurant_Review-g186605-d6826441-Reviews-Th...,282.0,Close,0.003546
797,53.334058,-6.245211,/Restaurant_Review-g186605-d2716584-Reviews-Th...,972.0,Farest,0.001029


Normalizing weights

In [56]:
# Scale the inverse distances to unit variance without centring them
# (with_mean=False), so every weight stays positive.
inverse_distances = rest_luas[['Distances inv']].values.astype(float)
scaler = preprocessing.StandardScaler(with_mean=False, with_std=True)

# Assign the scaled values straight into the frame. The former index-based merge
# added suffixed duplicate columns when the cell was re-run and would drop rows
# whenever rest_luas did not carry a 0..n-1 index.
rest_luas['Distances inv NOR'] = scaler.fit_transform(inverse_distances).ravel()

In [57]:
# Summary statistics, now including the inverse and normalised weights.
rest_luas.describe()

Unnamed: 0,Latitude,Longitude,Distances min,Distances inv,Distances inv NOR
count,1188.0,1188.0,1188.0,1188.0,1188.0
mean,53.341764,-6.260744,396.907407,0.004412,1.477995
std,0.007607,0.012098,347.031231,0.002986,1.000421
min,53.32258,-6.295349,100.0,0.000545,0.182656
25%,53.337302,-6.265706,155.0,0.002099,0.703028
50%,53.343006,-6.262388,292.5,0.003419,1.145274
75%,53.347322,-6.25504,476.5,0.006452,2.161236
max,53.356205,-6.227568,1834.0,0.01,3.349916


In [58]:
# Preview the first rows of the finished feature table.
rest_luas.head()

Unnamed: 0,Latitude,Longitude,Link,Distances min,Distances min class,Distances inv,Distances inv NOR
0,53.34872,-6.258399,/Restaurant_Review-g186605-d10387074-Reviews-T...,100.0,Closest,0.01,3.349916
1,53.339644,-6.263466,/Restaurant_Review-g186605-d13477650-Reviews-G...,161.0,Closest,0.006211,2.080693
2,53.343513,-6.27106,/Restaurant_Review-g186605-d6403998-Reviews-Da...,399.0,Close,0.002506,0.839578
3,53.322659,-6.236801,/Restaurant_Review-g186605-d2239110-Reviews-Mu...,1348.0,Farest,0.000742,0.24851
4,53.337441,-6.265903,/Restaurant_Review-g186605-d15590976-Reviews-T...,359.0,Close,0.002786,0.933124


In [209]:
# Round the distances to whole metres and the normalised weights to 4 decimals.
# The key must match the real column name: DataFrame.round silently ignores keys
# that are not columns, so the former 'Distances NOR' key rounded nothing.
rest_luas = rest_luas.round({'Distances min': 0, 'Distances inv NOR': 4})

In [60]:
# Export the feature table without the row index.
# The 'Features_csv' directory must already exist; to_csv does not create it.
rest_luas.to_csv('Features_csv/Luas_weights_NOR.csv', index=False)

### <center>[NEXT CHAPTER](./b.Liffey.ipynb#feat_gen)</center>

#### <center> [Table of Contents](./../0.Table_of_Contents_Code.ipynb) </center>