In [2]:
import pandas as pd

# <a id="clean"> Cleaning Accommodation Venues</a>

In [3]:
dub = pd.read_csv('Features_csv/Gmap_Dublin_clean.csv')
print ('Dataframe has {} rows.'.format (str(dub.shape[0])))

Dataframe has 3183 rows.


### Cleaning Dataframe

#### Removing duplicates rows 

In [5]:
dub_t = dub[dub.duplicated(['Venue Latitude', 'Venue Longitude', 'Venue id'])]
print ("number of duplicate rows: {}".format (dub_t.shape))
dub_t_clean = dub.drop_duplicates(['Venue Latitude', 'Venue Longitude', 'Venue id'], keep ='first')
print ('Venues: {}'.format (dub_t_clean.shape[0]))

number of duplicate rows: (2390, 11)
Venues: 793


In [27]:
dub_hotels = dub_t_clean[dub_t_clean['Venue Category'].astype(str).str.contains('lodging')].copy()
dub_hotels.reset_index(drop=True, inplace=True )
print ('Accommodation places: {}'.format (dub_hotels.shape[0]))

Accommodation places: 203


#### Checking if there are null values

In [28]:
print('There are {} null values on the dataframe'.format(str(dub_hotels.isna().sum().sum())))

There are 0 null values on the dataframe


Removing columns in order to use it with others datasets / Renaming columns like rest of datasets

In [119]:
dub_hotels_clean = dub_hotels.drop(columns=['Neighborhood', 'Neighborhood Latitude', 
                                            'Neighborhood Longitude', 'Venue id', 
                                            'Venue Category', 'Venue price level',
                                           'Venue Rating', 'Total user ratings'])
dub_hotels_clean.rename(columns={'Venue Latitude':'Latitude', 'Venue Longitude':'Longitude', 'Venue':'Name'}, inplace=True)
dub_hotels_clean.reset_index(drop=True, inplace=True )
dub_hotels_clean['Classification'] = 'Accommodation'

In [121]:
# dub_hotels_clean.to_csv('Features_csv/dublin_hotels.csv', index=False)

# Cleaning Turistic Venues

In [96]:
dub_tourist1 = pd.read_csv('Foursquare_Dublin_venues_tourist.csv')
print ('Dataframe has {} rows.'.format (str(dub_tourist.shape[0])))

Dataframe has 139 rows.


In [45]:
dub_tourist.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Dublin 6,53.327363,-6.265288,Blackbird,53.326871,-6.264804,Pub
1,Dublin 6,53.327363,-6.250158,Clayton Hotel - Burlington Road,53.330223,-6.24898,Hotel
2,Dublin 4,53.327363,-6.235029,Herbert Park,53.327156,-6.234803,Park
3,Dublin 4,53.327363,-6.227464,InterContinental Dublin,53.326608,-6.226079,Hotel
4,Dublin 4,53.327363,-6.227464,Royal Dublin Society (RDS),53.32758,-6.229068,Convention Center


In [97]:
dub_tourist_duplicates = dub_tourist1[dub_tourist1.duplicated(['Venue Latitude', 'Venue Longitude', 'Venue'])]
print ("number of duplicate rows: {}".format (dub_tourist_duplicates.shape))
dub_tourist = dub_tourist1.drop_duplicates(['Venue Latitude', 'Venue Longitude', 'Venue'], keep ='first')
print ('Venues: {}'.format (dub_tourist.shape[0]))

number of duplicate rows: (139, 7)
Venues: 275


In [98]:
dub_tourist_groups = dub_tourist.groupby('Venue Category').count()
dub_tourist_groups.sort_values(by='Venue', ascending=False).head()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Pub,57,57,57,57,57,57
Hotel,23,23,23,23,23,23
Coffee Shop,11,11,11,11,11,11
Café,11,11,11,11,11,11
Restaurant,9,9,9,9,9,9


Has many subcategories, so the best way is filtering manually setting each one

In [59]:
dub_tourist_clean = None

In [122]:
# Subcategories to keep:
subcategories = ['Park', 'Plaza', 'Train Station', 'Soccer Stadium', 'Theater',  
                'Distillery', 'Train Station', 'Post Office', 'Multiplex', 
                'Monument / Landmark', 'Library', 'Indie Movie Theater', 'Garden', 'Event Space', 
                'Department Store', 'Convention Center', 'Clothing Store', 'Church', 'Bus Station',
                'Brewery', 'Bookstore']

dub_tourist_clean = pd.DataFrame(columns=dub_tourist.columns)
#dub_tourist_clean.columns = dub_tourist.columns

for sub in subcategories:
    #print(sub)
    dub_tourist_clean = dub_tourist_clean.append(dub_tourist[dub_tourist['Venue Category']
                                    .astype(str).str.contains(sub)], ignore_index = True)
    
print ('The tourism related dataframe has {} point of interest'.format (str(dub_tourist_clean.shape[0])))

The tourism related dataframe has 63 point of interest


In [123]:
dub_tourist_clean.drop(columns=['Neighborhood', 'Neighborhood Latitude', 
                                'Neighborhood Longitude', 'Venue Category'], inplace=True)
dub_tourist_clean.rename(columns={'Venue Latitude':'Latitude', 'Venue Longitude':'Longitude', 'Venue':'Name'}, inplace=True)
dub_tourist_clean.reset_index(drop=True, inplace=True )
dub_tourist_clean['Classification'] = 'Tourism'
dub_tourist_clean.head()

Unnamed: 0,Name,Latitude,Longitude,Classification
0,Herbert Park,53.327156,-6.234803,Tourism
1,St Patrick's Park,53.34011,-6.271894,Tourism
2,Iveagh Gardens,53.33568,-6.261059,Tourism
3,St Stephen's Green,53.338151,-6.25916,Tourism
4,Merrion Square Park,53.340138,-6.250451,Tourism


In [124]:
# dub_tourist_clean.to_csv('Features_csv/dublin_tourism.csv', index=False)

# Cleaning University Venues

In [125]:
dub_uni1 = pd.read_csv('dublin_venues_university.csv')
print ('Dataframe has {} rows.'.format (str(dub_uni1.shape[0])))

Dataframe has 26 rows.


In [116]:
dub_uni_duplicates = dub_uni1[dub_uni1.duplicated(['Venue Latitude', 'Venue Longitude', 'Venue'])]
print ("number of duplicate rows: {}".format (dub_uni_duplicates.shape))
dub_uni_clean = dub_uni1.drop_duplicates(['Venue Latitude', 'Venue Longitude', 'Venue'], keep ='first').copy()
print ('Venues: {}'.format (dub_uni_clean.shape[0]))

number of duplicate rows: (9, 7)
Venues: 17


In [117]:
dub_uni_clean.drop(columns=['Neighborhood', 'Neighborhood Latitude', 
                                'Neighborhood Longitude', 'Venue Category'], inplace=True)
dub_uni_clean.rename(columns={'Venue Latitude':'Latitude', 'Venue Longitude':'Longitude', 'Venue':'Name'}, inplace=True)
dub_uni_clean.reset_index(drop=True, inplace=True )
dub_uni_clean['Classification'] = 'University'
dub_uni_clean.head()

Unnamed: 0,Name,Latitude,Longitude,Classification
0,Arena School of English,53.334769,-6.263229,University
1,Plantronics WWSM,53.335211,-6.228241,University
2,Presentation Room,53.338474,-6.266765,University
3,Tutorial Room 8,53.338888,-6.26259,University
4,Newman House,53.336785,-6.260219,University


In [118]:
# dub_uni_clean.to_csv('Features_csv/dublin_university.csv', index=False)

### <center>[NEXT CHAPTER](./0.Interactive_Map.ipynb#map)</center>

#### <center> [Table of Contents](./../0.Table_of_Contents_Code.ipynb) </center>