In [54]:
import numpy as np
import pandas as pd
import re
import ast

# Importing raw csv file

In [55]:
# Load the raw CSV file into the working DataFrame used by every later cell.
trip = pd.read_csv(filepath_or_buffer="trip_a.csv")

In [56]:
# Peek at five randomly chosen rows of the raw data.
trip.sample(n=5)

Unnamed: 0,avg,cuisine,name,price,rank,reviews
9984,3.0,Pizza,Little Caesars,,"#8,453 of 10,387 Restaurants in New York City",
9889,3.0,Pizza,Pizza Hut,$,"#8,398 of 10,387 Restaurants in New York City",72.0
5276,4.5,Healthy,by CHLOE. Williamsburg,,"#772 of 5,330 Restaurants in Brooklyn",14.0
9003,3.5,"Filipino,Asian",Cafe 81,,"#7,689 of 10,387 Restaurants in New York City",5.0
7940,4.0,,Cibao Restaurant,,"#6,823 of 10,387 Restaurants in New York City",4.0


#### Separating the elements of cuisine and keeping only the first one

In [57]:
# Keep only the first cuisine listed for each restaurant
# (e.g. "Filipino,Asian" -> "Filipino").
# `str.split(..., expand=True)` returns a DataFrame with one column per piece,
# which cannot be reliably assigned to a single column (recent pandas versions
# raise a ValueError). Splitting into lists and taking element 0 yields a
# Series instead; missing cuisines stay NaN.
trip['cuisine'] = trip['cuisine'].str.split(",", n=1).str[0]

#### Counting elements

In [58]:
# Number of rows that have a (non-missing) restaurant name.
trip.loc[:, 'name'].count()

12446

In [59]:
# Dropping NAs

In [60]:
# Display the rows that contain no missing values.
# NOTE(review): the result is only displayed, never assigned, so `trip` itself
# still contains the rows with missing values afterwards -- confirm whether
# the rows were really meant to be dropped.
trip.dropna()

Unnamed: 0,avg,cuisine,name,price,rank,reviews
0,4.5,Italian,Piccola Cucina Osteria,$$ - $$$,"#1 of 10,384 Restaurants in New York City",2012
1,4.5,American,Club A Steakhouse,$$$$,"#2 of 10,384 Restaurants in New York City",3501
2,5.0,Pizza,SottoCasa Pizzeria,$$ - $$$,"#3 of 10,384 Restaurants in New York City",327
3,5.0,French,Boucherie Union Square,$$$$,"#4 of 10,384 Restaurants in New York City",641
4,4.5,Italian,Piccola Cucina,$$ - $$$,"#5 of 10,384 Restaurants in New York City",704
5,4.5,French,Boucherie West Village,$$ - $$$,"#6 of 10,384 Restaurants in New York City",709
6,4.5,Mediterranean,Loi Estiatorio,$$ - $$$,"#7 of 10,384 Restaurants in New York City",340
7,4.5,Italian,Song E Napule,$$ - $$$,"#8 of 10,384 Restaurants in New York City",259
8,4.5,French,Petite Boucherie,$$ - $$$,"#9 of 10,384 Restaurants in New York City",421
9,4.5,Mexican,Los Tacos No. 1,$,"#10 of 10,384 Restaurants in New York City",214


In [61]:
# Show the first five rows of the working frame.
trip.head(n=5)

Unnamed: 0,avg,cuisine,name,price,rank,reviews
0,4.5,Italian,Piccola Cucina Osteria,$$ - $$$,"#1 of 10,384 Restaurants in New York City",2012
1,4.5,American,Club A Steakhouse,$$$$,"#2 of 10,384 Restaurants in New York City",3501
2,5.0,Pizza,SottoCasa Pizzeria,$$ - $$$,"#3 of 10,384 Restaurants in New York City",327
3,5.0,French,Boucherie Union Square,$$$$,"#4 of 10,384 Restaurants in New York City",641
4,4.5,Italian,Piccola Cucina,$$ - $$$,"#5 of 10,384 Restaurants in New York City",704


#### converting prices to scale 1, 2, 3 where 1 is cheap, 2 is medium and 3 is expensive, and cleaning extra characters

In [62]:
# Translate the price bands into an ordinal scale
# (1 = cheap, 2 = medium, 3 = expensive), then force the column to numeric;
# anything that is not one of the three known bands becomes NaN.
trip['price'] = pd.to_numeric(
    trip['price'].replace({"$$$$": 3, "$$ - $$$": 2, "$": 1}),
    errors="coerce",
)

In [63]:
# Checking data type

In [64]:
# Inspect the dtype of the converted price column.
trip['price'].dtype

dtype('float64')

In [65]:
# Check the first five rows after the price conversion.
trip.head(n=5)

Unnamed: 0,avg,cuisine,name,price,rank,reviews
0,4.5,Italian,Piccola Cucina Osteria,2.0,"#1 of 10,384 Restaurants in New York City",2012
1,4.5,American,Club A Steakhouse,3.0,"#2 of 10,384 Restaurants in New York City",3501
2,5.0,Pizza,SottoCasa Pizzeria,2.0,"#3 of 10,384 Restaurants in New York City",327
3,5.0,French,Boucherie Union Square,3.0,"#4 of 10,384 Restaurants in New York City",641
4,4.5,Italian,Piccola Cucina,2.0,"#5 of 10,384 Restaurants in New York City",704


#### Cleaning up restaurant's rank

In [66]:
# Remove the "#" marker from the rank text ("#1 of 10,384 ..." -> "1 of 10,384 ...").
# A plain string pattern with regex=False is used instead of passing a set
# literal as `regex`, which is not one of the documented argument types.
trip['rank'] = trip['rank'].str.replace("#", "", regex=False)
trip.head()

Unnamed: 0,avg,cuisine,name,price,rank,reviews
0,4.5,Italian,Piccola Cucina Osteria,2.0,"1 of 10,384 Restaurants in New York City",2012
1,4.5,American,Club A Steakhouse,3.0,"2 of 10,384 Restaurants in New York City",3501
2,5.0,Pizza,SottoCasa Pizzeria,2.0,"3 of 10,384 Restaurants in New York City",327
3,5.0,French,Boucherie Union Square,3.0,"4 of 10,384 Restaurants in New York City",641
4,4.5,Italian,Piccola Cucina,2.0,"5 of 10,384 Restaurants in New York City",704


#### Extracting neighborhoods from the rank text by keeping only the place name that follows the word "in"

In [67]:
# Derive the neighborhood from the rank text, which has the form
# "<rank> of <total> Restaurants in <place>": capture everything after the
# first standalone " in " and strip surrounding whitespace so the values are
# clean labels ("Brooklyn", not " Brooklyn").
# Rows whose rank text is missing or has no " in <place>" part become NaN.
trip['neighborhood'] = (
    trip['rank']
    .str.extract(r'\sin\s+(.+)$', expand=False)
    .str.strip()
)

In [68]:
# Spot-check five random rows to verify the new neighborhood column.
trip.sample(n=5)

Unnamed: 0,avg,cuisine,name,price,rank,reviews,neighborhood
3778,4.0,Mexican,Javelina,2.0,"3,378 of 10,387 Restaurants in New York City",56.0,New York City
6101,4.0,American,The Cannery,2.0,"5,290 of 10,387 Restaurants in New York City",9.0,New York City
4361,4.5,Italian,Raviolo,2.0,"3,872 of 10,387 Restaurants in New York City",12.0,New York City
7331,4.5,American,Fields Good Chicken,2.0,"6,321 of 10,387 Restaurants in New York City",4.0,New York City
11676,2.0,,433 Graham Avenue Deli,,"3,375 of 5,330 Restaurants in Brooklyn",,Brooklyn


#### "New York City" in this case is understood as Manhattan, so here I'm replacing "New York City" with "Manhattan"

In [69]:
# Relabel "New York City" as "Manhattan".
# Literal (non-regex) substring replacement; a plain string is used instead of
# passing a set literal as `regex`, which is not a documented argument type.
trip['neighborhood'] = trip['neighborhood'].str.replace(
    'New York City', 'Manhattan', regex=False
)

In [70]:
# Spot-check five random rows after the Manhattan relabelling.
trip.sample(n=5)

Unnamed: 0,avg,cuisine,name,price,rank,reviews,neighborhood
1337,4.0,Mexican,Empellon Taqueria,2.0,"1,252 of 10,387 Restaurants in New York City",190.0,Manhattan
12406,4.5,Cafe,Qathra Cafe,2.0,"850 of 5,330 Restaurants in Brooklyn",13.0,Brooklyn
5570,4.0,,Nature Works Restaurant,,"4,866 of 10,387 Restaurants in New York City",14.0,Manhattan
8483,5.0,American,Sonnyboy,,"7,262 of 10,387 Restaurants in New York City",,Manhattan
4519,4.0,American,Nelly Spillane's Bar and Restaurant,2.0,"4,000 of 10,387 Restaurants in New York City",50.0,Manhattan


#### Now it's possible to clean up the rank column to keep only each restaurant's rank by using str.split

In [71]:
# Keep only the leading rank token ("3,378 of 10,387 Restaurants in ..." -> "3,378").
# `str.split(..., expand=True)` returns a two-column DataFrame, which cannot be
# reliably assigned to a single column (recent pandas versions raise a
# ValueError); take element 0 of the split list instead. Missing ranks stay NaN.
# NOTE(review): the rank is kept as text, including thousands separators such
# as "3,378" -- confirm whether a numeric rank is wanted downstream.
trip['rank'] = trip['rank'].str.split(" ", n=1).str[0]

In [72]:
# Re-organizing columns

In [73]:
# Select and reorder the columns kept for analysis.
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning.
trip = trip[['name', 'cuisine', 'price', 'rank', 'neighborhood', 'reviews', 'avg']].copy()
trip.head()

Unnamed: 0,name,cuisine,price,rank,neighborhood,reviews,avg
0,Piccola Cucina Osteria,Italian,2.0,1,Manhattan,2012,4.5
1,Club A Steakhouse,American,3.0,2,Manhattan,3501,4.5
2,SottoCasa Pizzeria,Pizza,2.0,3,Manhattan,327,5.0
3,Boucherie Union Square,French,3.0,4,Manhattan,641,5.0
4,Piccola Cucina,Italian,2.0,5,Manhattan,704,4.5


#### Erasing commas and converting reviews to a numeric dtype (float64, because the column contains missing values)

In [74]:
# Remove thousands separators from the review counts and convert the column
# to a numeric dtype. The pattern is passed as a plain string rather than a
# set literal, which is not a documented type for `regex`.
# `pd.to_numeric` is called without `errors=`, so any value that still cannot
# be parsed raises instead of being silently turned into NaN.
trip['reviews'] = pd.to_numeric(trip['reviews'].replace(regex=",", value=""))
trip.head()

Unnamed: 0,name,cuisine,price,rank,neighborhood,reviews,avg
0,Piccola Cucina Osteria,Italian,2.0,1,Manhattan,2012.0,4.5
1,Club A Steakhouse,American,3.0,2,Manhattan,3501.0,4.5
2,SottoCasa Pizzeria,Pizza,2.0,3,Manhattan,327.0,5.0
3,Boucherie Union Square,French,3.0,4,Manhattan,641.0,5.0
4,Piccola Cucina,Italian,2.0,5,Manhattan,704.0,4.5


In [75]:
# Confirming Dtype

In [76]:
# Read-only check of the reviews dtype. The numeric conversion is already done
# in the cleaning cell, so this cell does not reassign the column.
trip['reviews'].dtype

dtype('float64')

#### Last glimpse to make sure the data is clean and ready to be exported

In [77]:
# Final spot-check of five random rows before exporting.
trip.sample(n=5)

Unnamed: 0,name,cuisine,price,rank,neighborhood,reviews,avg
6948,Mezcla,Latin,,5991,Manhattan,9.0,4.0
10371,Subway,,,407,Jersey City,5.0,3.0
3251,CK14 - The Crooked Knife at 14th Street,American,2.0,2937,Manhattan,47.0,4.0
827,New Wonjo Restaurant,Vegetarian Friendly,2.0,776,Manhattan,427.0,4.0
1886,Peking Duck House,Chinese,2.0,1744,Manhattan,244.0,4.0


# Data set elements' first description

In [78]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns: price, reviews and avg.
trip.describe()

Unnamed: 0,price,reviews,avg
count,7617.0,11079.0,12446.0
mean,1.821583,112.648344,4.132733
std,0.540693,454.128082,0.579253
min,1.0,2.0,1.0
25%,1.0,6.0,4.0
50%,2.0,21.0,4.0
75%,2.0,76.0,4.5
max,3.0,19916.0,5.0


# Exporting for further analysis

In [79]:
# Write the cleaned frame to disk for the next analysis step.
# NOTE(review): no `index=` argument is given, so the row index is written as
# an extra leading column -- confirm downstream readers expect that.
trip.to_csv('tripA_wrangled.csv')