In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import sqlite3
import dateutil.parser as parser
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
#Create Database Connection
DBName = '../Data/yelp_project.db'
conn=sqlite3.connect(DBName)
cursor = conn.cursor()

In [3]:
#Read Business Data
query = "SELECT b.business_id, r.stars, attributes, date, useful, cool, funny, postal_code FROM review r JOIN \
business b USING (business_id) WHERE CAST(postal_code AS int)>=15000 AND CAST(postal_code AS int)<15500"
business_data = pd.read_sql_query(query,conn)
business_data

Unnamed: 0,business_id,stars,attributes,date,useful,cool,funny,postal_code
0,cnGIivYRLxpF7tBVR_JwWA,1,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-02-01,0,0,0,15071
1,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-07-04,0,0,0,15071
2,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-07-28,0,0,0,15071
3,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-08-06,0,0,0,15071
4,P3LisOj7DktgGa7C5FYpnA,1,['BusinessAcceptsCreditCards: True'],2015-09-19,4,0,0,15237
5,P3LisOj7DktgGa7C5FYpnA,2,['BusinessAcceptsCreditCards: True'],2014-01-29,4,1,0,15237
6,P3LisOj7DktgGa7C5FYpnA,3,['BusinessAcceptsCreditCards: True'],2016-12-05,0,0,0,15237
7,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2011-11-19,2,0,0,15237
8,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2013-10-05,0,0,0,15237
9,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2014-01-12,2,0,0,15237


In [4]:
#Split the attributes one per row
attributes_splitted = pd.concat([pd.Series(row['business_id'],row['attributes'].split(',')) for _, row in business_data.dropna().iterrows()]).reset_index()
attributes_splitted.columns = ['Attributes', 'business_id']

In [5]:
attributes_splitted

Unnamed: 0,Attributes,business_id
0,['AcceptsInsurance: False',cnGIivYRLxpF7tBVR_JwWA
1,'BusinessAcceptsCreditCards: True',cnGIivYRLxpF7tBVR_JwWA
2,"""BusinessParking: {'garage': False",cnGIivYRLxpF7tBVR_JwWA
3,'street': False,cnGIivYRLxpF7tBVR_JwWA
4,'validated': False,cnGIivYRLxpF7tBVR_JwWA
5,'lot': True,cnGIivYRLxpF7tBVR_JwWA
6,"'valet': False}""",cnGIivYRLxpF7tBVR_JwWA
7,'ByAppointmentOnly: True',cnGIivYRLxpF7tBVR_JwWA
8,"""HairSpecializesIn: {'coloring': False",cnGIivYRLxpF7tBVR_JwWA
9,'africanamerican': False,cnGIivYRLxpF7tBVR_JwWA


In [6]:
#Get Restaurants with price
restprice = attributes_splitted[attributes_splitted.Attributes.str.contains("restaurantspricerange", case=False)].reset_index()
restprice

Unnamed: 0,index,Attributes,business_id
0,16,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA
1,33,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA
2,50,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA
3,67,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA
4,124,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w
5,179,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w
6,234,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w
7,289,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w
8,344,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w
9,399,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w


In [98]:
#Extract price range and remove non-numerical characters
restprice['Price'] = [ pd.to_numeric(re.sub(r"[\[\]' ]", "", r.split(":")[1])) for r in restprice.Attributes]
restprice

Unnamed: 0,index,Attributes,business_id,Price
0,16,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA,3
1,33,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA,3
2,50,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA,3
3,67,'RestaurantsPriceRange2: 3'],cnGIivYRLxpF7tBVR_JwWA,3
4,124,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2
5,179,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2
6,234,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2
7,289,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2
8,344,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2
9,399,'RestaurantsPriceRange2: 2',93otbGHE0s0m-lU1osvg9w,2


In [99]:
business_price = restprice[['business_id', 'Price']]
business_price

Unnamed: 0,business_id,Price
0,cnGIivYRLxpF7tBVR_JwWA,3
1,cnGIivYRLxpF7tBVR_JwWA,3
2,cnGIivYRLxpF7tBVR_JwWA,3
3,cnGIivYRLxpF7tBVR_JwWA,3
4,93otbGHE0s0m-lU1osvg9w,2
5,93otbGHE0s0m-lU1osvg9w,2
6,93otbGHE0s0m-lU1osvg9w,2
7,93otbGHE0s0m-lU1osvg9w,2
8,93otbGHE0s0m-lU1osvg9w,2
9,93otbGHE0s0m-lU1osvg9w,2


In [116]:
#Left Join of both df
business_data_Price = business_data.join(business_price, how = 'left', rsuffix='_bp')

In [121]:
years=[pd.to_datetime(row['date']).year for _, row in business_data_Price.iterrows()]

In [122]:
len(years)

176505

In [125]:
business_data_Price['Years'] = years
business_data_Price

Unnamed: 0,business_id,stars,attributes,date,useful,cool,funny,postal_code,business_id_bp,Price,Years
0,cnGIivYRLxpF7tBVR_JwWA,1,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-02-01,0,0,0,15071,cnGIivYRLxpF7tBVR_JwWA,3.0,2016
1,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-07-04,0,0,0,15071,cnGIivYRLxpF7tBVR_JwWA,3.0,2016
2,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-07-28,0,0,0,15071,cnGIivYRLxpF7tBVR_JwWA,3.0,2016
3,cnGIivYRLxpF7tBVR_JwWA,5,"['AcceptsInsurance: False', 'BusinessAcceptsCr...",2016-08-06,0,0,0,15071,cnGIivYRLxpF7tBVR_JwWA,3.0,2016
4,P3LisOj7DktgGa7C5FYpnA,1,['BusinessAcceptsCreditCards: True'],2015-09-19,4,0,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2015
5,P3LisOj7DktgGa7C5FYpnA,2,['BusinessAcceptsCreditCards: True'],2014-01-29,4,1,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2014
6,P3LisOj7DktgGa7C5FYpnA,3,['BusinessAcceptsCreditCards: True'],2016-12-05,0,0,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2016
7,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2011-11-19,2,0,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2011
8,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2013-10-05,0,0,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2013
9,P3LisOj7DktgGa7C5FYpnA,5,['BusinessAcceptsCreditCards: True'],2014-01-12,2,0,0,15237,93otbGHE0s0m-lU1osvg9w,2.0,2014


In [134]:
zip_matrix = business_data_Price[['postal_code', 'Years', 'stars', 'useful', 'funny', 'cool', 'Price']]
stars = zip_matrix.stars
zip_matrix = pd.get_dummies(zip_matrix, columns = ["stars"])
zip_matrix['stars'] = stars
zip_matrix

Unnamed: 0,postal_code,Years,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars
0,15071,2016,0,0,0,3.0,1.0,0.0,0.0,0.0,0.0,1
1,15071,2016,0,0,0,3.0,0.0,0.0,0.0,0.0,1.0,5
2,15071,2016,0,0,0,3.0,0.0,0.0,0.0,0.0,1.0,5
3,15071,2016,0,0,0,3.0,0.0,0.0,0.0,0.0,1.0,5
4,15237,2015,4,0,0,2.0,1.0,0.0,0.0,0.0,0.0,1
5,15237,2014,4,0,1,2.0,0.0,1.0,0.0,0.0,0.0,2
6,15237,2016,0,0,0,2.0,0.0,0.0,1.0,0.0,0.0,3
7,15237,2011,2,0,0,2.0,0.0,0.0,0.0,0.0,1.0,5
8,15237,2013,0,0,0,2.0,0.0,0.0,0.0,0.0,1.0,5
9,15237,2014,2,0,0,2.0,0.0,0.0,0.0,0.0,1.0,5


In [138]:
zip_aggregated = zip_matrix.groupby(['postal_code', 'Years']).agg(pd.Series.mean)
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000
15003,2010,1.625000,0.375000,0.375000,2.500000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000
15003,2011,1.200000,0.200000,0.466667,2.500000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667
15003,2012,1.800000,0.450000,0.800000,2.066667,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000
15003,2013,0.840000,0.560000,0.640000,2.000000,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000
15003,2014,0.764706,0.176471,0.205882,2.217391,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765
15003,2015,0.901639,0.327869,0.475410,2.311111,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295
15003,2016,0.319444,0.083333,0.194444,1.949153,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000


pandas.core.frame.DataFrame

In [143]:
#business_price_zip = business_price.join(business_data, how = "inner", rsuffix="_bp")
business_price_zip = business_data_Price[['business_id','Price', 'postal_code', 'Years']].drop_duplicates()
price_zip = business_price_zip[['postal_code', 'Price', 'Years']]
price_zip

Unnamed: 0,postal_code,Price,Years
0,15071,3.0,2016
4,15237,2.0,2015
5,15237,2.0,2014
6,15237,2.0,2016
7,15237,2.0,2011
8,15237,2.0,2013
12,15212,2.0,2013
13,15212,2.0,2014
16,15212,2.0,2015
21,15212,2.0,2016


In [144]:
average_price_zip = price_zip.groupby(['postal_code','Years']).agg(pd.Series.mean)

In [145]:
average_price_zip

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
postal_code,Years,Unnamed: 2_level_1
15003,2008,3.000000
15003,2009,2.000000
15003,2010,2.400000
15003,2011,2.400000
15003,2012,2.111111
15003,2013,1.888889
15003,2014,2.071429
15003,2015,2.076923
15003,2016,2.117647
15003,2017,2.666667


In [146]:
zip_aggregated['Price'] = average_price_zip

In [147]:
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000


In [166]:
average_reviews_zip = business_data_Price.groupby(['postal_code','Years']).agg(pd.Series.count)
average_reviews_zip = average_reviews_zip['business_id']

Unnamed: 0_level_0,Unnamed: 1_level_0,business_id,stars,attributes,date,useful,cool,funny,business_id_bp,Price
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15003,2008,2,2,2,2,2,2,2,1,1.0
15003,2009,2,2,2,2,2,2,2,1,1.0
15003,2010,8,8,8,8,8,8,8,6,6.0
15003,2011,15,15,15,15,15,15,15,10,10.0
15003,2012,20,20,18,20,20,20,20,15,15.0
15003,2013,25,25,23,25,25,25,25,15,15.0
15003,2014,34,34,30,34,34,34,34,23,23.0
15003,2015,61,61,56,61,61,61,61,45,45.0
15003,2016,72,72,71,72,72,72,72,59,59.0
15003,2017,4,4,4,4,4,4,4,3,3.0


In [200]:
review_byzip = business_data_Price.groupby(['postal_code', 'Years']).count()

In [199]:
total_reviews = business_data_Price.groupby(['Years']).count().business_id
total_reviews

31

In [212]:
review_byzip = review_byzip.reset_index()[['postal_code','Years','business_id']]
review_byzip

Unnamed: 0,postal_code,Years,business_id
0,15003,2008,2
1,15003,2009,2
2,15003,2010,8
3,15003,2011,15
4,15003,2012,20
5,15003,2013,25
6,15003,2014,34
7,15003,2015,61
8,15003,2016,72
9,15003,2017,4


In [217]:
prop_reviews = [ float(review_byzip.ix[i]['business_id'])/total_reviews[review_byzip.ix[i]['Years']] for i in range(len(review_byzip))]

In [219]:
zip_aggregated['reviews_perc'] = prop_reviews

In [220]:
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars,reviews_perc
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.001061
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.000553
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000,0.001275
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667,0.001285
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000,0.001196
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000,0.001188
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765,0.001180
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295,0.001568
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111,0.001638
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000,0.001601


In [249]:
flatten_zip = zip_aggregated.reset_index()
pd.to_numeric(flatten_zip[(flatten_zip['postal_code']=='15232')&(flatten_zip['Years']==2007)].stars)

791    3.923077
Name: stars, dtype: float64

In [253]:
avg_review_prev1 = [flatten_zip[(flatten_zip['postal_code']==flatten_zip.ix[i]['postal_code'])&(flatten_zip['Years']==(flatten_zip.ix[i]['Years']-1))]['stars'].values for i in range(len(flatten_zip))]

In [260]:
zip_aggregated['prev_stars'] = avg_review_prev1
zip_aggregated['prev_stars'] = zip_aggregated['prev_stars'].str.get(0)
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars,reviews_perc,prev_stars
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.001061,
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.000553,4.500000
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000,0.001275,4.500000
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667,0.001285,4.250000
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000,0.001196,4.466667
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000,0.001188,3.700000
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765,0.001180,3.920000
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295,0.001568,4.411765
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111,0.001638,4.262295
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000,0.001601,4.361111


In [261]:
#save to pickle
import pickle
output_file = open("Feature_Matrix_onlyYelp.dat","wb")
pickle.dump(zip_aggregated, output_file)
output_file.close()

In [264]:
#Integrate Zillow
zip_price = pd.read_csv("../Data/Zip_MedianValuePerSqft_AllHomes.csv")

#Select only Pittsburgh
pittsburgh_data_price = zip_price[(zip_price.RegionName>=15000)&(zip_price.RegionName<15500)]

#Select the RegionName and the columns with year
pittsburgh_data_byYear = pd.concat([pittsburgh_data_price.ix[:,1], pittsburgh_data_price[pittsburgh_data_price.columns[7:]]], axis = 1)

#Melt the dataframe to transform the columns into rows
flat_zip = pd.melt(pittsburgh_data_byYear, id_vars="RegionName", var_name="Date", value_name="MedSqft")
flat_zip

Unnamed: 0,RegionName,Date,MedSqft
0,15301,1996-04,54.0
1,15237,1996-04,69.0
2,15108,1996-04,65.0
3,15068,1996-04,56.0
4,15235,1996-04,47.0
5,15317,1996-04,68.0
6,15221,1996-04,37.0
7,15206,1996-04,33.0
8,15401,1996-04,40.0
9,15001,1996-04,59.0


In [265]:
#Extract only the years
years = [flat_zip.Date.ix[i].split('-')[0] for i in range(len(flat_zip))]

#Add to df
flat_zip['Years'] = years

#Select only these columns
flat_zip = flat_zip[['RegionName', 'Years', 'MedSqft']]

#Group by and aggregate by median
med_zip = flat_zip.groupby(['RegionName', 'Years']).median()

#Flatten the table
med_zip = med_zip.reset_index()
med_zip

Unnamed: 0,RegionName,Years,MedSqft
0,15001,1996,61.0
1,15001,1997,62.0
2,15001,1998,65.0
3,15001,1999,67.0
4,15001,2000,69.0
5,15001,2001,73.0
6,15001,2002,75.5
7,15001,2003,76.5
8,15001,2004,81.0
9,15001,2005,84.5


In [287]:
Med_price_sqft = [med_zip[(med_zip['RegionName']==int(flatten_zip.ix[i]['postal_code']))&(med_zip['Years']==str(flatten_zip.ix[i]['Years']))]['MedSqft'].values for i in range(len(flatten_zip))]

In [288]:
Med_price_sqft

[array([ 57.]),
 array([ 55.5]),
 array([ 57.5]),
 array([ 54.]),
 array([ 55.5]),
 array([ 57.]),
 array([ 57.]),
 array([ 58.]),
 array([ 65.]),
 array([ 66.]),
 array([ 104.5]),
 array([ 111.]),
 array([ 116.]),
 array([ 75.5]),
 array([ 117.]),
 array([ 115.5]),
 array([ 119.]),
 array([ 126.5]),
 array([ 133.]),
 array([ 143.5]),
 array([ 96.]),
 array([ 94.]),
 array([ 93.]),
 array([ 97.]),
 array([ 98.]),
 array([ 98.]),
 array([ 101.]),
 array([ 104.]),
 array([ 109.5]),
 array([ 115.5]),
 array([ 119.]),
 array([], dtype=float64),
 array([ 91.]),
 array([ 90.5]),
 array([ 91.]),
 array([ 92.]),
 array([ 96.5]),
 array([ 99.]),
 array([ 100.]),
 array([ 102.5]),
 array([ 54.5]),
 array([ 53.]),
 array([ 55.]),
 array([ 54.]),
 array([ 54.]),
 array([ 54.]),
 array([ 57.]),
 array([ 63.]),
 array([], dtype=float64),
 array([], dtype=float64),
 array([], dtype=float64),
 array([], dtype=float64),
 array([], dtype=float64),
 array([], dtype=float64),
 array([], dtype=float64),
 a

In [289]:
zip_aggregated['MedPrice_sqft'] = Med_price_sqft
zip_aggregated['MedPrice_sqft'] = zip_aggregated['MedPrice_sqft'].str.get(0)
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars,reviews_perc,prev_stars,MedPrice_sqft
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.001061,,57.0
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.000553,4.500000,55.5
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000,0.001275,4.500000,57.5
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667,0.001285,4.250000,54.0
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000,0.001196,4.466667,55.5
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000,0.001188,3.700000,57.0
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765,0.001180,3.920000,57.0
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295,0.001568,4.411765,58.0
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111,0.001638,4.262295,65.0
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000,0.001601,4.361111,66.0


In [291]:
zip_aggregated.columns = [u'useful', u'funny', u'cool', u'Price', u'stars_1', u'stars_2',
       u'stars_3', u'stars_4', u'stars_5', u'stars', u'reviews_perc',
       u'prev_stars', u'MedPrice_sqft(t)']

In [292]:
zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars,reviews_perc,prev_stars,MedPrice_sqft(t)
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.001061,,57.0
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.000553,4.500000,55.5
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000,0.001275,4.500000,57.5
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667,0.001285,4.250000,54.0
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000,0.001196,4.466667,55.5
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000,0.001188,3.700000,57.0
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765,0.001180,3.920000,57.0
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295,0.001568,4.411765,58.0
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111,0.001638,4.262295,65.0
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000,0.001601,4.361111,66.0


In [293]:
#For t=t-1
t=-1
Med_price_sqft_tminus1 = [med_zip[(med_zip['RegionName']==int(flatten_zip.ix[i]['postal_code']))&(med_zip['Years']==str(flatten_zip.ix[i]['Years']+t))]['MedSqft'].values for i in range(len(flatten_zip))]

In [294]:
#For t=t-2
t=-2
Med_price_sqft_tminus2 = [med_zip[(med_zip['RegionName']==int(flatten_zip.ix[i]['postal_code']))&(med_zip['Years']==str(flatten_zip.ix[i]['Years']+t))]['MedSqft'].values for i in range(len(flatten_zip))]

In [295]:
#For t=t+1
t=1
Med_price_sqft_tplus1 = [med_zip[(med_zip['RegionName']==int(flatten_zip.ix[i]['postal_code']))&(med_zip['Years']==str(flatten_zip.ix[i]['Years']+t))]['MedSqft'].values for i in range(len(flatten_zip))]

In [300]:
zip_aggregated['MedPrice_sqft_tminus1'] = Med_price_sqft_tminus1
zip_aggregated['MedPrice_sqft_tminus1'] = zip_aggregated['MedPrice_sqft_tminus1'].str.get(0)

zip_aggregated['MedPrice_sqft_tminus2'] = Med_price_sqft_tminus2
zip_aggregated['MedPrice_sqft_tminus2'] = zip_aggregated['MedPrice_sqft_tminus2'].str.get(0)

zip_aggregated['MedPrice_sqft_tplus1'] = Med_price_sqft_tplus1
zip_aggregated['MedPrice_sqft_tplus1'] = zip_aggregated['MedPrice_sqft_tplus1'].str.get(0)

zip_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,Price,stars_1,stars_2,stars_3,stars_4,stars_5,stars,reviews_perc,prev_stars,MedPrice_sqft(t),MedPrice_sqft_tminus1,MedPrice_sqft_tminus2,MedPrice_sqft_tplus1
postal_code,Years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15003,2008,2.000000,0.500000,2.000000,3.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.001061,,57.0,57.0,57.0,55.5
15003,2009,0.500000,0.000000,0.500000,2.000000,0.000000,0.000000,0.000000,0.500000,0.500000,4.500000,0.000553,4.500000,55.5,57.0,57.0,57.5
15003,2010,1.625000,0.375000,0.375000,2.400000,0.000000,0.000000,0.125000,0.500000,0.375000,4.250000,0.001275,4.500000,57.5,55.5,57.0,54.0
15003,2011,1.200000,0.200000,0.466667,2.400000,0.066667,0.000000,0.066667,0.133333,0.733333,4.466667,0.001285,4.250000,54.0,57.5,55.5,55.5
15003,2012,1.800000,0.450000,0.800000,2.111111,0.150000,0.100000,0.050000,0.300000,0.400000,3.700000,0.001196,4.466667,55.5,54.0,57.5,57.0
15003,2013,0.840000,0.560000,0.640000,1.888889,0.080000,0.120000,0.040000,0.320000,0.440000,3.920000,0.001188,3.700000,57.0,55.5,54.0,57.0
15003,2014,0.764706,0.176471,0.205882,2.071429,0.058824,0.029412,0.058824,0.147059,0.705882,4.411765,0.001180,3.920000,57.0,57.0,55.5,58.0
15003,2015,0.901639,0.327869,0.475410,2.076923,0.098361,0.049180,0.032787,0.131148,0.688525,4.262295,0.001568,4.411765,58.0,57.0,57.0,65.0
15003,2016,0.319444,0.083333,0.194444,2.117647,0.041667,0.041667,0.055556,0.236111,0.625000,4.361111,0.001638,4.262295,65.0,58.0,57.0,66.0
15003,2017,0.000000,0.000000,0.000000,2.666667,0.000000,0.250000,0.000000,0.750000,0.000000,3.500000,0.001601,4.361111,66.0,65.0,58.0,


In [301]:
output_file = open("Feature_Matrix_complete.dat","wb")
pickle.dump(zip_aggregated, output_file)
output_file.close()

In [302]:
zip_aggregated.to_csv("Feature_Matrix_Yelp_Complete.csv")