In [114]:
import numpy as np
import pandas as pd
import time
import pickle

In [115]:
#Load yoga studio data
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('/home/henry/Insight/Yogee/Datasets/Yelp_NY_Yoga_Studios_dataset/YogaDf.pckl', 'rb') as f:
    YogaDf = pickle.load(f)

In [116]:
#Initialize name, zip codes and start year columns to model dataframe
# One row per row of YogaDf, every cell starting out as NaN.
StartYearColumns = ['name','zip','startyear']
NanDfValues = np.full([YogaDf.shape[0], len(StartYearColumns)], np.nan)
StartYearDf = pd.DataFrame(NanDfValues, columns=StartYearColumns)

In [117]:
#Add name, zip codes and start year data to startyear dataframe

# The 'name' column was created as float NaN. Make it an object column up
# front so that storing strings does not depend on pandas silently upcasting
# the dtype on assignment (deprecated behaviour in recent pandas versions).
StartYearDf['name'] = StartYearDf['name'].astype(object)

# NOTE(review): .loc[i, ...] with i in 0..n-1 assumes YogaDf has a default
# 0..n-1 index -- confirm against how YogaDf was built.
for i in range(0, np.shape(YogaDf)[0]):
    StartYearDf.loc[i,'name'] = YogaDf.loc[i,'name']

    # 'location' holds a mapping with a 'zip_code' entry; empty/missing
    # values are skipped, leaving the zip as NaN.
    zipcode = YogaDf.loc[i,'location']['zip_code']
    if zipcode:
        StartYearDf.loc[i,'zip'] = np.int64(zipcode)

    # Only string filing dates are parsed; anything else leaves startyear NaN.
    dos = YogaDf.loc[i,'initial_dos_filing_date']
    if isinstance(dos,str):
        # The year is the first four characters, so there is no need to
        # strip the trailing time-of-day part before slicing.
        StartYearDf.loc[i,'startyear'] = np.int64(dos[0:4])


In [94]:
#Load demographics dataframe
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('/home/henry/Insight/Yogee/Datasets/Demographics_dataset/DemographicsDf.pckl', 'rb') as f:
    DemographicsDf = pickle.load(f)

In [95]:
#Add empty (all-NaN) 2017 rows to demographics, one per zip code

#Load population dataset
Population2011FileLoc = "/home/henry/Insight/Yogee/Datasets/NY_Population_dataset/ACS_11_5YR_B01003/ACS_11_5YR_B01003_with_ann.csv"
Population2011Df = pd.read_csv(Population2011FileLoc)

#Get zipcodes to use
zipcodes = Population2011Df.loc[:,'GEO.id2']

#Initialize nan rows for the demographics dataframe
NanDfValues = np.zeros([np.shape(zipcodes)[0],np.shape(DemographicsDf)[1]])
NanDfValues[:] = np.nan
NanDf = pd.DataFrame(NanDfValues,columns=list(DemographicsDf))

#Remember where the appended rows start. Using the actual row count (rather
#than assuming "number of zip codes * 6 years") keeps the writes below on the
#new rows even if the existing frame does not hold exactly six years per zip.
FirstNewRow = np.shape(DemographicsDf)[0]

#Add to demographics dataframe
DemographicsDf =  pd.concat([DemographicsDf, NanDf], axis=0)

#Fill in zip code and year on the appended rows; everything else stays NaN.
#NOTE(review): positions 0 and 1 are presumably the 'zip' and 'year' columns
#-- confirm against the column order of DemographicsDf.
for i in range(0,np.shape(zipcodes)[0]):
    index = FirstNewRow + i
    DemographicsDf.iloc[index,0] = zipcodes.iloc[i]
    DemographicsDf.iloc[index,1] = 2017
DemographicsDf = DemographicsDf.reset_index(drop=True)


In [96]:
#Create model dataframe

#Studio-count columns, all starting at zero, one row per demographics row
StudioColumns = ['NewStudio','TotalStudio','NewStudioNextYear','NewStudioNextYearBin']
ZerosDfValues = np.zeros([DemographicsDf.shape[0], len(StudioColumns)])
ZerosDf = pd.DataFrame(ZerosDfValues, columns=StudioColumns)

#Place the studio columns to the right of the demographics columns
ModelDf = pd.concat([DemographicsDf, ZerosDf], axis=1)

In [97]:
#Add new studio number
# For every studio with a known zip code and a start year in 2011-2017:
#   * add 1 to 'NewStudio' on the (zip, start year) row
#   * add 1 to 'NewStudioNextYear' on the (zip, start year - 1) row, which is
#     the prediction target ("a studio opens here next year").
# Columns are addressed by name and rows by index label via .loc, instead of
# hard-coded column positions combined with an index label passed to .iloc.
for Zipcode, StartYear in zip(StartYearDf['zip'], StartYearDf['startyear']):
    # Cheap scalar checks first; the zip-code membership test scans a Series.
    if np.isnan(StartYear) or not (2011 <= StartYear <= 2017):
        continue
    if not (zipcodes == Zipcode).any():
        continue

    #Add NewStudio value
    ModelRow = ModelDf[(ModelDf['year'] == StartYear) & (ModelDf['zip'] == Zipcode)]
    ModelRowIndex = ModelRow.index.values[0]
    ModelDf.loc[ModelRowIndex,'NewStudio'] += 1

    #Add NewStudioNextYear value, this is the target
    # (2011 is skipped here: the range check above implies no 2010 row is expected)
    if StartYear >= 2012:
        PreviousYear = StartYear-1
        ModelRow = ModelDf[(ModelDf['year'] == PreviousYear) & (ModelDf['zip'] == Zipcode)]
        ModelRowIndex = ModelRow.index.values[0]
        ModelDf.loc[ModelRowIndex,'NewStudioNextYear'] += 1

In [98]:
#reindex model dataframe
# Order rows by zip code, then by year, and renumber them from 0.
ModelDf = (
    ModelDf
    .sort_values(by=['zip','year'], ascending=True)
    .reset_index(drop=True)
)

In [99]:
#Add TotalStudio values
# A studio that opened in StartYear counts towards 'TotalStudio' for its zip
# code in every year from StartYear through 2017.
# Rows are selected by (zip, year) mask rather than by a positional slice of
# "YearRange rows after the start row", so a zip code with a missing year row
# can no longer cause the increment to spill into the next zip code's rows.
# NOTE(review): studios that opened before 2011 are never counted here --
# confirm that this is intended for a "total" column.
for Zipcode, StartYear in zip(StartYearDf['zip'], StartYearDf['startyear']):
    # Cheap scalar checks first; the zip-code membership test scans a Series.
    if np.isnan(StartYear) or not (2011 <= StartYear <= 2017):
        continue
    if not (zipcodes == Zipcode).any():
        continue

    #Add to total studio value
    ActiveRows = (
        (ModelDf['zip'] == Zipcode)
        & (ModelDf['year'] >= StartYear)
        & (ModelDf['year'] <= 2017)
    )
    ModelDf.loc[ActiveRows,'TotalStudio'] += 1

In [100]:
#Add NewStudioNextYearBin values
# Binary target: 1 where at least one studio opens the following year, else 0.
OpensNextYear = ModelDf['NewStudioNextYear'] > 0
ModelDf = ModelDf.assign(NewStudioNextYearBin=OpensNextYear * 1)

In [101]:
#Initialize difference columns
# All-NaN columns aligned on ModelDf's own index; a value is only filled in
# where a year-over-year difference can actually be computed.
NanDf = pd.DataFrame(np.nan,
                     index=ModelDf.index,
                     columns=['populationDifference','FemaleRatioDifference','IncomeDifference','PopDensityDifference'])
ModelDf = pd.concat([ModelDf, NanDf], axis=1, sort=False)

# Map each (zip, year) pair to the label of its first row, so that the
# previous year's row is found by dictionary lookup instead of scanning the
# whole frame once per row.
RowLabelByZipYear = {}
for RowLabel, RowZip, RowYear in zip(ModelDf.index, ModelDf['zip'], ModelDf['year']):
    RowLabelByZipYear.setdefault((RowZip, RowYear), RowLabel)

# Null mask of the Income column, computed once rather than once per row.
IncomeIsNull = ModelDf['Income'].isnull()

# Columns whose difference is a plain subtraction (NaN inputs give NaN).
PlainDifferences = [('population','populationDifference'),
                    ('FemaleRatio','FemaleRatioDifference'),
                    ('PopDensity','PopDensityDifference')]

for i in ModelDf.index:
    year = ModelDf.loc[i,'year']
    zipcode = ModelDf.loc[i,'zip']
    # Differences are only computed for 2012-2016.
    if not ((year>=2012) and (year<=2016)):
        continue

    PreviousLabel = RowLabelByZipYear.get((zipcode, year-1))
    if PreviousLabel is None:
        # No row for the previous year of this zip code: leave the
        # differences as NaN instead of failing on an empty lookup.
        continue

    #population, FemaleRatio and PopDensity differences
    for ValueColumn, DifferenceColumn in PlainDifferences:
        ModelDf.loc[i,DifferenceColumn] = ModelDf.loc[i,ValueColumn] - ModelDf.loc[PreviousLabel,ValueColumn]

    #Income difference
    # Income is converted with np.int64 before subtracting, which cannot
    # handle missing values, so both years must be present.
    if (not IncomeIsNull[i]) and (not IncomeIsNull[PreviousLabel]):
        CurrentIncome = ModelDf.loc[i,'Income']
        PreviousIncome = ModelDf.loc[PreviousLabel,'Income']
        ModelDf.loc[i,'IncomeDifference'] = np.int64(CurrentIncome) - np.int64(PreviousIncome)
    

In [102]:
import pickle

#Save the model dataframe
# The context manager flushes and closes the file even if pickling raises,
# so a failure cannot leave a dangling open handle.
with open('/home/henry/Insight/Yogee/Datasets/Model_dataset/ModelDf.pckl', 'wb') as f:
    pickle.dump(ModelDf, f)