In [124]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
sns.set()

In [125]:
%%capture
from tqdm.notebook import tqdm
tqdm.pandas()

In [126]:
rio = pd.read_csv('../classifying/rio_normalized.csv')
mumbai = pd.read_csv('../classifying/mumbai_normalized.csv')
hyderabad = pd.read_csv('../classifying/hyderabad_normalized.csv')
chennai = pd.read_csv('../classifying/chennai_normalized.csv')
delhi = pd.read_csv('../classifying/delhi_normalized.csv')

In [127]:
cities = {'rio': rio, 'mumbai': mumbai, 'hyderabad': hyderabad, 'chennai': chennai, 'delhi': delhi}

### scale prices

In [128]:
ss = StandardScaler()
def scale_prices(prices):
    return ss.fit_transform(prices)

for city_name, city_df in cities.items():    
    city_df['price'] = scale_prices(city_df[['price']])  # scale price
    city_df['city'] = city_name                          # add 'city' label

### combine cities

In [129]:
combined = pd.DataFrame()
for city in cities.values():
    combined = pd.concat([combined, city], sort=False)

### bootstrap

In [130]:
def bootstrap(df):
    df.dropna(subset=['price', 'class'], inplace=True)
    return pd.concat([df, df[df['class'] == 1].sample(
                      df['class'].value_counts()[0] - df['class'].value_counts()[1], # calc n_samples
                      replace = True)]).reset_index().drop(columns='index')

combined = bootstrap(combined)

rio = bootstrap(rio)
mumbai = bootstrap(mumbai)
hyderabad = bootstrap(hyderabad)
delhi = bootstrap(delhi)
chennai = bootstrap(chennai)
cities = {'rio': rio, 'mumbai': mumbai, 'hyderabad': hyderabad, 'chennai': chennai, 'delhi': delhi}

### export

In [131]:
combined.to_csv('combined.csv', index=False)

# model

In [132]:
def ScoreXCity(city, preds=False):
    # define train XY
    train_df = combined[combined['city']!=city]
    x_train = train_df[['price']]
    y_train = train_df['class']
    
    # define test XY
    test_df = cities[city]
    x_test = test_df[['price']]
    y_test = test_df['class']
    
    # fit and score RF
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(x_train, y_train)
    if preds==True:
        return rf.predict(x_test)
    return round(rf.score(x_train, y_train),4), round(rf.score(x_test, y_test),4)

In [133]:
# run x-tests
for city in cities.keys():
    train_score, test_score = ScoreXCity(city)
    print(f'Test on {city}: {test_score}')

Test on rio: 0.5058
Test on mumbai: 0.5095
Test on hyderabad: 0.4721
Test on chennai: 0.4777
Test on delhi: 0.5115


### for tableau...

In [134]:
hyderabad['pred'] = ScoreXCity('hyderabad', preds=True)
confusion_matrix(hyderabad['pred'], hyderabad['class'])

array([[1874, 1976],
       [ 293,  191]])

In [136]:
def parse_lat(x):
    try: return '-'+'.'.join(re.findall(r'\d+', x.split(',')[0]))
    except: return np.nan
def parse_long(x):
    try: return '-'+'.'.join(re.findall(r'\d+', x.split(',')[1]))
    except: return np.nan