In [42]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor

In [43]:
pd.set_option('display.max_rows', 500)

In [44]:
df = pd.read_csv('./../data/ames_housing_price_data_v5.csv')

In [45]:
to_dummify = [
    'Street_paved',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    #'OverallQual',
    #'OverallCond',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    #'ExterQual',
    #'ExterCond',
    'Foundation',
    'CentralAir',
    #'KitchenQual',
    #'FireplaceQu',
    'PavedDrive',
    #'PoolQC',
    'Fence',
    'MiscFeature',
    #'MoSold',
    #'HeatingQC_ord',
    'LotShape_com',
    'MSZoning_com',
    'Heating_com',
    'Electrical_com',
    'LotConfig_com',
    #'number_floors',
    'attic',
    'PUD',
    #'Functional_ord',
    'Remod_age_bin',
    'SaleType',
    #'SaleCondition'
]

In [46]:
#drop outlier rows
df = df[(df['PID'] != 902207130) & (df['PID'] != 908154205)]
df = df[(df['SaleCondition'] == 'Normal') | (df['SaleCondition'] == 'Partial')]
df = df[df['BedroomAbvGr'] != 0]
df = df[df['MSZoning_com'] != 'Nonresidential']
df.reset_index(drop=True, inplace = True)

In [47]:
#separate price from table
price=df['SalePrice']
df=df.drop(['SalePrice_log'],axis=1) #'SalePrice',

In [48]:
#add radial average price feature
avg_price_df=pd.read_csv('../data/house_surrounding_avg_prices.csv')
avg_price_df2=avg_price_df[['PID','AvgPrice-0.5']].drop_duplicates() #,'AvgPrice-0.25'

df2=df.merge(avg_price_df2,how='left')


In [49]:
#add geographical features
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
radial.drop(columns = ('2204_park'), inplace = True)
for col in radial.columns:
    prefix = str(col)[0:4]
    if re.search('^\d\d\d\d_', str(col)):
        radial.rename(columns = {col: col[5:]}, inplace = True)
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]
radial.drop(columns = rad_drops, inplace = True)

df2=df2.merge(radial, how='left')




In [50]:
#other rows to drop:
droplist=['PID','GarageFinish','SaleCondition','GarageType_com','Garage_age_bin','sold_datetime']
df2=df2.drop(droplist,axis=1)

In [51]:
#fillnas
df2=df2.fillna(0)

<H1>ML function section

In [52]:
def corr_list(name, df_corr):
    '''
    Takes columname from dataframe and dataframe as inputs
    '''
    tester_corr=df_corr[features_to_use]
    to_dummify2=filtering(to_dummify)
    tester_corr = pd.get_dummies(tester_corr, columns = to_dummify2, drop_first = True)

    x=tester_corr[f'{name}']
    corr_lst=[]
    for y in tester_corr.columns:
        cor=x.corr(tester_corr[y])
        corr_lst.append(cor)
    df_corr_res=pd.DataFrame({'Col':list(tester_corr.columns),'Corr':corr_lst})
    return df_corr_res.sort_values('Corr',ascending=False)
        

In [53]:
def cat_b(x_train,x_test,y_train,y_test, i):
    clf = CatBoostRegressor(logging_level='Silent', random_state=0)
    #clf=XGBRegressor(random_state=0, scoring='neg_mean_squared_error')
    
    params = ({'n_estimators':[4000],
              'learning_rate':[0.04], 
               'subsample':[0.9], 
               'max_depth':[2,3],
              })

    grid_search_cat = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
    grid_search_cat.fit(x_train, y_train)
    
    if i==0:
        feature_imp=pd.DataFrame({'Col':list(x_train.columns),'Importance':grid_search_cat.best_estimator_.feature_importances_})
        print(feature_imp.sort_values('Importance',ascending=False).iloc[:100,:])
        print('-'*50)
        print(grid_search_cat.best_params_)
        print('-'*50)
    return grid_search_cat.score(x_test, y_test)
    

In [54]:
def vif_calc(df_VIF):
    viflist=[variance_inflation_factor(df_VIF.values, i) for i in range(len(df_VIF.columns))]
    list_of_tuples = list(zip(list(df_VIF.columns), viflist))
    df=pd.DataFrame(list_of_tuples,columns=['Feature','VIF'])
    df=df.sort_values('VIF',ascending=False)
    return df

In [55]:
def cross_val_VIF_score(df_val):
    score_list=[]
    for i in range(0,5):
        x_train2, x_test2, y_train2, y_test2 = train_test_split(df_val, price, test_size=0.3, random_state=i)

        score_list.append(cat_b(x_train2, x_test2, y_train2, y_test2, i))

    print(sum(score_list)/len(score_list))
    print('-'*50)    
    vif_df=vif_calc(df_val)        
    print(vif_df[:20])


In [56]:
def fun(variable):
    if (variable in features_to_use):
        return True
    else:
        return False

def filtering(listing):  
# using filter function
    filtered = filter(fun, listing)
    return list(filtered)

def fun2(variable):
    if (variable in features_to_use):
        return False
    else:
        return True

def filtering2(listing):  
# using filter function
    filtered = filter(fun2, listing)
    return list(filtered)

<H1> Feature selection

In [58]:
Not
'PUD', 'BldgType', 'RoofStyle', 'RoofMatl'
'Utilities', 'Heating_com'

                 'ext_Wood', 'ext_Metal_Siding', 'ext_Vinyl_Siding', 'ext_Stucco',
                'ext_Cement_Board', 'ext_Brick', 'ext_Asbestos_Shingles', 'ext_Other',
'TotRmsAbvGrd','FireplaceQu'
?
'LandSlope'

maybe: 
    'Electrical_com',
'Foundation'
,'MasVnrArea'
,'AvgPrice-0.5',
'MSZoning_com',

'1stFloorArea%','2ndFloorArea%',

further review:
    'number_floors'
    
    ,'7203_residential'
,'2301_restaurant'
'2101_pharmacy',
'7228_farmyard',

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [59]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

def fit_scale(col):
    scaler.fit(df2[[col]])
    df2[[col]]=scaler.transform(df2[[col]])

fit_scale('OverallQual')
fit_scale('ExterQual')
fit_scale('OverallCond')
fit_scale('KitchenQual')

#df2['Porch']=((df2['OpenPorchSF']>0) | (df2['EnclosedPorch']>0) | (df2['3SsnPorch']>0) | (df2['ScreenPorch']>0))
df2['PorchSF']=df2['OpenPorchSF']+df2['EnclosedPorch']+df2['3SsnPorch']+df2['ScreenPorch']
#df2['1stFloorArea%']=df2['1stFlrSF']/df2['GrLivArea']
#df2['2ndFloorArea%']=df2['2ndFlrSF']/df2['GrLivArea']
df2['ExterQualDisc']=df2['OverallQual']-df2['ExterQual']
df2['OverallCondDisc']=df2['OverallQual']-df2['OverallCond']
df2['KitchenQualDisc']=df2['OverallQual']-df2['KitchenQual']

df2['SaleTypeNew']=(df2['SaleType']=='New')
df2['SaleTypeNew']=df2['SaleTypeNew'].apply(lambda x: 1 if x==True else 0)
#df2['BSMT_GLQ%']=df2['BSMT_GLQ']/df2['TotalBsmtSF']
#df2['BSMT_ALQ%']=df2['BSMT_ALQ']/df2['TotalBsmtSF']
#df2['BSMT_GLQ%']=df2['BSMT_GLQ%'].fillna(0)
#df2['BSMT_ALQ%']=df2['BSMT_ALQ%'].fillna(0)

df2['BSMT_LowQual']=df2['TotalBsmtSF']-df2['BSMT_GLQ']-df2['BSMT_ALQ']
df2['BSMT_HighQual']=df2['BSMT_GLQ']+df2['BSMT_ALQ']

In [60]:
radial.columns

Index(['PID', 'police', 'fire_station', 'post_box', 'post_office', 'library',
       'nursing_home', 'graveyard', 'school', 'pharmacy', 'hospital',
       'doctors', 'dentist', 'veterinary', 'theatre', 'cinema', 'playground',
       'dog_park', 'sports_centre', 'pitch', 'swimming_pool', 'stadium',
       'restaurant', 'fast_food', 'pub', 'bar', 'hotel', 'guesthouse',
       'shelter', 'supermarket', 'bakery', 'mall', 'florist', 'shoe_shop',
       'optician', 'stationery', 'outdoor_shop', 'mobile_phone_shop',
       'car_dealership', 'doityourself', 'furniture_shop', 'garden_centre',
       'car_wash', 'laundry', 'bank', 'atm', 'tourist_info', 'attraction',
       'museum', 'memorial', 'viewpoint', 'fountain', 'water_tower',
       'water_works', 'christian_catholic', 'christian_evangelical',
       'christian_lutheran', 'christian_methodist', 'jewish', 'muslim', 'stop',
       'motorway_junction', 'fuel', 'parking_multistorey', 'slipway', 'pier',
       'dam', 'airport', 'apron', 'for

In [78]:
features_to_use=['SalePrice','GrLivArea', 'LotArea', 'OverallQual',
    'BSMT_LowQual', 'house_age_years', 'GarageCars','MasVnrType',
                 'FullBath','HalfBath',
                'BsmtExposure_ord','SaleTypeNew',
                 'Neighborhood',
                 'BldgType','PorchSF',
                 'ExterQualDisc','OverallCondDisc','BSMT_HighQual',
                 'KitchenQualDisc',
                'Fireplaces','Pool','BedroomAbvGr',
                  'water_tower', 'graveyard', 'police', 
                 'optician', 'slipway',  'bar', 'farmyard', 'cinema', 'supermarket' ,'hotel'
                ] #'water','bank','forest',  'fast_food','car_wash',

tester=df2[features_to_use]
to_dummify2=filtering(to_dummify)
tester2 = pd.get_dummies(tester, columns = to_dummify2, drop_first = True)


In [80]:
train_x,test_x=train_test_split(tester2, test_size=0.3, random_state=0)

In [83]:
train_x.to_csv('train_x.csv')
test_x.to_csv('test_x.csv')

In [103]:
import gbdtpl

In [100]:
params={
    'num_trees':500,
    'objective':'l2'
}

In [101]:
import os
os.getcwd()

'/Users/moritz/Desktop/private_repository/operation_goldfish/Moritz'

In [104]:
gbdtpl.DataMat(file_path='/Users/moritz/Desktop/private_repository/operation_goldfish/Moritz/train_x.csv',params=params, label_index=0, query_index=None, name='training')




OSError: dlopen(/Users/moritz/.local/lib/python3.8/site-packages/gbdtpl/liblineargbm.py, 6): no suitable image found.  Did find:
	/Users/moritz/.local/lib/python3.8/site-packages/gbdtpl/liblineargbm.py: unknown file type, first eight bytes: 0x7F 0x45 0x4C 0x46 0x02 0x01 0x01 0x03
	/Users/moritz/.local/lib/python3.8/site-packages/gbdtpl/liblineargbm.py: unknown file type, first eight bytes: 0x7F 0x45 0x4C 0x46 0x02 0x01 0x01 0x03

In [88]:
gbdtpl.DataMat(name='first', params=params, label_index=0, group_id_index=-1, path_to_csv_file='...csv', train_dataset=None)



TypeError: __init__() got an unexpected keyword argument 'group_id_index'

In [None]:
cross_val_VIF_score(tester2)

In [1]:
corr_list('GrLivArea', df2[features_to_use])

NameError: name 'corr_list' is not defined

In [None]:
tester=df2
tester = pd.get_dummies(tester, columns = to_dummify, drop_first = True)


In [None]:
cross_val_VIF_score(tester)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
pca=PCA()

In [None]:
len(pca.explained_variance_ratio_)

In [None]:
pca.set_params(n_components=62)

In [None]:
pc_tester = pca.fit_transform(tester2)

In [None]:
pc_tester=pd.DataFrame(pc_tester)

In [None]:
cross_val_VIF_score(pc_tester)


