In [2]:
import sys
import os
import pandas as pd

from sklearn.feature_extraction import DictVectorizer

import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

In [36]:
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.utils import shuffle

In [11]:
# List the contents of the local data directory to locate the input CSV files.
%ls data

180_days_maxT_24hrPrecip.csv
180_days_mean1hrT_1hrPrecip.csv
180_days_meanT_24hrPrecip.csv
2019-09-27-basel-collections.csv
2019-09-27-basel-image-metadata.csv
2019-09-27-basel-measures-FEAT-TempPrecip-IntLabels.csv
2019-09-27-basel-measures-FEAT-TempPrecip.csv
2019-09-27-basel-measures-FEAT.csv
2019-09-27-basel-measures-cleaned.csv
2019-09-27-basel-measures-prediction-cleaned-FEAT.csv
2019-09-27-basel-measures-prediction-cleaned-TempPrecip.csv
2019-09-27-basel-measures-prediction-cleaned.csv
2019-09-27-basel-measures-prediction.csv
2019-09-27-basel-measures.csv
all_tweets.csv
all_tweets_relcols.csv
[34mdistributions[m[m/
event_cal.csv
places.txt
twitter.csv
twitter_accounts.txt
[34mtwitter_data[m[m/
twitter_day_features-TEST.csv
twitter_day_features.csv


In [12]:
# Load the measurements table and discard the stale index column that was
# written into the CSV under the header 'Unnamed: 0'.
# NOTE(review): the preview of this frame still shows an 'Unnamed: 0.1' column,
# which looks like a second leftover index -- confirm and drop it if unused.
in_df = pd.read_csv("./data/2019-09-27-basel-measures-FEAT-TempPrecip-IntLabels.csv")
in_df = in_df.drop(columns='Unnamed: 0')

# Combined identifier "<osm_id>-<cci_id>", built column-wise instead of with a
# per-row Python lambda.
in_df['comb_id'] = in_df['osm_id'].astype(str) + '-' + in_df['cci_id'].astype(str)

In [13]:
# Sanity check of the loaded frame: dimensions first, then the first rows.
print(in_df.shape)
in_df.head()

(58242, 20)


Unnamed: 0,Unnamed: 0.1,osm_id,cci_id,date,place_name,place_type,cci,cci_p,hour,weekday,month,month_german,weekday_german,daytime,t_mean_2m_24h:C,precip_24h:mm,place_type_enc,daytime_enc,place_name_enc,comb_id
0,0,1175332462,287,2019-04-01 8:40:49,Leimgrubenweg,bus_stop,3.2,3.2,8,0,4,April,Montag,morning,11.4,0.0,0,2,163,1175332462-287
1,1,25149740,86_9668,2019-04-01 8:40:53,Leimgrubenweg,secondary,3.35764,3.35764,8,0,4,April,Montag,morning,11.4,0.0,11,2,163,25149740-86_9668
2,2,117485263,86_11641,2019-04-01 8:40:53,Reinacherstrasse,primary,3.35764,3.35764,8,0,4,April,Montag,morning,11.4,0.0,8,2,210,117485263-86_11641
3,3,2621172927,287,2019-04-01 8:40:57,Leimgrubenweg,bus_stop,3.19757,3.19757,8,0,4,April,Montag,morning,11.4,0.0,0,2,163,2621172927-287
4,4,148833576,86_12436,2019-04-01 8:42:11,Dornacherstrasse,secondary,5.0,5.0,8,0,4,April,Montag,morning,11.4,0.0,11,2,65,148833576-86_12436


In [20]:
def row_feat_dict(row):
    '''
    Build the feature dictionary for a single row.

    row: a mapping-like record (e.g. one DataFrame row) that provides the
         columns 'place_type', 'weekday', 'month', 'daytime', 'place_name',
         't_mean_2m_24h:C' and 'precip_24h:mm'.

    Returns a dict with exactly those seven keys. All values are copied
    unchanged except 'place_name', which is lower-cased.
    '''
    copied_first = ('place_type', 'weekday', 'month', 'daytime')
    weather_cols = ('t_mean_2m_24h:C', 'precip_24h:mm')

    features = {}
    for name in copied_first:
        features[name] = row[name]

    # the place name is normalised to lower case
    features['place_name'] = row.place_name.lower()

    for name in weather_cols:
        features[name] = row[name]

    return features

## twitter features

In [22]:
# One feature dictionary per row of the input frame.
feats_dict_list = in_df.apply(row_feat_dict, axis=1)

In [23]:
# Fit the vectoriser on the per-row feature dictionaries (fit returns the
# vectoriser itself), then display its configuration.
dict_vec = DictVectorizer().fit(feats_dict_list)
dict_vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=True)

In [71]:
# Names of the vectorised feature columns, in column order.
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both. The result is always a plain list.
if hasattr(dict_vec, 'get_feature_names_out'):
    feature_names = list(dict_vec.get_feature_names_out())
else:
    feature_names = list(dict_vec.get_feature_names())

In [25]:
# transform to (dense) feature matrix
#
# .toarray() yields a regular numpy.ndarray. The previous .todense() call
# produced the legacy numpy.matrix type, which recent scikit-learn releases
# refuse as estimator input.
X_sparse = dict_vec.transform(feats_dict_list)
X = X_sparse.toarray()
print(X.shape)


(58242, 347)


In [29]:
# Regression target: the 'cci' column as a NumPy array.
y = in_df['cci'].values
y

array([3.2       , 3.35764015, 3.35764015, ..., 3.86769316, 4.39927712,
       4.39927712])

In [66]:
random_state = 98
#random_state = 72

# Hyperparameters for the LightGBM regressor (marked as tuned in the original notebook).
lgbm_params = dict(
    random_state=random_state,
    boosting_type='gbdt',
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,
    reg_lambda=0,
)

# Estimators to compare, keyed by the name printed during evaluation.
#
# Disabled earlier experiments (classifiers, kept for reference only):
#   'DecisionTree': DecisionTreeClassifier()
#   'GBC': GradientBoostingClassifier(random_state=random_state)
#   'MLP': MLPClassifier(alpha=0.1, activation='tanh', solver='lbfgs',
#                        random_state=random_state, hidden_layer_sizes=(30,30),
#                        max_iter=500, learning_rate_init=0.001)  # with tuned hyperparams
pipelines = {
    'LinearRegression': LinearRegression(),
    'LGBMRegressor': lgb.LGBMRegressor(**lgbm_params),
}

In [73]:
def run_pipelines(X, y, pipelines, num_folds=10, random_state=7):
    '''
    Cross-validate every estimator in `pipelines` and print the raw results.

    X, y:         feature matrix and target values
    pipelines:    dict mapping a display name to an estimator
    num_folds:    value passed as `cv` to cross_validate
    random_state: seed for the single up-front shuffle of X and y

    For each estimator the name and the cross_validate result dict (scored
    with 'neg_mean_absolute_error') are printed. Afterwards the estimator is
    fitted on the full shuffled data, so the objects stored in `pipelines`
    are left fitted. Nothing is returned.
    '''
    X_shuffled, y_shuffled = shuffle(X, y, random_state=random_state)
    scoring = ['neg_mean_absolute_error']

    for reg_name, estimator in pipelines.items():
        print(reg_name)

        cv_result = cross_validate(estimator, X_shuffled, y_shuffled,
                                   scoring=scoring, cv=num_folds)
        print(cv_result)

        # final fit on all data; this mutates the estimator held in `pipelines`
        estimator.fit(X_shuffled, y_shuffled)
        

In [74]:
# Evaluate all configured regressors with the default 10 folds and shuffle seed 7.
run_pipelines(X, y, pipelines)

LinearRegression
{'fit_time': array([4.06343603, 5.26135135, 7.57071304, 8.52098584, 3.60590315,
       4.37710595, 4.17494512, 4.07589293, 4.60212183, 3.67186594]), 'score_time': array([0.00517011, 0.00404787, 0.00488806, 0.00623012, 0.00477886,
       0.00525188, 0.00538397, 0.00440812, 0.00399327, 0.00580025]), 'test_neg_mean_absolute_error': array([-5.23620229e-01, -5.32199130e-01, -5.36550829e-01, -5.29596362e-01,
       -5.31384032e-01, -5.33055943e-01, -5.85274249e+05, -8.74673936e+05,
       -5.27165433e-01, -5.64325166e+05])}
LGBMRegressor
{'fit_time': array([2.53389025, 2.23073578, 2.35285783, 2.08792877, 2.16487622,
       1.95210004, 1.89813495, 2.46655202, 1.98220277, 1.84657407]), 'score_time': array([0.02789688, 0.0277431 , 0.06601906, 0.01983404, 0.03264999,
       0.03113174, 0.03314114, 0.02886295, 0.04457712, 0.02765179]), 'test_neg_mean_absolute_error': array([-0.52307177, -0.53335073, -0.53766329, -0.53398625, -0.53407552,
       -0.53288344, -0.52908465, -0.527346

In [39]:
# List every scorer name accepted by the `scoring=` argument.
# metrics.SCORERS is gone from newer scikit-learn releases, where
# get_scorer_names() takes its place; use whichever is available.
if hasattr(metrics, 'get_scorer_names'):
    scorer_names = metrics.get_scorer_names()
else:
    scorer_names = metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']