In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
trainingFeaturesFilename = 'ModelInput/features.csv'
trainingLabelsFilename = 'ModelInput/labels.csv'

trainingFeatures = pd.read_csv(trainingFeaturesFilename)
trainingLabels = pd.read_csv(trainingLabelsFilename)

# For classification this isn't necessary!
#trainingFeatures['cityHasRiver'] = trainingFeatures['cityHasRiver'].astype('category')

print("Training features: {}".format(trainingFeatures.shape))
print("Training labels: {}".format(trainingLabels.shape))

tempDF = pd.merge(trainingFeatures, trainingLabels, on='cityId', how='inner')
print("Combined: {}".format(tempDF.shape))

Training features: (263, 36)
Training labels: (263, 13)
Combined: (263, 48)


In [4]:
# Start at 2nd column, i.e. exclude country_code
X = pd.get_dummies(trainingFeatures.iloc[:,2:len(trainingFeatures)])
y = tempDF['cityScore'].values
print(X.shape)
#print(type(X))
print(y)

(263, 34)
[0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [5]:
# feature extraction
test = SelectKBest(score_func=chi2, k=6)
fit = test.fit(X, y)

In [6]:
np.set_printoptions(precision=3, suppress=True)
df = pd.DataFrame(fit.scores_)
#print(fit.scores_)
print(df.sort_values(by=0, ascending=False).T)
features = fit.transform(X)
print(features)

         17        8        11        33        7         2         6   \
0  2.444793  1.553785  1.36616  1.250376  1.152826  1.073487  0.976894   

         9         14        32    ...           0         13        28  \
0  0.614278  0.579996  0.552602    ...     0.035247  0.029845  0.027257   

         19        31        16        25       5         4         18  
0  0.022131  0.019477  0.017268  0.011005  0.00355  0.000472  0.000236  

[1 rows x 34 columns]
[[0.316 0.053 0.    0.    0.    1.   ]
 [0.105 0.    0.158 0.053 0.    1.   ]
 [0.316 0.263 0.    0.    0.    1.   ]
 ...
 [0.053 0.105 0.    0.368 0.    1.   ]
 [0.105 0.21  0.    0.053 0.    0.   ]
 [0.    0.526 0.    0.    0.105 1.   ]]


In [7]:
pos = [17, 8, 11, 33, 7, 2, 6]
colname = X.columns[pos]
print(colname)
tempDf = X[colname]
print(tempDf.head)

Index(['GrasslandHillsWoods', 'CoastLake', 'Desert', 'cityHasRiver',
       'Grassland', 'Plains', 'PlainsHillsRainforest'],
      dtype='object')
<bound method NDFrame.head of      GrasslandHillsWoods  CoastLake  Desert  cityHasRiver  Grassland  Plains  \
0                 0.0000     0.0000  0.0000             1     0.0526  0.3158   
1                 0.0000     0.1579  0.0526             1     0.0000  0.1053   
2                 0.0000     0.0000  0.0000             1     0.2632  0.3158   
3                 0.0000     0.3158  0.0000             0     0.0000  0.2105   
4                 0.0526     0.0000  0.0000             1     0.3684  0.0000   
5                 0.0526     0.0000  0.0000             1     0.6842  0.0000   
6                 0.0000     0.1053  0.0000             1     0.0000  0.2632   
7                 0.0000     0.2105  0.0000             1     0.2632  0.1053   
8                 0.0000     0.0000  0.0000             1     0.0526  0.2105   
9                 0.000