In [109]:
from os import path
import urllib.request
import json

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report


In [110]:
#create empty list to load dataframes
training_dataframes = []

In [111]:
#Load csv files

training_dataframes.append(pd.read_csv('raw_train/csv/train1.csv'))
training_dataframes.append(pd.read_csv('raw_train/csv/train2.csv', sep= ';'))

In [112]:
# Load Json data

root_url = "http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com"
json_files = ["first", "second", "third"]

for j_file in json_files:
  with urllib.request.urlopen(path.join(root_url,j_file)) as url:
      data = json.loads(url.read().decode())
      training_dataframes.append(pd.DataFrame(data))

In [113]:
df_training = pd.concat(training_dataframes, axis=0, ignore_index=True)

In [114]:
df_training.columns

Index(['countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel',
       'FacilityInspireID', 'facilityName', 'City', 'targetRelease',
       'pollutant', 'reportingYear', 'MONTH', 'DAY', 'CONTINENT',
       'max_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_temp',
       'avg_temp', 'min_temp', 'DAY WITH FOGS', 'REPORTER NAME', 'CITY ID', '',
       'EPRTRAnnexIMainActivityCode', 'EPRTRSectorCode'],
      dtype='object')

In [None]:
print(len(df_training.columns))
for col in df_training:
  print(col, len(df_training[col].unique()))
  print(df_training[col].unique())

Le hecho un ojo a los datos. a ver que columnas puedo eliminar de primeras:

1.   targetRelease. Solo tiene 1 valor
2.   continent. Lo mismo
3.   REPORTER NAME. No me interesa
4.   El que no tiene nombre. No me interesa
5.   DAY. El dia particular no creo que sea relevante
6. CITY ID. Es lo mismo que city
7. FacilityInspireID . Hay demasiados
8. facilityName . Lo mismo
9. EPRTRAnnexIMainActivityCode . Es repetido
10. EPRTRSectorCode . Al final eso viene de lo anterior
11. City. 5000 ciudades, demasiada liada tb para hacer ahora



In [163]:
def curate_dataframe(df):
  '''Function to prepare dataset'''


  # Delete the columns I dont want
  rem_att = ['targetRelease', 'CONTINENT', 'REPORTER NAME', '', 'DAY', 'FacilityInspireID', 'facilityName', 'EPRTRAnnexIMainActivityCode', 'EPRTRSectorCode', 'City']
  for att in rem_att:
    try:
      df.pop(att)
    except:
      print('Atrribute not present')

  df= df.rename(columns = {'CITY ID':'CITY_ID'})

  # Format several fields
  df['reportingYear'] = df['reportingYear'].astype(int).astype("category")
  df['MONTH'] = df['MONTH'].astype(int).astype("category")
  df['DAY WITH FOGS'] = df['DAY WITH FOGS'].astype(str).astype(int)

  df['max_wind_speed'] = df['max_wind_speed'].astype(float)
  df['avg_wind_speed'] = df['avg_wind_speed'].astype(float)
  df['min_wind_speed'] = df['min_wind_speed'].astype(float)

  df['max_temp'] = df['max_temp'].astype(float)
  df['avg_temp'] = df['avg_temp'].astype(float)
  df['min_temp'] = df['min_temp'].astype(float)


  # Encode pollutant to int
  try:
    pollutant_dict={
    "Nitrogen oxides (NOX)":0,
    "Carbon dioxide (CO2)":1,
    "Methane (CH4)":2}
    df = df.replace({"pollutant": pollutant_dict})
  except:
    print('Pollutant not present in the dataframe')

  df= df.rename(columns = {'DAY WITH FOGS':'DAY_WITH_FOGS'})

  return df

In [116]:
# Delete the columns I dont want
df_training.pop('targetRelease')
df_training.pop('CONTINENT')
df_training.pop('REPORTER NAME')
df_training.pop('')
df_training.pop('DAY')
df_training= df_training.rename(columns = {'CITY ID':'CITY_ID'})
# df_training.pop('CITY_ID')
df_training.pop('FacilityInspireID')
df_training.pop('facilityName')
df_training.pop('EPRTRAnnexIMainActivityCode')
df_training.pop('EPRTRSectorCode')
df_training.pop('City')

0                      Sehnde
1        TAVERNOLA BERGAMASCA
2          PUERTO DEL ROSARIO
3                       Kadaň
4                     Tampere
                 ...         
65623                 LARNAKA
65624                Naantali
65625                    Ptuj
65626                VIGGIANO
65627                 Runcorn
Name: City, Length: 65628, dtype: object

In [117]:
# make all the numeric number strings to numbers
# Use to string before just in case there are both types
df_training['reportingYear'] = df_training['reportingYear'].astype(int).astype("category")
df_training['MONTH'] = df_training['MONTH'].astype(int).astype("category")
df_training['DAY WITH FOGS'] = df_training['DAY WITH FOGS'].astype(str).astype(int)

df_training['max_wind_speed'] = df_training['max_wind_speed'].astype(float)
df_training['avg_wind_speed'] = df_training['avg_wind_speed'].astype(float)
df_training['min_wind_speed'] = df_training['min_wind_speed'].astype(float)

df_training['max_temp'] = df_training['max_temp'].astype(float)
df_training['avg_temp'] = df_training['avg_temp'].astype(float)
df_training['min_temp'] = df_training['min_temp'].astype(float)

In [118]:
# Encode pollutant to int
pollutant_dict={
"Nitrogen oxides (NOX)":0,
"Carbon dioxide (CO2)":1,
"Methane (CH4)":2}

df_training = df_training.replace({"pollutant": pollutant_dict})

In [119]:
df_training= df_training.rename(columns = {'DAY WITH FOGS':'DAY_WITH_FOGS'})

In [120]:
df_training.describe()

Unnamed: 0,pollutant,max_wind_speed,avg_wind_speed,min_wind_speed,max_temp,avg_temp,min_temp,DAY_WITH_FOGS
count,65628.0,65628.0,65628.0,65628.0,65628.0,65628.0,65628.0,65628.0
mean,0.858292,15.515958,18.015285,22.521038,9.455406,10.448142,13.442827,2.232568
std,0.793736,3.067272,2.310739,3.059973,5.216525,5.084529,5.216068,3.778429
min,0.0,8.011958,14.0001,15.032589,-3.141464,-0.199176,0.894827,0.0
25%,0.0,13.324166,16.012197,20.346158,5.879821,7.186013,9.894281,0.0
50%,1.0,15.50682,18.020789,22.540387,9.698967,10.701504,13.692473,1.0
75%,2.0,17.718201,20.011702,24.715251,13.282417,14.193578,17.268,2.0
max,2.0,22.991382,21.999973,29.933603,20.938266,19.999403,24.902108,19.0


In [121]:
df_training.corr()

Unnamed: 0,pollutant,max_wind_speed,avg_wind_speed,min_wind_speed,max_temp,avg_temp,min_temp,DAY_WITH_FOGS
pollutant,1.0,0.005049,0.001564,0.00027,0.002914,0.003375,0.002614,0.090077
max_wind_speed,0.005049,1.0,0.752854,0.565108,-0.005401,-0.004884,-0.004942,-0.006641
avg_wind_speed,0.001564,0.752854,1.0,0.750644,-0.006726,-0.006853,-0.00795,-0.006585
min_wind_speed,0.00027,0.565108,0.750644,1.0,-0.001764,-0.002531,-0.003061,-0.006167
max_temp,0.002914,-0.005401,-0.006726,-0.001764,1.0,0.975139,0.950746,0.003192
avg_temp,0.003375,-0.004884,-0.006853,-0.002531,0.975139,1.0,0.975169,0.003429
min_temp,0.002614,-0.004942,-0.00795,-0.003061,0.950746,0.975169,1.0,0.002612
DAY_WITH_FOGS,0.090077,-0.006641,-0.006585,-0.006167,0.003192,0.003429,0.002612,1.0


In [122]:
#Check if have unbalanced data

print(np.sum(df_training['pollutant']==0))
print(np.sum(df_training['pollutant']==1))
print(np.sum(df_training['pollutant']==2))

25982
22964
16682


The data is unbalanced to some extent

In [123]:
# Get the labels

y = df_training.pop("pollutant")

In [18]:
# Try PCA to reduce dimensionality of numerical values

numerical_features = ['max_wind_speed', 'avg_wind_speed','min_wind_speed','max_temp','avg_temp','min_temp','DAY_WITH_FOGS']
X_train_num = df_training[numerical_features]

# pca = PCA(n_components=5)
# pca.fit(X_train_num)
# X_train_num_transform = pca.transform(X_train_num)
# print(pca.explained_variance_ratio_.sum())

In [19]:
categorical_features = ["countryName",	"eprtrSectorName",	"EPRTRAnnexIMainActivityLabel",	"reportingYear",	"MONTH"]
x_train_feat = df_training[categorical_features]

In [183]:
X_train, X_test, y_train, y_test = train_test_split(df_training,y,test_size=0.05, random_state=seed)

In [22]:
col_trans = make_column_transformer(
                        (OneHotEncoder(),categorical_features),
                        remainder = "passthrough"
                        )

In [82]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
              min_samples_leaf=50,
              n_estimators=150,
              bootstrap=True,
              oob_score=True,
              n_jobs=-1,
              max_features='auto')

ann = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(128, 64), random_state=1)

ada = AdaBoostClassifier(n_estimators=100)



In [83]:
from sklearn.pipeline import make_pipeline

rf_pipe = make_pipeline(col_trans, rf)
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.75      0.63      5169
           1       0.55      0.34      0.42      4618
           2       0.92      0.82      0.87      3339

    accuracy                           0.63     13126
   macro avg       0.67      0.64      0.64     13126
weighted avg       0.64      0.63      0.62     13126



In [84]:
ann_pipe = make_pipeline(col_trans, ann)
ann_pipe.fit(X_train, y_train)
y_pred = ann_pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.81      0.62      5169
           1       0.46      0.17      0.25      4618
           2       0.93      0.82      0.87      3339

    accuracy                           0.59     13126
   macro avg       0.63      0.60      0.58     13126
weighted avg       0.59      0.59      0.55     13126



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [85]:
ada_pipe = make_pipeline(col_trans, ada)
ada_pipe.fit(X_train, y_train)
y_pred = ada_pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.66      0.59      5169
           1       0.53      0.43      0.47      4618
           2       0.91      0.83      0.87      3339

    accuracy                           0.62     13126
   macro avg       0.66      0.64      0.64     13126
weighted avg       0.63      0.62      0.62     13126



# Tensorflow implementation

In [184]:
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

In [185]:
batch_size = 64

In [186]:
train_ds = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))
test_ds = tf.data.Dataset.from_tensor_slices((dict(X_test), y_test))

train_ds = train_ds.shuffle(buffer_size=len(y_train))
test_ds = test_ds.shuffle(buffer_size=len(y_test))

train_ds = train_ds.batch(batch_size)
test_ds = test_ds.batch(batch_size)

In [187]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of targets:', label_batch )


Every feature: ['countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel', 'reportingYear', 'MONTH', 'max_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_temp', 'avg_temp', 'min_temp', 'DAY_WITH_FOGS', 'CITY_ID']
A batch of targets: tf.Tensor(
[0 2 1 2 1 1 0 0 1 0 2 0 2 0 2 0 1 1 2 0 0 0 1 2 1 0 0 1 0 0 0 2 1 1 1 0 1
 0 2 0 1 0 2 0 1 1 1 2 1 1 1 0 0 2 0 0 1 0 0 0 0 1 1 1], shape=(64,), dtype=int64)


In [188]:
feature_columns = []

#Define feature columns

#numerical
for attrib in ['DAY_WITH_FOGS','max_wind_speed', 'avg_wind_speed', 'min_wind_speed', 'max_temp', 'avg_temp', 'min_temp']:
  feature_columns.append(feature_column.numeric_column(attrib))


# One hot. Only 9 values
sector_name = feature_column.categorical_column_with_vocabulary_list(
      'eprtrSectorName', df_training['eprtrSectorName'].unique())
sector_name_one_hot = feature_column.indicator_column(sector_name)
feature_columns.append(sector_name_one_hot)

# One hot year and month, also cross it
year = feature_column.categorical_column_with_vocabulary_list(
      'reportingYear', df_training['reportingYear'].unique())
year_one_hot = feature_column.indicator_column(year)
feature_columns.append(year_one_hot)

month = feature_column.categorical_column_with_vocabulary_list(
      'MONTH', df_training['MONTH'].unique())
month_one_hot = feature_column.indicator_column(month)
feature_columns.append(month_one_hot)

# embeding countries 
countries = feature_column.categorical_column_with_vocabulary_list(
      'countryName', df_training['countryName'].unique())
countries_embedding = feature_column.embedding_column(countries, dimension=8)
feature_columns.append(countries_embedding)

# Embeding citys
city = feature_column.categorical_column_with_vocabulary_list(
      'CITY_ID', df_training['CITY_ID'].unique())
city_embedding = feature_column.embedding_column(city, dimension=8)
feature_columns.append(city_embedding)


# Embeding EPRTRAnnexIMainActivityLabel
EPRTR = feature_column.categorical_column_with_vocabulary_list(
      'EPRTRAnnexIMainActivityLabel', df_training['EPRTRAnnexIMainActivityLabel'].unique())
EPRTR_embedding = feature_column.embedding_column(EPRTR, dimension=32)
feature_columns.append(EPRTR_embedding)

In [189]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [190]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(1024, activation='relu'),
  layers.Dense(512, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(3)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [191]:
model.fit(train_ds, validation_data=test_ds, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


KeyboardInterrupt: ignored

In [192]:
y_pred = np.argmax(model.predict(dict(X_test)),axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.64      0.63      1305
           1       0.57      0.53      0.55      1143
           2       0.88      0.91      0.89       834

    accuracy                           0.67      3282
   macro avg       0.69      0.69      0.69      3282
weighted avg       0.67      0.67      0.67      3282



In [182]:
y_pred = np.argmax(model.predict(dict(X_test)),axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.62      0.63      5169
           1       0.60      0.60      0.60      4618
           2       0.90      0.91      0.90      3339

    accuracy                           0.69     13126
   macro avg       0.71      0.71      0.71     13126
weighted avg       0.69      0.69      0.69     13126



# Predictions

In [193]:
test_data = pd.read_csv('raw_train/csv/train1.csv')

In [194]:
test_data = curate_dataframe(test_data)

Atrribute not present
Atrribute not present
Atrribute not present


In [195]:
final_preds = np.argmax(model.predict(dict(test_data)),axis=1)



In [196]:
final_preds.shape

(18563,)

In [200]:
final_preds_dict = {'test_index':np.arange(len(final_preds)), 'pollutant':final_preds}
final_preds_df = pd.DataFrame(data=final_preds_dict)


In [203]:
final_preds_df.to_csv('predictions.csv')
final_preds_df.to_json('predictions.sjon')

In [201]:
final_preds_df

Unnamed: 0,test_index,pollutant
0,0,1
1,1,1
2,2,2
3,3,0
4,4,2
...,...,...
18558,18558,1
18559,18559,0
18560,18560,1
18561,18561,1
