<a href="https://colab.research.google.com/github/leobioinf0/SEHackathon2022/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, euclidean_distances
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, recall_score, plot_confusion_matrix
from sklearn.metrics import precision_score, f1_score, classification_report, balanced_accuracy_score

In [2]:
def parse_values(x):
    """Map a pollutant label to its integer class code.

    "Nitrogen oxides (NOX)" maps to 0, "Carbon dioxide (CO2)" maps to 1 and
    every other value maps to 2.
    """
    known_labels = ("Nitrogen oxides (NOX)", "Carbon dioxide (CO2)")
    for code, label in enumerate(known_labels):
        if x == label:
            return code
    # Anything that is not one of the two labels above falls into class 2.
    return 2
def analize_models(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model``, score it on the test split, print and return the metrics.

    The model is fitted on ``X_train``/``y_train`` and then used to predict
    ``X_test``. Accuracy, balanced accuracy and the macro-averaged F1,
    precision and recall are each computed once, printed, and returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place, so the caller's object is trained after this call.
    name : label stored (as ``str(name)``) under the ``'model'`` key.
    X_train, y_train : training features and targets passed to ``model.fit``.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    dict
        Keys ``'model'``, ``'Accuracy'``, ``'B-Accuracy'``, ``'F1 Score'``,
        ``'Precision'`` and ``'Recall'``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute every metric exactly once; the report below reuses these values
    # instead of re-scoring the predictions a second time.
    scores = {'model': str(name),
              'Accuracy': accuracy_score(y_test, y_pred),
              'B-Accuracy': balanced_accuracy_score(y_test, y_pred),
              'F1 Score': f1_score(y_test, y_pred, average='macro'),
              'Precision': precision_score(y_test, y_pred, average='macro'),
              'Recall': recall_score(y_test, y_pred, average='macro')}

    print(model)
    print('')
    print(f'Accuracy : {scores["Accuracy"]}')
    print(f'B-Accuracy :{scores["B-Accuracy"]}')
    print(f'F1 Score : {scores["F1 Score"]}')
    print(f'Precision : {scores["Precision"]}')
    print(f'Recall : {scores["Recall"]}')
    print('')
    return scores

In [3]:
# Competition test features; the CSV's "test_index" column becomes the index.
test_x = pd.read_csv(
    'https://challenges-asset-files.s3.us-east-2.amazonaws.com/data_sets/Data-Science/4+-+events/SchneiderElectricES22/final/test_x.csv',
    index_col="test_index",
)

In [4]:
# CSV part of the training data: train1 is comma-separated, train2 uses ';'.
# NOTE(review): both paths point under /content/drive — presumably a mounted
# Google Drive in Colab; no mount step is visible in this notebook.
df_1 = pd.read_csv('/content/drive/MyDrive/02-Schneider-Electric-Hack/Data-Science-SE/data/train1.csv')
df_2 = pd.read_csv(
    '/content/drive/MyDrive/02-Schneider-Electric-Hack/Data-Science-SE/data/train2.csv',
    sep=";",
).drop_duplicates()  # only train2 is de-duplicated

# Stack both files on a fresh 0..n-1 index, then keep the feature columns
# plus the "pollutant" target.
df_csv = pd.concat([df_1, df_2], ignore_index=True).loc[:, [
    "countryName", "eprtrSectorName", "EPRTRAnnexIMainActivityLabel",
    "reportingYear", "MONTH", "DAY",
    "max_wind_speed", "avg_wind_speed", "min_wind_speed",
    "max_temp", "avg_temp", "min_temp",
    "pollutant",
]].copy()

In [5]:
# JSON part of the training data, served by the hackathon API in three pieces.
API_BASE_URL = "http://schneiderapihack-env.eba-3ais9akk.us-east-2.elasticbeanstalk.com"


def fetch_json_frame(endpoint, timeout=30):
    """Download one JSON endpoint of the API and return it as a DataFrame.

    Parameters
    ----------
    endpoint : path segment appended to ``API_BASE_URL`` (e.g. ``"first"``).
    timeout : seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of trying to
        parse an error page as data.
    """
    response = requests.get(f"{API_BASE_URL}/{endpoint}", timeout=timeout)
    response.raise_for_status()
    return pd.DataFrame(response.json())


df_3, df_4, df_5 = (fetch_json_frame(endpoint) for endpoint in ("first", "second", "third"))

# Stack the three pieces and drop the column whose name is the empty string.
df_json = pd.concat([df_3, df_4, df_5]).drop(columns=[''])

# Cast the API columns to the dtypes the same columns have in test_x:
# integer columns first, then float64, then object.
for dtype_selector, target_dtype in (("int", "int64"), ("float64", "float64"), ("object", "object")):
    matching_cols = test_x.select_dtypes(dtype_selector).columns
    df_json[matching_cols] = df_json[matching_cols].astype(target_dtype)

# Keep the feature columns plus the "pollutant" target, in a fixed order.
df_json = df_json[["countryName", "eprtrSectorName", "EPRTRAnnexIMainActivityLabel", "reportingYear", "MONTH", "DAY", "max_wind_speed", "avg_wind_speed", "min_wind_speed", "max_temp", "avg_temp", "min_temp", "pollutant"]].copy()

In [6]:
# Full training set: the CSV rows followed by the API rows. The row labels of
# the two inputs are kept as they are (no re-indexing).
df = pd.concat(objs=[df_csv, df_json], axis=0)

In [7]:
# Replace the pollutant names with the integer class codes from parse_values.
# Not re-runnable: on a second pass the values are already integers, which
# parse_values sends to its fallback code 2.
df['pollutant'] = df['pollutant'].map(parse_values)

In [8]:
# Ordinal-encode every object-dtype column of the training frame. The fitted
# encoder is kept so the same mapping can be applied to the test features.
encoder = OrdinalEncoder()
categorical_cols = df.select_dtypes('object').columns
df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

In [9]:
# Features: every numeric column except the target. Target: the pollutant code.
X = df.select_dtypes('number').drop(columns=["pollutant"])
y = df.loc[:, 'pollutant']

In [10]:
# Standardize the feature columns; the fitted scaler is reused on the test set.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so the held-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Classifier Models

In [12]:
# Candidate classifiers, otherwise left at their library defaults.
# RandomForest and DecisionTree are randomized learners, so they are seeded
# (same seed as the train/test split) to make reruns produce the same scores
# and the same final predictions.
XGBClass = XGBClassifier()
RandomForest = RandomForestClassifier(random_state=0)
LogReg = LogisticRegression()
DecissionTree = DecisionTreeClassifier(random_state=0)

### XGBClassifier

In [13]:
# Fit and score the XGBoost model; the label becomes its row name in the summary.
XGBClass_Score = analize_models(
    model=XGBClass,
    name='XGBClassificier',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

XGBClassifier(objective='multi:softprob')

Accuracy : 0.6078732519602296
B-Accuracy :0.6249620236170351
F1 Score : 0.6271084162108683
Precision : 0.6568996647889961
Recall : 0.6249620236170351



### RandomForestClassifier

In [14]:
# Fit and score the random forest. This fitted model is the one used later to
# predict the test set.
RandomForest_Score = analize_models(
    model=RandomForest,
    name='RandomForestClassifier',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

RandomForestClassifier()

Accuracy : 0.6529787406030232
B-Accuracy :0.6717954147104388
F1 Score : 0.6786900226095121
Precision : 0.6900481655447329
Recall : 0.6717954147104388



### LogisticRegression

In [15]:
# Fit and score the logistic-regression baseline.
LogReg_Score = analize_models(
    model=LogReg,
    name='LogisticRegression',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

LogisticRegression()

Accuracy : 0.5231590008891763
B-Accuracy :0.5270889299984365
F1 Score : 0.4445045373443099
Precision : 0.5465506689309279
Recall : 0.5270889299984365



### Decision Tree

In [16]:
# Fit and score the single decision tree.
DecissionTree_Score = analize_models(
    model=DecissionTree,
    name='DecisionTreeClassifier',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

DecisionTreeClassifier()

Accuracy : 0.6178158596718131
B-Accuracy :0.6401160750916193
F1 Score : 0.6401725531403168
Precision : 0.6402802154522238
Recall : 0.6401160750916193



In [17]:
# One row per model, indexed by the model label, for side-by-side comparison.
model_scores = [XGBClass_Score, RandomForest_Score, LogReg_Score, DecissionTree_Score]
scores = pd.DataFrame(model_scores).set_index('model')
scores

Unnamed: 0_level_0,Accuracy,B-Accuracy,F1 Score,Precision,Recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassificier,0.607873,0.624962,0.627108,0.6569,0.624962
RandomForestClassifier,0.652979,0.671795,0.67869,0.690048,0.671795
LogisticRegression,0.523159,0.527089,0.444505,0.546551,0.527089
DecisionTreeClassifier,0.617816,0.640116,0.640173,0.64028,0.640116


### Predictions on the test set with RandomForestClassifier

In [18]:
# Same feature columns, in the same order, as the training frames (no target).
test = test_x.loc[:, [
    'countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel',
    'reportingYear', 'MONTH', 'DAY',
    'max_wind_speed', 'avg_wind_speed', 'min_wind_speed',
    'max_temp', 'avg_temp', 'min_temp',
]].copy()

In [19]:
# Encode the three string columns with the encoder fitted on the training data.
# NOTE(review): the encoder was created with default settings, which presumably
# reject categories that were not seen during fitting — confirm for this test set.
test_categorical_cols = ['countryName', 'eprtrSectorName', 'EPRTRAnnexIMainActivityLabel']
test[test_categorical_cols] = encoder.transform(test[test_categorical_cols])

In [20]:
# Apply the scaler fitted on the training features to the test features.
# `test` is rebound to the scaler's output, so re-running this cell alone
# would scale the already-scaled values a second time.
test = scaler.transform(test)

In [21]:
# Predict a class code for every test row with the random forest fitted above
# and store the result next to the original test features.
predicted_classes = RandomForest.predict(test)
test_x["pollutant"] = predicted_classes

In [22]:
# Write the predictions to CSV: the frame's index plus the "pollutant" column only.
test_x.to_csv("./predictions.csv", columns=["pollutant"])

In [23]:
# Turn the "test_index" index back into a regular column so it can be exported.
test_x = test_x.reset_index()

In [24]:
# Export the index/prediction pairs as JSON using pandas' default layout.
predictions = test_x[["test_index", "pollutant"]]
predictions.to_json("./predictions.json")