In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb

from imblearn.over_sampling import SMOTE

from scipy.stats import chi2_contingency

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
xl = pd.ExcelFile('./2023-04-13_KitaTable_Final.xlsx')

In [4]:
projects = xl.parse('Projects')
issuances = xl.parse('Issuances')
retirements = xl.parse('Credits')
buffer = xl.parse('Buffer')
countries = xl.parse('Countries')
methodologies = xl.parse('Methodologies')

In [6]:
def credits_to_vintages(df, credit_type):
    grouped = df.groupby(['RegistryID', 'Vintage']) \
                .agg(credit = ('Amount', 'sum')) \
                .rename({'credit': credit_type}, axis=1) \
                .reset_index()
    grouped['VintageNum'] = grouped.groupby('RegistryID').cumcount() + 1
    
    pivot = pd.pivot(grouped, index = 'RegistryID', columns = 'VintageNum', values = credit_type)
    
    pivot_sum = pivot.sum(axis=1)
    vintage_pivot = pivot \
                        .loc[:, 1:3] \
                        .rename({
                            1: f'{credit_type}_FirstVintage',
                            2: f'{credit_type}_SecondVintage',
                            3: f'{credit_type}_ThirdVintage',
                            }, axis=1)
    
    vintage_pivot[f'Total{credit_type}'] = pivot_sum
    
    return vintage_pivot

In [7]:
issued_vintage = credits_to_vintages(issuances, 'IssuedCredits')
retired_vintage = credits_to_vintages(retirements, 'RetiredCredits')
buffer_grouped = buffer.groupby('RegistryID').agg(Buffer = ('Amount', 'sum'))

In [8]:
credits = pd.concat([issued_vintage, retired_vintage, buffer_grouped], axis=1).reset_index()

In [9]:
project_info = pd.merge(
    pd.merge(projects, countries, left_on = 'Country', right_on = 'CountryCodeA3', how='left'),
    methodologies,
    left_on = 'Methodology',
    right_on = 'MethodologyID',
    how='left',
    suffixes = ('_Projects', '_Methods')
)

In [10]:
merged = pd.merge(project_info, credits, on='RegistryID', how='left')

In [11]:
columns_epi = ['AMP.raw.2022', 'APR.raw.2020', 'BCA.raw.2019', 'BHV.raw.2020', 'BLC.raw.2019',
           'CDA.raw.2019', 'CDL.raw.2017', 'CDO.raw.2019', 'CH4.raw.2019', 'CHA.raw.2019', 
           'COE.raw.2019', 'CXN.raw.2019', 'EEZ.raw.2022', 'FGA.raw.2019', 'FOG.raw.2019', 
           'FSS.raw.2018', 'FTD.raw.2018', 'GDB.raw.2020', 'GDP.raw.2020', 'GHG.raw.2019', 
           'GHI.raw.2019', 'GHN.raw.2019', 'GHP.raw.2019', 'GIB.raw.2019', 'GL5.raw.2020', 
           'GPC.raw.2020', 'GRA.raw.2020', 'GRL.raw.2020', 'HAD.raw.2019', 'LCB.raw.2017', 
           'MPA.raw.2022', 'MSW.raw.2019', 'NDA.raw.2019', 'NOE.raw.2019', 'NOT.raw.2019', 
           'NOX.raw.2019', 'NXA.raw.2019', 'OCP.raw.2020', 'OZD.raw.2019', 'PAR.raw.2020', 
           'PBD.raw.2019', 'PDN.raw.2020', 'PMD.raw.2019', 'POP.raw.2020', 'PST.raw.2020', 
           'REC.raw.2020', 'RMS.raw.2018', 'SDA.raw.2019', 'SHI.raw.2014', 'SNM.raw.2015', 
           'SO2.raw.2019', 'SOE.raw.2019', 'SPI.raw.2021', 'SPU.raw.2020', 'TBG.raw.2022', 
           'TBN.raw.2022', 'TC5.raw.2020', 'TCA.raw.2000', 'TCC.raw.2020', 'TCL.raw.2020', 
           'USD.raw.2019', 'UWD.raw.2019', 'VOE.raw.2019', 'WL5.raw.2020', 'WST.raw.2019', 
           'WTA.raw.2020', 'WTL.raw.2020', 'WWT.raw.2020']

columns_wgi = ['WGI_VNA_2021', 'WGI_SNV_2021', 'WGI_GE_2021', 'WGI_RQ_2021', 'WGI_ROL_2021', 'WGI_COC_2021']

In [12]:
# model_df = pd.concat([
#     merged.loc[:, columns_epi],
#     merged.loc[:, columns_wgi],
#     merged.loc[:, ['EstAnnualReductions', 'Sector']],
#     merged.loc[:, 'IssuedCredits_FirstVintage':'Buffer'],
#     merged.loc[:, 'Status']
# ], axis=1)

In [13]:
# model_df = pd.concat([
#     merged.loc[:, ['EstAnnualReductions', 'Sector']],
#     merged.loc[:, 'AMP.raw.1990':'WGI_COC_2021'],
#     merged.loc[:, 'IssuedCredits_FirstVintage':'Buffer'],
#     merged.loc[:, 'Status']
# ], axis=1)

In [14]:
model_df = merged.loc[:, ['RetiredCredits_FirstVintage', 'Sector', 'EPI.new', 'EstAnnualReductions', 'Status']]

In [15]:
# model_df = pd.concat([
#     merged.loc[:, ['EstAnnualReductions', 'Sector']],
#     merged.loc[:, 'EPI.new'],
#     merged.loc[:, 'IssuedCredits_FirstVintage':'Buffer'],
#     merged.loc[:, 'Status']
# ], axis=1)

In [16]:
model_df['WGI'] = merged.loc[:, ['WGI_VNA_2021', 'WGI_SNV_2021', 'WGI_GE_2021', 
                                        'WGI_RQ_2021', 'WGI_ROL_2021', 'WGI_COC_2021']].mean(axis=1)

In [17]:
model_df.dropna(subset = ['EstAnnualReductions','Status'], inplace=True)

In [18]:
model_df['Status'] = model_df['Status'].apply(lambda x: 1 if x == 'Cancelled/Rejected' else 0)

In [19]:
model_df = pd.get_dummies(model_df, drop_first = True)

In [20]:
model_df = model_df.fillna(0)

In [21]:
model_df['Status'].value_counts()

0    18447
1      448
Name: Status, dtype: int64

In [194]:
# model_df.to_csv('./Boruta3.csv', index=False)

### Chi-Squared test

In [22]:
contingency = pd.crosstab(merged['Sector'], merged['Status'].apply(lambda x: 1 if x == 'Cancelled/Rejected' else 0))

In [23]:
# Chi-square test of independence. 
c, p, dof, expected = chi2_contingency(contingency) 
# Print the p-value
print(p)

1.7328192311115696e-05


# Train Test Split

In [24]:
model_df.head()

Unnamed: 0,RetiredCredits_FirstVintage,EPI.new,EstAnnualReductions,Status,WGI,Sector_Household Devices,Sector_Industrial Process,Sector_Nature Based Solution,Sector_Waste
2662,4889573.0,18.9,3000000.0,0,47.799341,0,1,0,0
2663,0.0,46.9,1400000.0,0,80.294846,0,1,0,0
2664,67303.0,43.6,670133.0,0,40.818064,0,0,0,1
2665,2210.0,36.5,37032.0,0,23.664143,0,0,0,0
2666,0.0,28.4,8411432.0,0,44.061164,0,1,0,0


In [59]:
X = model_df.dropna().drop('Status', axis=1).values
y = model_df.dropna().loc[:, 'Status'].values

In [60]:
sc = StandardScaler()

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=123)

In [62]:
X_train[:, :-4] = sc.fit_transform(X_train[:, :-4])
X_test[:, :-4] = sc.transform(X_test[:, :-4])

In [63]:
oversampling = SMOTE()
X_train, y_train = oversampling.fit_resample(X_train, y_train)

In [30]:
rf = RandomForestClassifier(n_estimators=300, random_state=123)
rf.fit(X_train, y_train)

In [31]:
y_pred = rf.predict(X_test)

In [32]:
sum(y_test == 1)

134

In [34]:
print(confusion_matrix(y_test, y_pred))

[[4618  917]
 [  71   63]]


In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.83      0.90      5535
           1       0.06      0.47      0.11       134

    accuracy                           0.83      5669
   macro avg       0.52      0.65      0.51      5669
weighted avg       0.96      0.83      0.88      5669



In [36]:
print(f'''
Precision: {precision_score(y_test, y_pred)}
Recall: {recall_score(y_test, y_pred)}
Area Under Curve (AUC): {roc_auc_score(y_test, y_pred)}
''')


Precision: 0.06428571428571428
Recall: 0.4701492537313433
Area Under Curve (AUC): 0.6522381318340547



## Support Vector Machine

In [266]:
svc = SVC()
svc.fit(X_train, y_train)

In [267]:
y_pred = svc.predict(X_test)

In [268]:
print(confusion_matrix(y_test, y_pred))

[[3570 1965]
 [  47   87]]


In [269]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.64      0.78      5535
           1       0.04      0.65      0.08       134

    accuracy                           0.65      5669
   macro avg       0.51      0.65      0.43      5669
weighted avg       0.96      0.65      0.76      5669



In [270]:
print(f'''
Precision: {precision_score(y_test, y_pred)}
Recall: {recall_score(y_test, y_pred)}
Area Under Curve (AUC): {roc_auc_score(y_test, y_pred)}
''')


Precision: 0.04239766081871345
Recall: 0.6492537313432836
Area Under Curve (AUC): 0.6471200906038911



# ANN

In [271]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [272]:
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(64, activation = 'relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [273]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [274]:
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [275]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test loss: {}, Test accuracy: {}'.format(test_loss, test_accuracy))

Test loss: 0.3810683786869049, Test accuracy: 0.8544716835021973


In [276]:
y_pred = model.predict(X_test)



In [277]:
y_pred = [1 if x > 0.5 else 0 for x in y_pred]

In [278]:
print(confusion_matrix(y_test, y_pred))

[[4783  752]
 [  73   61]]


In [279]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92      5535
           1       0.08      0.46      0.13       134

    accuracy                           0.85      5669
   macro avg       0.53      0.66      0.52      5669
weighted avg       0.96      0.85      0.90      5669



In [280]:
print(f'''
Precision: {precision_score(y_test, y_pred)}
Recall: {recall_score(y_test, y_pred)}
Area Under Curve (AUC): {roc_auc_score(y_test, y_pred)}
''')


Precision: 0.07503075030750307
Recall: 0.4552238805970149
Area Under Curve (AUC): 0.6596805943183811



In [283]:
confusion_matrix(y_test, y_pred)

array([[4783,  752],
       [  73,   61]])

## XGBoost

In [66]:
bst = xgb.XGBClassifier(n_estimators=250, max_depth=2, learning_rate=1, objective='binary:logistic')
bst.fit(X_train, y_train)

In [67]:
y_pred = bst.predict(X_test)

In [68]:
print(confusion_matrix(y_test, y_pred))

[[4712  823]
 [  63   71]]


In [69]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.85      0.91      5535
           1       0.08      0.53      0.14       134

    accuracy                           0.84      5669
   macro avg       0.53      0.69      0.53      5669
weighted avg       0.97      0.84      0.90      5669



In [65]:
#GridSearch
model = xgb.XGBClassifier()

n_estimators = range(50, 401, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
grid_search = GridSearchCV(model, param_grid, scoring = 'neg_log_loss', n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

# Summarize the result
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -0.309010 using {'n_estimators': 250}
-0.339892 (0.007701) with: {'n_estimators': 50}
-0.317818 (0.008171) with: {'n_estimators': 100}
-0.311842 (0.007483) with: {'n_estimators': 150}
-0.309957 (0.007989) with: {'n_estimators': 200}
-0.309010 (0.007978) with: {'n_estimators': 250}
-0.309433 (0.007982) with: {'n_estimators': 300}
-0.310427 (0.007860) with: {'n_estimators': 350}
-0.311618 (0.007945) with: {'n_estimators': 400}


## Logistic Regression

In [71]:
lgr = LogisticRegression()
lgr.fit(X_train, y_train)

In [72]:
y_pred = lgr.predict(X_test)

In [73]:
print(confusion_matrix(y_test, y_pred))

[[4833  702]
 [  79   55]]
