In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
import os

from neo4j import GraphDatabase

# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so credentials need not be edited into (or
# committed with) the notebook; the fallbacks are the original local defaults.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# The password is whatever you chose when you created your database.
password = os.environ.get("NEO4J_PASSWORD", "foodContamination")

# Driver object shared by the cells below to open sessions.
driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
# Cypher statement: match every node and, optionally, each relationship that
# leaves it together with the node on the other end. Returns n, r and m.
cypher_query = "\n".join([
    "MATCH (n) ",
    "OPTIONAL MATCH (n)-[r]->(m) ",
    "RETURN n, r, m",
])

In [None]:
def _record_to_row(record):
    """Flatten one query record into a single dict.

    The row starts from the properties of node ``n``; when the record also
    carries a relationship ``r``, its properties are laid on top, so a
    relationship property overrides a node property with the same key.
    """
    row = dict(record['n'].items())
    edge = record['r']
    if edge is not None:
        row.update(dict(edge.items()))
    return row


with driver.session() as session:
    result = session.run(cypher_query)

    # One flattened row per returned record
    data = [_record_to_row(record) for record in result]

    # Tabulate the rows; keys missing from a row become NaN in its columns
    df = pd.DataFrame(data)

In [None]:
# Columns carried forward from the raw graph export
selected_columns = [
    'commodity',
    'eventID',
    'gtin',
    'sgln',
    'cteDate',
    'pgln',
    'contaminated',
    'shortDescription',
]

# Keep only those columns, then discard rows in which every one of them is missing
df2 = df[selected_columns].dropna(how='all')

In [None]:
# Where commodity is missing, fall back to the value in shortDescription
df2['commodity'] = df2['commodity'].fillna(df2['shortDescription'])

# Re-express cteDate as a float: the whole-day component of the offset
# from 1970-01-01
reference_date = pd.to_datetime('1970-01-01')
cte_timestamps = pd.to_datetime(df2['cteDate'])
days_since_reference = (cte_timestamps - reference_date).dt.days
df2['cteDate'] = days_since_reference.astype(float)

In [None]:
# shortDescription is no longer needed as a column of its own
df2 = df2.drop(columns=['shortDescription'])

In [None]:
# Strip the '.' characters out of each identifier column and parse what is
# left as a number.
# regex=False makes '.' an explicit literal dot: the default for `regex`
# differs between pandas 1.x and 2.x, and as a regular expression '.' would
# match every character.
identifier_columns = ['eventID', 'gtin', 'sgln', 'pgln']
for column in identifier_columns:
    df2[column] = pd.to_numeric(df2[column].str.replace('.', '', regex=False))

In [None]:
# Define the columns to be one-hot encoded
columns_to_encode = ['commodity']

# One-hot encode the listed columns and pass every other column through unchanged
transformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), columns_to_encode)],
    remainder='passthrough',
)

# Apply the transformation
df_encoded = transformer.fit_transform(df2)

# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (density below its sparse_threshold); otherwise it already
# returns a dense array, which has no .toarray(). Convert only when needed so
# the cell works for both outcomes.
if hasattr(df_encoded, 'toarray'):
    encoded_dense = df_encoded.toarray()
else:
    encoded_dense = df_encoded

# Convert the encoded data to a DataFrame labelled with the transformer's output names
df3 = pd.DataFrame(encoded_dense, columns=transformer.get_feature_names_out(df2.columns))

In [None]:
# Label column as it is named in the encoded frame
target_column = 'remainder__contaminated'

# Integer labels, and every remaining column as a feature
y = df3[target_column].astype(int)
X = df3.drop(columns=[target_column])

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline classifier with library-default hyperparameters, fitted on the training split
model = xgb.XGBClassifier()
model.fit(X=X_train, y=y_train)

In [None]:
# Class-probability matrix for the test split; column 1 is the positive class
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

# Area under the ROC curve of the baseline model on the test split
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print("AUC:", auc)

In [None]:
# Retrieve the 'weight'-type importance scores from the fitted booster as a
# {feature name: score} mapping. No DMatrix is needed for this: the scores
# come straight from the trained model.
importance_scores = model.get_booster().get_score(importance_type='weight')

# Rank the features from highest to lowest score; features with equal scores
# keep the order in which the booster reported them.
ranked_importances = sorted(
    importance_scores.items(),
    key=lambda item: item[1],
    reverse=True,
)
sorted_features = [feature for feature, score in ranked_importances]
sorted_scores = [score for feature, score in ranked_importances]

# Plot the feature importance as a horizontal bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sorted_features, sorted_scores)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
ax.set_title('Variable Importance')
plt.show()

In [None]:

# Estimator whose hyperparameters are tuned below
xgb_model = xgb.XGBClassifier()

# Candidate values per hyperparameter; the search tries every combination
param_grid = dict(
    max_depth=[3, 6, 9],
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.3],
    reg_lambda=[0, 1, 10],
    reg_alpha=[0, 1, 10],
)

# Exhaustive search scored by ROC AUC under 5-fold stratified cross-validation
# (5 is StratifiedKFold's default, spelled out here)
cv_splitter = StratifiedKFold(n_splits=5)
gridsearch = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='roc_auc',
    verbose=2,
)
gridsearch = gridsearch.fit(X_train, y_train)


In [None]:
# Best estimator found by the grid search (refitted on the full training split)
best_xgb_model = gridsearch.best_estimator_

# Hard class predictions on the test data, kept for any label-based metrics
y_pred = best_xgb_model.predict(X_test)

# ROC AUC must be computed from continuous scores, not 0/1 labels: with hard
# labels the curve collapses to a single threshold and the value is not
# comparable to the baseline AUC or to the 'roc_auc' scores used during the
# search. Use the positive-class probability instead.
y_pred_proba_best = best_xgb_model.predict_proba(X_test)[:, 1]

# Compute AUC
auc = roc_auc_score(y_test, y_pred_proba_best)
print("AUC:", auc)