In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. The URI and user name fall back to the local
# defaults used previously; the password has no default.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# This is whatever password you set when you created your database.
# If NEO4J_PASSWORD is not set, prompt for it instead of hardcoding it here.
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
# Query text sent to Neo4j: every node `n`, plus (optionally) each outgoing
# relationship `r` and the node `m` it points to.
cypher_query = (
    "MATCH (n) \n"
    "OPTIONAL MATCH (n)-[r]->(m) \n"
    "RETURN n, r, m"
)

In [None]:
# Run the query and flatten each returned record into one dict: the node's
# properties merged with the properties of its relationship (when present).
# On a key collision the relationship's value wins, because it is unpacked last.
with driver.session() as session:
    result = session.run(cypher_query)

    # The result has to be consumed while the session is still open.
    data = [
        {
            **dict(rec['n'].items()),
            **(dict(rec['r'].items()) if rec['r'] is not None else {}),
        }
        for rec in result
    ]

# One row per returned record; columns are the union of all property keys.
df = pd.DataFrame(data)

In [None]:
# Columns carried forward from the raw frame; rows in which every one of these
# columns is missing are discarded.
selected_columns = [
    'commodity',
    'eventID',
    'gtin',
    'sgln',
    'cteDate',
    'pgln',
    'contaminated',
    'shortDescription',
]
df2 = df.loc[:, selected_columns].dropna(how='all')

In [None]:
# Express cteDate as a float count of whole days since 1970-01-01.
reference_date = pd.to_datetime('1970-01-01')
days_since_reference = (pd.to_datetime(df2['cteDate']) - reference_date).dt.days
df2['cteDate'] = days_since_reference.astype(float)

# Where commodity is missing, use the row's shortDescription instead.
commodity_present = df2['commodity'].notna()
df2['commodity'] = df2['commodity'].where(commodity_present, df2['shortDescription'])

In [None]:
# shortDescription is no longer needed as a separate column from here on.
df2 = df2.drop(columns=['shortDescription'])

In [None]:
# Identifier columns arrive as strings containing dots; strip the dots and
# convert the remainder to numbers.
# regex=False is required: with regex matching (the default before pandas 2.0)
# '.' is a wildcard that would delete every character, not just literal dots.
# parentID was previously commented out; add it here only if that column is
# also selected into df2.
identifier_columns = ['eventID', 'gtin', 'sgln', 'pgln']
for column in identifier_columns:
    df2[column] = pd.to_numeric(df2[column].str.replace('.', '', regex=False))

In [None]:
# Define the columns to be one-hot encoded
columns_to_encode = ['commodity']

# Create the ColumnTransformer; every column not listed above is passed
# through unchanged.
transformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), columns_to_encode)],
    remainder='passthrough',
)

# Apply the transformation
df_encoded = transformer.fit_transform(df2)

# ColumnTransformer returns a sparse matrix only when the overall density is
# below its sparse_threshold; otherwise it returns a dense ndarray, which has
# no .toarray(). Handle both so this cell does not depend on how many
# categories the data happens to contain.
dense_values = df_encoded.toarray() if hasattr(df_encoded, 'toarray') else df_encoded

# Convert the encoded data to a DataFrame. The explicit float cast keeps the
# dense path consistent with the sparse path, which is already numeric.
df3 = pd.DataFrame(
    np.asarray(dense_values, dtype=float),
    columns=transformer.get_feature_names_out(df2.columns),
)

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
target_column = 'remainder__contaminated'
y = df3[target_column]
X = df3.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base estimator whose hyper-parameters are tuned below.
lr = LogisticRegression()

# Candidate values: 100 weights for class 0 (class 1 gets the complement) and
# six regularization strengths (C).
weights = np.linspace(0.0, 0.99, 100)
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Build the search grid: one class_weight dict per candidate weight.
class_weight_options = []
for w in weights:
    class_weight_options.append({0: w, 1: 1.0 - w})
param_grid = {'class_weight': class_weight_options, 'C': C_values}

# Exhaustive search over the grid on the training data, scored by ROC AUC with
# stratified folds (StratifiedKFold defaults), using all available cores.
search_settings = dict(
    estimator=lr,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring='roc_auc',
    verbose=2,
)
gridsearch = GridSearchCV(**search_settings).fit(X_train, y_train)


In [None]:
# Pull the winning hyper-parameters out of the finished grid search.
best_params = gridsearch.best_params_
best_class_weight, best_C = best_params['class_weight'], best_params['C']

# Build a fresh logistic regression configured with those values ...
best_model = LogisticRegression(C=best_C, class_weight=best_class_weight)

# ... and fit it on the full training split.
best_model.fit(X_train, y_train)

In [None]:
# Hard class predictions on the test data (kept for inspection / other metrics).
y_pred = best_model.predict(X_test)

# ROC AUC needs a continuous score per sample, not 0/1 labels: with hard
# labels the curve collapses to a single operating point. Use the predicted
# probability of the positive class (second column, i.e. classes_[1]), which
# also matches how the 'roc_auc' scorer evaluated candidates in the grid search.
y_score = best_model.predict_proba(X_test)[:, 1]

# Calculate the AUC score
auc_score = roc_auc_score(y_test, y_score)

print("AUC:", auc_score)

In [None]:
# Column names of the feature frame, in the same order as the model coefficients.
feature_names = list(X.columns)

# Pair each feature with its fitted coefficient, add the coefficient's
# magnitude, and order the table from largest to smallest magnitude.
# NOTE(review): magnitudes are only comparable across features that share a
# scale - confirm before reading this as an importance ranking.
coeff_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': best_model.coef_[0]})
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

# Show the ten features with the largest absolute coefficients.
print(coeff_df.head(10))