In [1]:

#libraries
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif 

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
!pip install geopy



In [3]:
num_data_points = 100  # Number of data points to generate
#min_latitude, max_latitude = 37.0, 40.0
#min_longitude, max_longitude = -120.0, -117.0

min_latitude, max_latitude = 37.7, 38.0
min_longitude, max_longitude = -119.0, -118.5

min_distance, max_distance = 1.0, 50.0
availability_values = [True, False]
blood_types = ["O+","O-","A+","A-","B+","B-","AB+","AB-"]

    



In [4]:
hospital_names = [f"Hospital {i}" for i in range(num_data_points)]
hospital_latitudes = np.random.uniform(min_latitude, max_latitude, num_data_points)
hospital_longitudes = np.random.uniform(min_longitude, max_longitude, num_data_points)
volunteer_latitudes = np.random.uniform(min_latitude, max_latitude, num_data_points)
volunteer_longitudes = np.random.uniform(min_longitude, max_longitude, num_data_points)


In [5]:
def calculate_distance(lat1, lon1, lat2, lon2):
    return np.sqrt((lat1 - lat2) ** 2 + (lon1 - lon2) ** 2)

distances = calculate_distance(hospital_latitudes, hospital_longitudes, volunteer_latitudes, volunteer_longitudes)


In [6]:
from geopy.distance import geodesic

def calculate_distance_km(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).kilometers

distances_km = [calculate_distance_km(h_lat, h_lon, v_lat, v_lon)
             for h_lat, h_lon, v_lat, v_lon in zip(hospital_latitudes, hospital_longitudes, volunteer_latitudes, volunteer_longitudes)]

In [7]:
availability = np.random.choice(availability_values, num_data_points)
requested_blood_type = np.random.choice(blood_types, num_data_points)
volunteer_blood_type = np.random.choice(blood_types, num_data_points)


In [8]:
distances_km

[27.049670584972045,
 26.903983200625717,
 4.675794345076006,
 12.798486756465962,
 9.316687375226431,
 8.135037234972511,
 6.626115727023246,
 12.736419111999199,
 10.165021366025156,
 12.709176073737101,
 8.20441755553515,
 17.557149071988075,
 26.911382276129128,
 36.682723660300994,
 21.23828765110558,
 10.180038197549768,
 30.917573343622696,
 14.674283159892076,
 17.56561138588825,
 29.192473873997095,
 38.260107705296775,
 20.146486745952227,
 22.92066252584084,
 35.01313587909207,
 24.86194214780623,
 24.780792723941552,
 3.4189457609702254,
 16.087974108068984,
 13.499549804017429,
 20.846957995668173,
 11.346068306199477,
 22.900913656124867,
 3.7046699845667312,
 21.33045147736467,
 28.021816500382428,
 18.323872741700963,
 19.58558256431866,
 13.712664048869724,
 15.457142747199873,
 22.93827206563567,
 11.739702197237975,
 21.3186330817464,
 36.04229403545724,
 13.114273546609057,
 12.068806145320856,
 41.72874017854423,
 29.812547697131407,
 28.890800727481874,
 30.642773

In [9]:
data = pd.DataFrame({
    "Hospital Name": hospital_names,
    "Hospital Latitude": hospital_latitudes,
    "Hospital Longitude": hospital_longitudes,
    "Volunteer Latitude": volunteer_latitudes,
    "Volunteer Longitude": volunteer_longitudes,
    "Distance": distances_km,
    "Availability": availability,
    "Requested Blood Type": requested_blood_type,
    "Volunteer Blood Type": volunteer_blood_type
})


In [10]:
# Real blood donation science compatibility rules
blood_compatibility = {
    "O+": ["O+", "A+", "B+", "AB+"],
    "O-": ["O+", "O-"],
    "A+": ["A+", "AB+"],
    "A-": ["A-", "A+", "AB-", "AB+"],
    "B+": ["B+", "AB+"],
    "B-": ["B-", "B+", "AB-", "AB+"],
    "AB+": ["AB+"],
    "AB-": ["AB-", "AB+"]
}

# Label Data based on real blood donation science
def label_match(distance, requested_blood_type, volunteer_blood_type):
    max_distance_threshold = 10.0
    required_blood_type = requested_blood_type

    is_distance_match = distance <= max_distance_threshold
    is_blood_type_match = required_blood_type in blood_compatibility[volunteer_blood_type]

    return is_distance_match and is_blood_type_match

data["Matched"] = data.apply(lambda row: label_match(row["Distance"], row["Requested Blood Type"], row["Volunteer Blood Type"]), axis=1)

# Save the updated dataset to a new CSV file
#data.to_csv("matching_data_with_real_blood_science.csv", index=False)


In [11]:
data

Unnamed: 0,Hospital Name,Hospital Latitude,Hospital Longitude,Volunteer Latitude,Volunteer Longitude,Distance,Availability,Requested Blood Type,Volunteer Blood Type,Matched
0,Hospital 0,37.706757,-118.566327,37.839929,-118.823461,27.049671,False,O+,B+,False
1,Hospital 1,37.965521,-118.649109,37.725626,-118.692874,26.903983,False,B-,AB+,False
2,Hospital 2,37.953196,-118.501887,37.936276,-118.550603,4.675794,False,B+,A-,False
3,Hospital 3,37.875066,-118.860083,37.796243,-118.966199,12.798487,True,B-,O-,False
4,Hospital 4,37.714045,-118.779992,37.785205,-118.723920,9.316687,False,O+,A+,False
...,...,...,...,...,...,...,...,...,...,...
95,Hospital 95,37.720121,-118.566941,37.983920,-118.526841,29.491966,False,A+,AB-,False
96,Hospital 96,37.792491,-118.949844,37.888406,-118.806831,16.486417,False,O-,A-,False
97,Hospital 97,37.701282,-118.984650,37.843493,-118.845186,20.002975,False,AB+,B+,False
98,Hospital 98,37.868096,-118.679386,37.752980,-118.770339,15.079728,True,B+,O-,False


In [12]:
data['Matched'].value_counts()

False    95
True      5
Name: Matched, dtype: int64

In [13]:
#data.to_csv("synthetic_matching_data.csv", index=False)

In [14]:
# Load the synthetic dataset
#data = pd.read_csv("synthetic_matching_data.csv")

# Separate the data into positive (successful match) and negative (unsuccessful match) classes
positive_data = data[data["Matched"] == True]
negative_data = data[data["Matched"] == False]

# Upsample the positive class to match the size of the negative class
upsampled_positive_data = positive_data.sample(n=len(negative_data), replace=True, random_state=42)

# Concatenate the upsampled positive class with the original negative class to create the balanced dataset
balanced_data = pd.concat([upsampled_positive_data, negative_data], ignore_index=True)

# Shuffle the data to ensure a random order
balanced_data = balanced_data.sample(frac=1, random_state=42)

# Save the balanced dataset to a new CSV file
balanced_data.to_csv("balanced_matching_data.csv", index=False)


In [15]:
balanced_data

Unnamed: 0,Hospital Name,Hospital Latitude,Hospital Longitude,Volunteer Latitude,Volunteer Longitude,Distance,Availability,Requested Blood Type,Volunteer Blood Type,Matched
175,Hospital 85,37.792820,-118.798567,37.768172,-118.506726,25.854438,False,B-,O+,False
180,Hospital 90,37.766829,-118.917897,37.948154,-118.945556,20.272540,False,B-,AB-,False
111,Hospital 17,37.799405,-118.668198,37.780801,-118.833138,14.674283,False,B-,O+,False
65,Hospital 73,37.964944,-118.713483,37.974023,-118.736082,2.226852,False,AB+,O+,True
101,Hospital 7,37.761657,-118.731421,37.728379,-118.869721,12.736419,True,AB+,A-,False
...,...,...,...,...,...,...,...,...,...,...
106,Hospital 12,37.870575,-118.801623,37.769096,-118.524033,26.911382,True,AB+,AB+,False
14,Hospital 73,37.964944,-118.713483,37.974023,-118.736082,2.226852,False,AB+,O+,True
92,Hospital 64,37.812726,-118.922886,37.804486,-118.815672,9.485494,True,AB+,A-,True
179,Hospital 89,37.712671,-118.895851,37.808123,-118.910155,10.669199,False,AB-,AB+,False


In [16]:
balanced_data['Matched'].value_counts()

False    95
True     95
Name: Matched, dtype: int64

In [17]:
balanced_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190 entries, 175 to 102
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Hospital Name         190 non-null    object 
 1   Hospital Latitude     190 non-null    float64
 2   Hospital Longitude    190 non-null    float64
 3   Volunteer Latitude    190 non-null    float64
 4   Volunteer Longitude   190 non-null    float64
 5   Distance              190 non-null    float64
 6   Availability          190 non-null    bool   
 7   Requested Blood Type  190 non-null    object 
 8   Volunteer Blood Type  190 non-null    object 
 9   Matched               190 non-null    bool   
dtypes: bool(2), float64(5), object(3)
memory usage: 13.7+ KB


In [18]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Identify categorical columns
categorical_columns = balanced_data.select_dtypes(include=['object','bool']).columns

# Apply Label Encoding to categorical columns
label_encoder = LabelEncoder()
encoded_data = balanced_data.copy()
for column in categorical_columns:
    encoded_data[column] = label_encoder.fit_transform(balanced_data[column])

In [19]:
encoded_data.drop('Hospital Name',axis=1,inplace=True)

In [20]:
encoded_data

Unnamed: 0,Hospital Latitude,Hospital Longitude,Volunteer Latitude,Volunteer Longitude,Distance,Availability,Requested Blood Type,Volunteer Blood Type,Matched
175,37.792820,-118.798567,37.768172,-118.506726,25.854438,0,5,6,0
180,37.766829,-118.917897,37.948154,-118.945556,20.272540,0,5,3,0
111,37.799405,-118.668198,37.780801,-118.833138,14.674283,0,5,6,0
65,37.964944,-118.713483,37.974023,-118.736082,2.226852,0,2,6,1
101,37.761657,-118.731421,37.728379,-118.869721,12.736419,1,2,1,0
...,...,...,...,...,...,...,...,...,...
106,37.870575,-118.801623,37.769096,-118.524033,26.911382,1,2,2,0
14,37.964944,-118.713483,37.974023,-118.736082,2.226852,0,2,6,1
92,37.812726,-118.922886,37.804486,-118.815672,9.485494,1,2,1,1
179,37.712671,-118.895851,37.808123,-118.910155,10.669199,0,3,2,0


In [21]:
X = encoded_data.drop('Matched', axis=1)  # Replace 'target_variable' with the actual name of the target column
y = encoded_data['Matched']  # Replace 'target_variable' with the actual name of the target column


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 42)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

lreg = LogisticRegression()
lreg.fit(X_train,y_train) 

#Confusion Matrix
y_pred = lreg.predict(X_test)
conf_mat = confusion_matrix(y_test,y_pred)

print(conf_mat)

[[31  1]
 [ 0 32]]


In [24]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        32
           1       0.97      1.00      0.98        32

    accuracy                           0.98        64
   macro avg       0.98      0.98      0.98        64
weighted avg       0.98      0.98      0.98        64



In [25]:
y_train_pred = lreg.predict(X_train)

# Calculate and print the train accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)

Train Accuracy: 0.9682539682539683


In [26]:
print(metrics.classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        63
           1       0.94      1.00      0.97        63

    accuracy                           0.97       126
   macro avg       0.97      0.97      0.97       126
weighted avg       0.97      0.97      0.97       126



In [27]:
import joblib
joblibfile='Matching_algo'
joblib.dump(lreg,joblibfile)

['Matching_algo']

In [28]:
loaded_model = joblib.load(open("Matching_algo", 'rb'))

In [29]:
P_L = loaded_model.predict(X_test) #making sure the loaded model is the same as we trained above

In [30]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,P_L))

[[31  1]
 [ 0 32]]


In [31]:
rec=[37.720475,-118.706756,37.919783,-118.914674,28.714059,1,6,7]

In [32]:
rec=[rec]

In [33]:
loaded_model.predict(rec)[0]  #testing the model for one final time

0