Making the models

In [1]:
import pandas as pd
import numpy as np
import vaex
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
data =pd.read_csv('Doceree-HCP_Train.csv', encoding='latin-1')
test =pd.read_csv('Doceree-HCP-Test.csv', encoding='latin-1')

In [3]:
# Assuming 'data' is pandas DataFrame containing string columns
label_encoder = LabelEncoder()

for column in data.columns:
    if data[column].dtype == 'object':
        # Encode the string data to numerical values
        data[column] = label_encoder.fit_transform(data[column])

# The Same thing for test_dataset
for column in test.columns:
    if test[column].dtype == 'object':
        test[column] = label_encoder.fit_transform(test[column])

In [4]:
vaex_data=vaex.from_pandas(data)
vaex_test=vaex.from_pandas(test)

In [5]:
vaex_data.export_hdf5('Doceree-HCP_Train.hdf5')
vaex_test.export_hdf5('Doceree-HCP-Test.hdf5')

In [6]:
# # imputing
# imputer = SimpleImputer(strategy='mean')
# # txo_imputer = SimpleImputer(strategy='mean')

In [7]:
vaex_df=vaex.open('Doceree-HCP_Train.hdf5')
# Iterate over each column in the DataFrame
for column in vaex_df.columns:
    # Fill missing values with 0 for the current column
    vaex_df[column].fillna(0, inplace=True)

In [8]:
vaex_df = vaex_df.shuffle(random_state=42)  # Shuffle the DataFrame with a fixed random state
df_train,df_test = vaex_df.ml.train_test_split(test_size=0.2)  # Split into train and test sets

# df_train.shape



In [9]:
features=['DEVICETYPE', 'PLATFORM_ID', 'BIDREQUESTIP', 'USERPLATFORMUID', 'USERCITY', 'USERZIPCODE', 'USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE', 'URL', 'KEYWORDS']
target='IS_HCP'
txo_features = ['DEVICETYPE', 'PLATFORM_ID', 'BIDREQUESTIP', 'USERPLATFORMUID', 'USERCITY', 'USERZIPCODE', 'USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE', 'URL', 'KEYWORDS']
txo_target='TAXONOMY'

In [10]:
# Create the RandomForestRegressor model
model = RandomForestClassifier(n_estimators=100)

In [11]:
# Fill missing values with a specific value (e.g., 0)
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [26]:
print((df_train))

#       ID      DEVICETYPE    PLATFORM_ID    BIDREQUESTIP    USERPLATFORMUID    USERCITY    USERZIPCODE    USERAGENT    PLATFORMTYPE    CHANNELTYPE    URL    KEYWORDS    TAXONOMY    IS_HCP
0       13340   0             7              12917           6723               3888        22560.0        2863         3               0              1933   2007        207         0.0
1       55559   0             2              4994            34235              2718        10010.0        2852         3               0              817    575         207         0.0
2       82193   2             2              11467           1127               642         17202.0        2210         3               0              4174   404         149         1.0
3       18089   1             2              1490            27233              24          44307.0        3965         3               0              1074   80          149         1.0
4       68921   1             2              24535           15118 

In [36]:
# Extract the features and target arrays from the DataFrames
X_train = df_train[features].values
# X_train.shapes
# print(X_train)
y_train = df_train[target].values
y_train = y_train.ravel()
txo_train = y_train #to use in futuer

X_test = df_test[features].values
y_test = df_test[target].values.astype(np.int64)
y_test = y_test.ravel()
txo_test = y_test #to use in futuer

print(X_train.shape,y_train.shape)

(91150, 11) (91150,)


In [37]:
print(y_test)

[1 1 0 ... 0 1 0]


In [13]:
# # impute NaN values
# X_train[np.isnan(X_train)] = 0

# y_train = pd.Series(y_train)
# # Fill missing values in y_train with 0
# y_train = y_train.fillna(0)

# print(X_train.shape , y_train.shape)

In [38]:
# Fit the model
model.fit(X_train, y_train)

In [39]:
# making predictions
# X_test[np.isnan(X_test)] = 0
predictions = model.predict(X_test)
# print(predictions)

accuracy = accuracy_score(y_test,predictions)
# Print the values
print("IS_HCP Model Evaluation:")
print("Accuracy is :", accuracy)
print(X_train.shape,y_train.shape)

IS_HCP Model Evaluation:
Accuracy is : 0.9920568745337254
(91150, 11) (91150,)


In [43]:

#predictions=predictions.astype(np.int64)
predictions2=label_encoder.inverse_transform(predictions)
print(predictions2)

['Abortion|Anxiety Disorders|Apnea|False|Trauma|Mental Health|Psychiatry|Urology|Cardiology|Medicine|Health|Bipolar and Related Disorders|General|Clinical|Myocardial Infarction|Conduct Disorders'
 'Abscess|Critical Care|Drainage|Pathology|Gastroenterology|Rheumatology|Medicine|Anesthesia|Emergency Medicine|General|Clinical|Trauma|Oncology|Orthopedics|Physical Medicine & Rehabilitation|Dentistry|Small|False|Oral|Preventive|Neurological Surgery|Neurology|Shock|Intravenous|Cardiac|Urogenital|Dermatology|Nuclear Medicine|Surgery|Health|Specialties|Rehabilitation|Diagnosis|Infected|Cardiology|Resuscitation|Surgical|Anesthesiology|Pediatrics|General Practice|Transplantation|Congenital|Technology|Large|Ophthalmology|Preventive Medicine|Psychiatry|Debridement|Adrenal Hyperplasia|Otolaryngology|Urology'
 'Abortion|Anxiety Disorders|Apnea|False|Trauma|Mental Health|Psychiatry|Urology|Cardiology|Medicine|Health|Bipolar and Related Disorders|General|Clinical|Myocardial Infarction|Conduct Disorders

Txo Model

In [16]:
# # creating mask
# mask1 = y_train.astype(np.int64)
# mask2 = y_test.astype(np.int64)

In [17]:
# Extract the features and target arrays from the DataFrames
X_train = df_train[txo_features].values
y_train = df_train[txo_target].values
# print(X_train.shape)

X_test = df_test[txo_features].values
y_test = df_test[txo_target].values

# # mask = predictions == 1
# X_train = X_train[mask1]
# print(X_train.shape)

# y_train = y_train[mask1]
# print(y_train)

# # mask = predictions == 1
# X_test = X_test[mask2]
# y_test = y_test[mask2]


# y_test = y_test.ravel()

In [18]:
# Creating model
txo_model = RandomForestClassifier(n_estimators=100)

In [19]:
# impute NaN values
# X_train[np.isnan(X_train)] = 0

# y_train = pd.Series(y_train)
# # Fill missing values in y_train with 0
# y_train = y_train.fillna(0)

# print(X_train.shape,y_train.shape)
# X_train

In [20]:
# Fit model
txo_model.fit(X_train,y_train)

In [52]:
# Model Evaluation
# X_test[np.isnan(X_test)] = 0

t_predictions = txo_model.predict(X_test)
t_prediction=label_encoder.inverse_transform(t_predictions)

In [53]:
print(t_prediction)

['Critical Care|Hemoptysis|Thoracotomy|Small|Chronic|Cardiology|Oncology|Heart Failure|General Practice|Lung Diseases|Biopsy|Dentistry|Technology|Surgical|Neurology|Nuclear Medicine|Pediatrics|Autoimmune|Rehabilitation|Large|Otolaryngology|Physical Medicine & Rehabilitation|Resuscitation|Health|Preventive|Anesthesiology|Dermatology|Gastroenterology|Neurological Surgery|Diabetes Mellitus|Macrophages|Physicians|Diagnosis|Tuberculosis|Ophthalmology|Urology|Medicine|Lung|Cardiac|False|Trauma|Female|Papillary|Oral|Rheumatology|Surgery|Specialties|Transplantation|Intravenous|Orthopedics|Emergency Medicine|Myocardial Infarction|General|Clinical|Pathology|Preventive Medicine|Psychiatry'
 'Child|Clinical|Diagnosis|Medicine|Oral|Chronic|Total|Cardiology|Dermatology|Urology|Dermatitis|General|Autoimmune|False'
 'Critical Care|Hemoptysis|Thoracotomy|Small|Chronic|Cardiology|Oncology|Heart Failure|General Practice|Lung Diseases|Biopsy|Dentistry|Technology|Surgical|Neurology|Nuclear Medicine|Pediatr

In [23]:
# Calculate and print the evaluation metrics 0.5745445036738107 24.9261978021978
accuracy = accuracy_score(y_test,t_prediction)

print("\nTAXONOMY Model Evaluation:")
print("Accurcy is :", accuracy)


TAXONOMY Model Evaluation:
Accurcy is : 0.9138105059902576


Saving the models

In [24]:
# Save the trained model
joblib.dump(model, 'trained_model.pkl')
joblib.dump(txo_model, 'txo_trained_model.pkl')

['txo_trained_model.pkl']

Using the models for prediction

Part-1

In [25]:
## Load the trained model
prediction_model = joblib.load('trained_model.pkl')

# Load the vaex DataFrame for prediction
df_prediction = vaex.open('Doceree-HCP-Test.hdf5')
# Iterate over each column in the DataFrame
for column in df_prediction.columns:
    # Fill missing values with 0 for the current column
    df_prediction[column].fillna(0, inplace=True)

df_prediction = df_prediction.fillna(0)

X_prediction = df_prediction[features].values
user_ids = df_prediction['ID'].values

# Make predictions
predictions = prediction_model.predict(X_prediction)

# Apply threshold for classification if needed
# threshold = 0.461  //use previous used value of threshold

# class_predictions = np.where(predictions >= threshold, 1, 0)
IS_HCP = predictions

Part-2

In [26]:
# Loading Txo model
txo_prediction_model = joblib.load('txo_trained_model.pkl')

# Making dataframe for txo model
X_prediction = X_prediction[IS_HCP == 1]
ids = df_prediction['ID'].values
ids = ids[IS_HCP == 1]

In [27]:
# X_prediction = df_prediction[txo_features].values

In [44]:
#make prediction
prediction = txo_prediction_model.predict(X_prediction)

In [45]:
# if prediction.dtype == int:
prediction = label_encoder.inverse_transform(prediction)

In [46]:
zip(X_prediction,prediction)

# creating mask
mask = np.isin(user_ids,ids)

In [47]:
prediction = np.resize(prediction,len(mask))


In [48]:
print(prediction)

['Child|Clinical|Diagnosis|Medicine|Oral|Chronic|Total|Cardiology|Dermatology|Urology|Dermatitis|General|Autoimmune|False'
 'Critical Care|Hemoptysis|Thoracotomy|Small|Chronic|Cardiology|Oncology|Heart Failure|General Practice|Lung Diseases|Biopsy|Dentistry|Technology|Surgical|Neurology|Nuclear Medicine|Pediatrics|Autoimmune|Rehabilitation|Large|Otolaryngology|Physical Medicine & Rehabilitation|Resuscitation|Health|Preventive|Anesthesiology|Dermatology|Gastroenterology|Neurological Surgery|Diabetes Mellitus|Macrophages|Physicians|Diagnosis|Tuberculosis|Ophthalmology|Urology|Medicine|Lung|Cardiac|False|Trauma|Female|Papillary|Oral|Rheumatology|Surgery|Specialties|Transplantation|Intravenous|Orthopedics|Emergency Medicine|Myocardial Infarction|General|Clinical|Pathology|Preventive Medicine|Psychiatry'
 'Critical Care|Hemoptysis|Thoracotomy|Small|Chronic|Cardiology|Oncology|Heart Failure|General Practice|Lung Diseases|Biopsy|Dentistry|Technology|Surgical|Neurology|Nuclear Medicine|Pediatr

In [49]:
txo_prediction = np.where(mask, prediction,"")

# print(user_ids.shape,txo_prediction.shape,IS_HCP.shape)

In [51]:
# Create a DataFrame with the user IDs (userplatformuid) and the predicted HCP values and taxonomy predictions

predictions = pd.DataFrame({'ID': user_ids,'taxonomy': txo_prediction, 'IS_HCP': IS_HCP})

# Save the DataFrame to an Excel file
predictions.to_excel('predictions.xlsx', index=False)