In [5]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker
fake = Faker()

# Set the random seed for reproducibility
Faker.seed(0)
np.random.seed(0)

# Define lists for Gender, Race, and Side effects
genders = ['Male', 'Female', 'Non-binary', 'Other']
races = ['Asian', 'Black', 'Hispanic', 'White', 'Other']
side_effects = ['Headache', 'Nausea', 'Dizziness', 'Fatigue', 'None']

# Define the number of records
num_records = 400000

# Generate the data
data = {
    'Name': [fake.name() for _ in range(num_records)],
    'Age': np.random.randint(18, 90, size=num_records),
    'Gender': np.random.choice(genders, size=num_records),
    'Race': np.random.choice(races, size=num_records),
    'Side_effects': np.random.choice(side_effects, size=num_records)
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('drug_trial_data.csv', index=False)

df.head()


Unnamed: 0,Name,Age,Gender,Race,Side_effects
0,Norma Fisher,62,Non-binary,Other,Nausea
1,Jorge Sullivan,65,Male,Hispanic,Headache
2,Elizabeth Woods,82,Non-binary,Hispanic,Dizziness
3,Susan Wagner,85,Female,Black,Dizziness
4,Peter Montgomery,85,Other,White,Fatigue


In [7]:
# Load the dataset
df = pd.read_csv('drug_trial_data.csv')

# Check for missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Drop rows with any missing values (if there are any)
df.dropna(inplace=True)

# Remove duplicate entries
df.drop_duplicates(inplace=True)

# Validate data types
df['Age'] = df['Age'].astype(int)

# Handle inconsistent data entries
# Ensure Gender values are from the predefined list
valid_genders = set(genders)
df = df[df['Gender'].isin(valid_genders)]

# Ensure Race values are from the predefined list
valid_races = set(races)
df = df[df['Race'].isin(valid_races)]

# Ensure Side effects values are from the predefined list
valid_side_effects = set(side_effects)
df = df[df['Side_effects'].isin(valid_side_effects)]

# Check the dataset after cleaning
print("Dataset after cleaning:")
print(df.info())
print(df.head())

# Save the cleaned dataset
df.to_csv('cleaned_drug_trial_data.csv', index=False)


Missing values before cleaning:
Name                0
Age                 0
Gender              0
Race                0
Side_effects    79470
dtype: int64
Dataset after cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 320407 entries, 0 to 399999
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Name          320407 non-null  object
 1   Age           320407 non-null  int32 
 2   Gender        320407 non-null  object
 3   Race          320407 non-null  object
 4   Side_effects  320407 non-null  object
dtypes: int32(1), object(4)
memory usage: 13.4+ MB
None
               Name  Age      Gender      Race Side_effects
0      Norma Fisher   62  Non-binary     Other       Nausea
1    Jorge Sullivan   65        Male  Hispanic     Headache
2   Elizabeth Woods   82  Non-binary  Hispanic    Dizziness
3      Susan Wagner   85      Female     Black    Dizziness
4  Peter Montgomery   85       Other     White      Fatigue

In [11]:
# Load the cleaned dataset
df = pd.read_csv('cleaned_drug_trial_data.csv')

# Anonymize Names by replacing with unique IDs
df['Patient_ID'] = ['PAT' + str(i) for i in range(1, len(df) + 1)]
df.drop(columns=['Name'], inplace=True)

# Ensure Age values are within a realistic range (e.g., 18 to 90)
df = df[(df['Age'] >= 18) & (df['Age'] <= 90)]

# Encode categorical variables
df['Gender'] = df['Gender'].astype('category').cat.codes
df['Race'] = df['Race'].astype('category').cat.codes
df['Side_effects'] = df['Side_effects'].astype('category').cat.codes

# Check the dataset after sanitizing
print("Dataset after sanitizing:")
print(df.info())
print(df.head())

# Save the sanitized dataset
df.to_csv('sanitized_drug_trial_data.csv', index=False)


Dataset after sanitizing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320407 entries, 0 to 320406
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Age           320407 non-null  int64 
 1   Gender        320407 non-null  int8  
 2   Race          320407 non-null  int8  
 3   Side_effects  320407 non-null  int8  
 4   Patient_ID    320407 non-null  object
dtypes: int64(1), int8(3), object(1)
memory usage: 5.8+ MB
None
   Age  Gender  Race  Side_effects Patient_ID
0   62       2     3             3       PAT1
1   65       1     2             2       PAT2
2   82       2     2             0       PAT3
3   85       0     1             0       PAT4
4   85       3     4             1       PAT5


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the sanitized dataset
df = pd.read_csv('sanitized_drug_trial_data.csv')

# Check for missing values
print("Missing values in each column before handling:")
print(df.isnull().sum())

# Drop any rows with missing values (if any)
df.dropna(inplace=True)

# Verify no missing values
print("Missing values in each column after handling:")
print(df.isnull().sum())

# Partitioning the data
# Separate features and target variable
X = df.drop(columns=['Side_effects', 'Patient_ID'])
y = df['Side_effects']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the training and testing sets
train_shape = X_train.shape, y_train.shape
test_shape = X_test.shape, y_test.shape

print("Training set shape (features, target):", train_shape)
print("Testing set shape (features, target):", test_shape)


Missing values in each column before handling:
Age             0
Gender          0
Race            0
Side_effects    0
Patient_ID      0
dtype: int64
Missing values in each column after handling:
Age             0
Gender          0
Race            0
Side_effects    0
Patient_ID      0
dtype: int64
Training set shape (features, target): ((256325, 3), (256325,))
Testing set shape (features, target): ((64082, 3), (64082,))


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('drug_trial_data.csv')

# Check for missing values
print("Missing values in each column before handling:")
print(df.isnull().sum())

# Drop any rows with missing values (if any)
df.dropna(inplace=True)

# Verify no missing values
print("Missing values in each column after handling:")
print(df.isnull().sum())

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Gender', 'Race']

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Separate features and target variable
X = df.drop(columns=['Side_effects', 'Name'])
y = df['Side_effects']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Missing values in each column before handling:
Name                0
Age                 0
Gender              0
Race                0
Side_effects    79470
dtype: int64
Missing values in each column after handling:
Name            0
Age             0
Gender          0
Race            0
Side_effects    0
dtype: int64
Model Accuracy: 0.2487130689795027

Classification Report:
               precision    recall  f1-score   support

   Dizziness       0.25      0.26      0.25     15882
     Fatigue       0.25      0.25      0.25     16057
    Headache       0.25      0.25      0.25     16075
      Nausea       0.25      0.24      0.24     16092

    accuracy                           0.25     64106
   macro avg       0.25      0.25      0.25     64106
weighted avg       0.25      0.25      0.25     64106

