In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE


In [6]:
import pandas as pd

file_path = r"C:\Users\COMPAQ\Downloads\MASTER SHEET.csv"
df = pd.read_csv(file_path)

In [8]:
# Rename columns for easier reference
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")


In [10]:
# Define abnormal results based on keywords
abnormal_keywords = ["ASCUS", "HSIL", "CANDIDA", "BACTERIAL VAGINOSIS", "INFLAMMATORY"]


In [12]:
def classify_pap_result(result):
    result = str(result).upper().strip()
    if any(keyword in result for keyword in abnormal_keywords):
        return 1  # Abnormal
    return 0  # Normal

In [14]:
# Apply classification
df["pap_label"] = df["pap_results"].apply(classify_pap_result)


In [16]:
# Convert 'age' to numeric, replacing missing values with the median age
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"].fillna(df["age"].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)


In [18]:
print(df.columns)


Index(['patient_no', 'age', 'hpv_type_(cobas)', 'ohr_genotype', 'pap_results',
       'pap_label'],
      dtype='object')


In [22]:
df.rename(columns=lambda x: x.strip(), inplace=True)

In [24]:
print(df["age"].unique())  # Look for unexpected values

[51.  49.  41.5 36.  39.  55.  42.  31.  44.  43.  40.  37.  32.  38.
 33.  34.  35.  56.  48.  30.  41.  57.  50.  61.  28.  45.  25. ]


In [26]:
df["age"] = df["age"].replace(["unknown", "N/A"], None)

In [28]:
print(df["age"].isnull().sum(), df["age"].notna().sum())

0 52


In [32]:
if df["age"].notna().sum() > 0:  
    df["age"].fillna(df["age"].median(), inplace=True)
else:
    print("No valid numeric values in 'age' column!")
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)


In [34]:
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"] = df["age"].astype(float)  # Convert to float if needed

In [36]:
# Encode 'OHR GENOTYPE' as categorical numeric values
label_encoder = LabelEncoder()
df["ohr_genotype"] = label_encoder.fit_transform(df["ohr_genotype"].astype(str))


In [38]:
# Select features and target variable
X = df[["age", "ohr_genotype"]]
y = df["pap_label"]


In [40]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [42]:
# Split dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [44]:
# Make predictions on the test set
y_pred = model.predict(X_test)


In [46]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [48]:
print(f"Balanced Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)


Balanced Model Accuracy: 0.53
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.38      0.46         8
           1       0.50      0.71      0.59         7

    accuracy                           0.53        15
   macro avg       0.55      0.54      0.52        15
weighted avg       0.55      0.53      0.52        15



In [52]:
# Print results
print(f"Accuracy: {accuracy:.2f}")
print(f"Balanced Accuracy: {balanced_acc:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.53


NameError: name 'balanced_acc' is not defined

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path =  r"C:\Users\COMPAQ\Downloads\MASTER SHEET.csv"
df = pd.read_csv(file_path)

# Rename columns for easier reference
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Define abnormal results based on keywords
abnormal_keywords = ["ASCUS", "HSIL", "CANDIDA", "BACTERIAL VAGINOSIS", "INFLAMMATORY"]

def classify_pap_result(result):
    result = str(result).upper().strip()
    if any(keyword in result for keyword in abnormal_keywords):
        return 1  # Abnormal
    return 0  # Normal

# Apply classification
df["pap_label"] = df["pap_results"].apply(classify_pap_result)

# Convert 'age' to numeric, replacing missing values with the median age
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["age"].fillna(df["age"].median(), inplace=True)

# Encode 'OHR GENOTYPE' as categorical numeric values
label_encoder = LabelEncoder()
df["ohr_genotype"] = label_encoder.fit_transform(df["ohr_genotype"].astype(str))

# Select features and target variable
X = df[["age", "ohr_genotype"]]
y = df["pap_label"]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Standardize features for SVM
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# Split dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Train an SVM model
model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"SVM Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)



SVM Model Accuracy: 0.47
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.62      0.56         8
           1       0.40      0.29      0.33         7

    accuracy                           0.47        15
   macro avg       0.45      0.46      0.44        15
weighted avg       0.45      0.47      0.45        15



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)
