In [1]:
import pandas as pd, sklearn, numpy as np

In [2]:
# Import cervical cancer data.
cerv_cancer = pd.read_csv("risk_factors_cervical_cancer.csv")
cerv_cancer

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0
5,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
6,51,3.0,17.0,6.0,1.0,34.0,3.4,0.0,0.0,1.0,...,?,?,0,0,0,0,1,1,0,1
7,26,1.0,26.0,3.0,0.0,0.0,0.0,1.0,2.0,1.0,...,?,?,0,0,0,0,0,0,0,0
8,45,1.0,20.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,1,0,1,1,0,0,0,0
9,44,3.0,15.0,?,1.0,1.266972909,2.8,0.0,0.0,?,...,?,?,0,0,0,0,0,0,0,0


In [3]:
# Drop null values from dataframe.
cerv_cancer = cerv_cancer.drop(columns=['STDs: Time since first diagnosis','STDs: Time since last diagnosis'])

In [4]:
cerv_cancer = cerv_cancer.replace(to_replace='?', value=np.nan).dropna()

In [6]:
#Basic statistical analysis
cerv_cancer_mean = [(col, cerv_cancer[col].astype('float64').mean()) for col in cerv_cancer.columns]
print("The mean of each feature:\n", cerv_cancer_mean,"\n")
cerv_cancer_std = [(col, cerv_cancer[col].astype('float64').std()) for col in cerv_cancer.columns]
print("The standard deviation of each feature:\n", cerv_cancer_std)

The mean of each feature:
 [('Age', 27.26497005988024), ('Number of sexual partners', 2.5239520958083834), ('First sexual intercourse', 17.142215568862277), ('Num of pregnancies', 2.3233532934131738), ('Smokes', 0.1437125748502994), ('Smokes (years)', 1.235523546541916), ('Smokes (packs/year)', 0.4589530354193114), ('Hormonal Contraceptives', 0.6437125748502994), ('Hormonal Contraceptives (years)', 2.290036528505988), ('IUD', 0.1122754491017964), ('IUD (years)', 0.5300299401197603), ('STDs', 0.09730538922155689), ('STDs (number)', 0.1661676646706587), ('STDs:condylomatosis', 0.05538922155688623), ('STDs:cervical condylomatosis', 0.0), ('STDs:vaginal condylomatosis', 0.005988023952095809), ('STDs:vulvo-perineal condylomatosis', 0.05389221556886228), ('STDs:syphilis', 0.02245508982035928), ('STDs:pelvic inflammatory disease', 0.0014970059880239522), ('STDs:genital herpes', 0.0014970059880239522), ('STDs:molluscum contagiosum', 0.0014970059880239522), ('STDs:AIDS', 0.0), ('STDs:HIV', 0.01

In [9]:
# Drop columns with mean and/or standard deviation values of 0.0
cerv_cancer = cerv_cancer.drop(columns=[mean_item[0] for mean_item, stddev_item in zip(cerv_cancer_mean, cerv_cancer_std)
                                        if mean_item[1] == 0 or stddev_item[1] == 0])

# Split current dataframe to prepare for train_test_split()
X = cerv_cancer.drop(columns=['Dx:Cancer'])
y = cerv_cancer['Dx:Cancer']

In [10]:
# Create training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [11]:
# Standardize datasets for Random Forest Classifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [19]:
# Create Random Forest Classifier Model and find the most important features in the dataset.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
rf_model = rf.fit(X_train, y_train)
feat_importances = [(X.columns[idx], col) for col,idx in zip(rf_model.feature_importances_,
                                                                  range(len(rf_model.feature_importances_)))]
most_important_feats = [feat for feat in feat_importances if feat[1] > 0]

In [20]:
# Print column names with their accuracy scores.
print("All important features in determining cervical cancer: \n%s\n" % feat_importances)
print("Most important features in determining cervical cancer: \n%s\n" % most_important_feats)

All important features in determining cervical cancer: 
[('Age', 0.04365364073375368), ('Number of sexual partners', 0.015973937042003214), ('First sexual intercourse', 0.0600141941934009), ('Num of pregnancies', 0.022603705144372127), ('Smokes', 0.0012763600483664916), ('Smokes (years)', 0.05008895819399727), ('Smokes (packs/year)', 0.05926077240460694), ('Hormonal Contraceptives', 0.00046299553033645136), ('Hormonal Contraceptives (years)', 0.023902005988034585), ('IUD', 0.006870568905489685), ('IUD (years)', 0.041536768954859866), ('STDs', 0.0004359095431732357), ('STDs (number)', 0.001227525213568076), ('STDs:condylomatosis', 0.0001490816887713207), ('STDs:vaginal condylomatosis', 0.0), ('STDs:vulvo-perineal condylomatosis', 0.00040173033042284327), ('STDs:syphilis', 0.0), ('STDs:pelvic inflammatory disease', 0.0), ('STDs:genital herpes', 0.0), ('STDs:molluscum contagiosum', 0.0), ('STDs:HIV', 0.0), ('STDs:Hepatitis B', 0.0), ('STDs:HPV', 0.04240054171190456), ('STDs: Number of dia

In [17]:
# Calculate prediction accuracy of Random Forest Model
from sklearn import metrics
predictions = rf_model.predict(X_test)
print("Accuracy of Random Forest Model: %s" % metrics.accuracy_score(y_test, predictions))

Accuracy of Random Forest Model: 0.9820359281437125
