In [1]:
import joblib
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
#Functions to perform EDA, Col transformation to prepare for training/testing
x_cols = ["gender","freq_see_elderly","knowledge_elderly_pop","freq_interact_w_elderly","difficulties_interacting","disrupt_student_lives","thoughts_inter_hub"]
y_col = "interest_csp_elderly_smu"


def prepare_data(x_cols,y_col):
    df = pd.read_csv("youth_opinions_3.csv")

    for col in df:
        #get dtype for column
        dt = df[col].dtype 
        #check if it is a number
        if dt == int or dt == float:
            df[col] = df[col].fillna(0)
        else:
            df[col] = df[col].fillna("-")
    
    #renamed columns

    df.columns = ['gender','faculty','ug_pg','year','csp_cleared','cleared_mode',
                  'clearing_plan','freq_see_elderly','elderly_do','knowledge_elderly_pop','freq_interact_w_elderly',
                  'difficulties_interacting','difficulties_face','interest_participate_elderly_center_csp','thoughts_inter_hub','thoughts','brand',
                  'disrupt_student_lives','disrupt_reason','interest_csp_elderly_smu']

    #Encode textual data into classes
    df['csp_cleared'] = (df['csp_cleared'] == 'Yes' ).astype(int)
    df['knowledge_elderly_pop'] = (df['knowledge_elderly_pop'] == 'Yes' ).astype(int)
    df['difficulties_interacting'] = (df['difficulties_interacting'] == 'Yes' ).astype(int)
    df['interest_participate_elderly_center_csp'] = (df['interest_participate_elderly_center_csp'] == 'Yes' ).astype(int)
    df['disrupt_student_lives'] = (df['disrupt_student_lives'] == 'Yes' ).astype(int)

    df["gender"] = df["gender"].apply(encode_gender)
    df["gender"] = df["gender"].astype(int)
    
    df['thoughts_inter_hub'] = df['thoughts_inter_hub'].apply(encode_score)
    df['thoughts_inter_hub'] = df['thoughts_inter_hub'].astype(int)
    
    elderly_related = df["cleared_mode"]+ df["clearing_plan"]
    df["csp_elderly_related"] = elderly_related.apply(score_csp_elderly)
    
    
    df["year"] = df["year"].apply(remove_year_text)
    df["year"] = df["year"].astype(int)
    

    series = []
    for col in x_cols:
        series.append(df[col])

    x_df = pd.concat(series,axis=1)
    
    x_data = x_df
    prepped_y = df[y_col].apply(prep_class_labels)
    y_data = prepped_y
    
    return x_data,y_data

    
def prep_class_labels(data):
    if data >= 3:
        return 1
    else:
        return 0
    
    
#Encode the thoughts on intergenerational hub
def encode_score(data):
    if data == "That's an excellent idea!":
        return 4
    if data == "That's good!":
        return 3
    if data == "Not bad":
        return 2
    if data == "Don't really feel good about it":
        return 1
    if data == "No way!":
        return 0
    
def encode_gender(data):
    if data == "Male":
        return 1
    if data == "Female":
        return 0

def score_csp_elderly(data):
    words_related = ["elderly","Inspirar","old","folks"]
    might_be_words_related = ["Uni-Y","uniy","uni-y","rotaract","Rotaract"]
    if any(word in data for word in words_related):
        return 2
    elif any(word in data for word in might_be_words_related):
        return 1
    return 0

def remove_year_text(data):
    if data == "-":
        return 0
    data = data.replace("Year","")
    data = int(data)
    return data

In [3]:
x_data, y_data = prepare_data(x_cols,y_col)


#Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.15, 
                                                          stratify = y_data,
                                                          random_state = 1)



print(y_data.value_counts(1))
print("Probability of Interested in Elderly CSP in SMU:",y_data.value_counts(1)[1],"%")
print("Probability of NOT Interested in Elderly CSP in SMU:",y_data.value_counts(1)[0],"%")

1    0.648649
0    0.351351
Name: interest_csp_elderly_smu, dtype: float64
Probability of Interested in Elderly CSP in SMU: 0.6486486486486487 %
Probability of NOT Interested in Elderly CSP in SMU: 0.35135135135135137 %


In [4]:
#Load all the saved models

path = "./backup_best_models/"

regressor = joblib.load(path + "logistic_regression_youth.sav")
nb = joblib.load(path + "naive_bayes_youth.sav")
dt = joblib.load(path + "decision_tree_youth.sav")
random_forest = joblib.load(path + "random_forest_classifier_youth.sav")
best_tree = joblib.load(path + "random_forest_best_tree_youth.sav")
clf = joblib.load(path + "SVM_youth.sav")

In [5]:
print("Logistic Regression Measures")
y_predict = regressor.predict(X_test)

#cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
print(cnf_matrix)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
specificity = tn / (tn+fp)
precision = tp / (tp + fp)
recall_or_sensitivity = tp / (tp + fn)

print("Specificity :", round(specificity,2))
print("Precision :", round(precision, 2))
print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
print("F Score",f1_score(y_test, y_predict))

Logistic Regression Measures
[[8 0]
 [1 3]]
Specificity : 0.75
Precision : 0.89
Recall or Sensitivity : 1.0
F Score 0.9411764705882353


In [6]:
print("Naive Bayes Measures")
y_predict = nb.predict(X_test)

#cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
print(cnf_matrix)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
specificity = tn / (tn+fp)
precision = tp / (tp + fp)
recall_or_sensitivity = tp / (tp + fn)

print("Specificity :", round(specificity,2))
print("Precision :", round(precision, 2))
print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
print("F Score",f1_score(y_test, y_predict))

print(nb.classes_)
print(nb.class_prior_)



Naive Bayes Measures
[[7 1]
 [2 2]]
Specificity : 0.5
Precision : 0.78
Recall or Sensitivity : 0.88
F Score 0.823529411764706
[0 1]
[0.35135135 0.64864865]


In [7]:
print("Decision Tree Measures")
y_predict = dt.predict(X_test)

#cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
print(cnf_matrix)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
specificity = tn / (tn+fp)
precision = tp / (tp + fp)
recall_or_sensitivity = tp / (tp + fn)

print("Specificity :", round(specificity,2))
print("Precision :", round(precision, 2))
print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
print("F Score",f1_score(y_test, y_predict))

Decision Tree Measures
[[7 1]
 [1 3]]
Specificity : 0.75
Precision : 0.88
Recall or Sensitivity : 0.88
F Score 0.875


In [8]:
print("Random Forest Classifier Measures")
y_predict = random_forest.predict(X_test)

#cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
print(cnf_matrix)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
print(tn,fp,fn,tp)
specificity = tn / (tn+fp)
precision = tp / (tp + fp)
recall_or_sensitivity = tp / (tp + fn)

print("Specificity :", round(specificity,2))
print("Precision :", round(precision, 2))
print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
print("F Score",f1_score(y_test, y_predict))




for i in range(len(random_forest.feature_importances_)):
    feature = random_forest.feature_importances_[i]
    print(feature,":",x_cols[i])


Random Forest Classifier Measures
[[6 2]
 [0 4]]
4 0 2 6
Specificity : 1.0
Precision : 1.0
Recall or Sensitivity : 0.75
F Score 0.8571428571428571
0.10630759355315132 : gender
0.20309410156126162 : freq_see_elderly
0.07918856644678446 : knowledge_elderly_pop
0.13782349399623556 : freq_interact_w_elderly
0.0435029792672991 : difficulties_interacting
0.09795554723295223 : disrupt_student_lives
0.3321277179423158 : thoughts_inter_hub


In [9]:
# print("Random Forest Best Tree Measures")
# y_predict = best_tree.predict(X_test)

# #cnf_matrix = confusion_matrix(y_test, y_pred)
# cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
# print(cnf_matrix)

# tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
# specificity = tn / (tn+fp)
# precision = tp / (tp + fp)
# recall_or_sensitivity = tp / (tp + fn)

# print("Specificity :", round(specificity,2))
# print("Precision :", round(precision, 2))
# print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
# print("F Score",f1_score(y_test, y_predict))



# for i in range(len(best_tree.feature_importances_)):
#     feature = best_tree.feature_importances_[i]
#     print(feature,":",x_cols[i])

In [10]:
print("SVM Measures")
y_predict = clf.predict(X_test)

#cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix = confusion_matrix(y_test, y_predict, labels = [1,0])
print(cnf_matrix)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
specificity = tn / (tn+fp)
precision = tp / (tp + fp)
recall_or_sensitivity = tp / (tp + fn)

print("Specificity :", round(specificity,2))
print("Precision :", round(precision, 2))
print("Recall or Sensitivity :", round(recall_or_sensitivity, 2))
print("F Score",f1_score(y_test, y_predict))


SVM Measures
[[8 0]
 [1 3]]
Specificity : 0.75
Precision : 0.89
Recall or Sensitivity : 1.0
F Score 0.9411764705882353
