In [2]:
"""    ~ DOC String ~

The data used in the program is from the TV Series Game of Thrones which aired
from 2011 - 2019.The premise is several powerful families fight for control of
the Seven Kingdoms.There are a total of 73 episodes in which there have been 
1k+ of characters throughout  the series. 

This program is designed to remove null values from features and fill with
appropriate metrics.The program consists of two model types, 
Logistic Regression & Classification Trees.Each model type has its own 
functions for hyperparameter tuning. The models produce a confusion matrix,
training, testing, and AUC scores.

Extra Information:
https://en.wikipedia.org/wiki/Game_of_Thrones
https://www.imdb.com/title/tt0944947/

No known bugs or Errors
"""

'    ~ DOC String ~\n\nThe data used in the program is from the TV Series Game of Thrones which aired\nfrom 2011 - 2019.The premise is several powerful families fight for control of\nthe Seven Kingdoms.There are a total of 73 episodes in which there have been \n1k+ of characters throughout  the series. \n\nThis program is designed to remove null values from features and fill with\nappropriate metrics.The program consists of two model types, \nLogistic Regression & Classification Trees.Each model type has its own \nfunctions for hyperparameter tuning. The models produce a confusion matrix,\ntraining, testing, and AUC scores.\n\nExtra Information:\nhttps://en.wikipedia.org/wiki/Game_of_Thrones\nhttps://www.imdb.com/title/tt0944947/\n\nNo known bugs or Errors\n'

# Imports

In [3]:
#Importing the libraries needed
import pandas as pd # importing data science lib
import numpy as np #importing math lib
import matplotlib.pyplot as plt #importing data plotting lib
import seaborn as sns # impirting enhanced graphics for plotting lib
import gender_guesser.detector as gender # importing gener detector
from sklearn.neighbors import KNeighborsRegressor # KNN for Regression
from sklearn.preprocessing import StandardScaler # standard scaler
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots
from sklearn.model_selection import RandomizedSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer #importing scoring models lib


#custom function for helping in the tuning of hyper params
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model.
    
    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """
    
    # declaring the number
    n_features = train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export == True:
        plt.savefig('./analysis_images/Feature_Importance.png')


#  Data & File Reading

In [4]:
#saving the data inside a file variable
file = "./GOT_character_predictions.xlsx"

# setting pandas print options
pd.set_option('display.max_rows', 900)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

#reading in the excel data and saving it in a variable
got = pd.read_excel(io = file, header = 0, sheet_name = 0)


#looping to creating new columns for missing values in existing features
for col in got:
    if got[col].isnull().astype(int).sum() > 0:
        got['m_'+col] = got[col].isnull().astype(int)

# Handling Missing Values

In [5]:
#saving the mode of titles in a variable
title_mode = "Ser"
# filling title NAs with mode
got['title'].fillna(value = title_mode, inplace = True)

#new feature to capture the dummies
got["new_title"] = 0
#looping through to replace into dummies
for i, value in got.iterrows():
    if got.loc[i,"title"] == "Ser":
        got.loc[i, "new_title"] = int(1)
################################################### TITLE
# filling house NAs with 0
got['house'].fillna(value = 0,inplace = True)

#creating new feature
got["new_house"] = 0

#making the houses into dummy
got.loc[got["house"] == 0 , "new_house"] =0
got.loc[got["house"] != 0 , "new_house"] =1
###################################################### HOUSE
#instantiate a new temp feature
got["new_culture"] = 0

#loop through all the rows in culture and set value accordinly
for i, value in got.iterrows():
    if got.loc[i,"culture"] == "Northmen":
        got.loc[i, "new_culture"] = 1
        
    elif got.loc[i,"culture"] == "Ironborn":
        got.loc[i, "new_culture"] = 1
        
    elif got.loc[i,"culture"] == "Free Folk":
        got.loc[i, "new_culture"] = 1
    
    elif got.loc[i,"culture"] == "Valyrian":
        got.loc[i, "new_culture"] = 1
        
    elif got.loc[i,"culture"] == "Braavosi":
        got.loc[i, "new_culture"] = 1
        
#replacing the new feature into original col name       
got["culture"] = got["new_culture"]
###################################################### Culture

#loop through all the rows in age and set value accordinly
for i, value in got.iterrows():
    if got.loc[i,"age"] <0 :
        got.loc[i, "age"] = (got.loc[i,"age"]*-1)/10000
        
#saving the median age in variable
age_median = round(got["age"].median())

#filling the missing values with the median age
got['age'].fillna(value = age_median,inplace = True)

###################################################### AGE

# filling heir NAs with 0
got['heir'].fillna(value = 0,inplace = True)

#creating new feature
got["new_heir"] = 0

#making the heir into dummy
got.loc[got["heir"] == 0 , "new_heir"] =0
got.loc[got["heir"] != 0 , "new_heir"] =1
###################################################### Heir

#correlation was very weak
#got["rel_person"] = 1

#for index, name in got.iterrows():      
#    if got.loc[index, "gender_guess"]== "unknown":
#        got.loc[index,"rel_person"] = 0
 ###################################################### Gender Guess       


# Feature Engineering 

In [6]:
#Combination of all the books
got["all_books"]= got["book1_A_Game_Of_Thrones"] + got["book2_A_Clash_Of_Kings"]+ \
got["book3_A_Storm_Of_Swords"] + got["book4_A_Feast_For_Crows"]+ \
got["book5_A_Dance_with_Dragons"]         #Na

#combination of the two highest correlated books
got["book_1_4"] = got["book1_A_Game_Of_Thrones"] + got["book4_A_Feast_For_Crows"] #na

#feature for age and married
got["age_married"] = got["age"] + got["isMarried"] #na

#feature for house and heir
got["house_heir"] = got["new_house"] + got["new_heir"] #na
#feature for house and title
got["house_title"] = got["new_house"] + got["new_title"] #no correlation
#feature for age and number of dead relations
got["dead_age"] = got["age"] * got["numDeadRelations"] #marginal
#feature for popularity and house
got["pop_house"] = got["popularity"] * got["new_house"] #na

# feature for missing title - missing house - missing culture
got["house_cult_title"] = got["m_title"] + got["m_house"] + got["m_culture"] #marginal

#feature for LOG of age 
got["log_age"] = np.log(got["age"]+0.000001)

#dividing age into above and below the feature mean
got["split_age"] = 0
mean_age = round(got["age"].mean(),0)
#looping through all the observations
for index, x in got.iterrows():      #marginal at best
    if got.loc[index, "age"]>= mean_age:
        got.loc[index,"split_age"] = 1
        
#inversie of popularity feature
got["unpopular"] = 1- got["popularity"]
#log of popularity
got["log_popularity"] = np.log(got["popularity"]+0.001) #HIGH CORR

# Dropping Non-Valuable Features 

NOTES

These are not relevant as they would not impact logically if the current person is alive or not. 

1)isAliveFather 2)  isAliveHeir 3)  isAliveMother 4)  isAliveSpouse               

These are not relevant because another category already accounts for this information. such as birthday and age are the same data point essentially only in different forms. 

-dateOfBirth       
-spouse    

These have to little data to do anything relevant. 
-Father
-Mother

Produces a weak correlation to the Response Variable and processing time takes to long.

Male (Using Gender Guesser)


In [7]:
#making a copy of the data set
got_copy = got.copy(deep = False)

#droping the columns that wont be used in models
got_copy = got_copy.drop(columns =["dateOfBirth","isAliveSpouse" ,"spouse" ,"isAliveFather",
              "father","isAliveHeir","isAliveMother", "mother"],
              axis = 1)


# Dictionary of Response Variables & Setting Train Test Split


In [8]:
#dict for testing multiple x variable combinations
got_copy_dict = {
    
    'test_1': ['book4_A_Feast_For_Crows', 
                'm_age', 'age', 'popularity'],
                
    'test_2':  [  "book1_A_Game_Of_Thrones","numDeadRelations","m_age",
                            "book4_A_Feast_For_Crows","new_heir","popularity" ],
    
    'test_3':  [  "book1_A_Game_Of_Thrones","numDeadRelations","m_age",
                            "book4_A_Feast_For_Crows","new_heir","popularity" ],
    
    'test_4': ['book4_A_Feast_For_Crows', 
                 'age_married','log_popularity','dead_age']
}
#preparing the x variables
got_copy_data   =  got_copy.loc[ : , got_copy_dict["test_4"]]
#preparing the response variable 
got_copy_target = got_copy.loc[:, "isAlive"]


#setting up the train test split with given params
x_train, x_test, y_train, y_test = train_test_split(
            got_copy_data,
            got_copy_target,
            test_size = 0.10,
            random_state = 219,
            stratify = got_copy_target)


# Logistic Regression &  Hyperparameter Tuning

In [9]:

# declaring a hyperparameter space
C_range          = np.arange(0.1, 5.0, 0.1)
warm_start_range = [True, False]
solver_range     = ['newton-cg', 'sag', 'lbfgs','saga']
max_iter_range    = np.arange(600,1000,100) 
tot               =  np.arange(0.01,3,0.01)

# creating a hyperparameter grid
param_grid = {'C'          : C_range,
              'warm_start' : warm_start_range,
              'solver'     : solver_range,
              'max_iter'   : max_iter_range,
                'tol'       : tot}


# INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219)


# GridSearchCV object
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(
                                           roc_auc_score,
                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(got_copy_data, got_copy_target)




# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

Tuned Parameters  : {'warm_start': True, 'tol': 2.17, 'solver': 'lbfgs', 'max_iter': 800, 'C': 3.9000000000000004}
Tuned CV AUC      : 0.6342


In [10]:


#instantiating the model for logistic regression 
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 3.9,
                            max_iter= 800,
                            warm_start= True,
                            random_state = 219,
                           tol = 2.17)


#fitting the training data for the model
logreg_fit = logreg.fit(x_train, y_train)


#predicting for the model based on test set 
logreg_pred = logreg_fit.predict(x_test)


#scoring and printing the results
print("LOG REGRESSION")
print('Training ACCURACY:', logreg_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', logreg_fit.score(x_test, y_test).round(4))


#saving the score data in a variable
logreg_train_score = logreg_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4) # accuracy


#printing out a confusion matrix
print(confusion_matrix(y_true = y_test,
                       y_pred = logreg_pred))

# saving the confusion matrix in multiple variables
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


#printing the confusion matrix out in a better format
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

#Printing out the AUC score
print("AUC")
print(roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4))


#saving AUC score in a variable
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

LOG REGRESSION
Training ACCURACY: 0.7778
Testing  ACCURACY: 0.8513
[[ 24  26]
 [  3 142]]

True Negatives : 24
False Positives: 26
False Negatives: 3
True Positives : 142

AUC
0.7297


# Decision Tree Classifier & Hyperparameter Tuning

In [11]:
# declaring a hyperparameter space
criterion = ['gini', 'entropy']
split     = ['best', 'random']
depth     = np.arange(1, 8, 1)
min_samples_leaf      = np.arange(1, 100, 1)
min_impurity_decrease      =np.arange(0,1,0.05)
min_samples_split     = np.arange(30,120,1)
max_feat = ["auto","sqrt","log2"]

# creating a hyperparameter grid
param_grid = {'criterion'        : criterion,
              'splitter'         : split,
              'max_depth'        : depth,
             "max_features"      : max_feat,
             "min_samples_split" : min_samples_split,
            "min_samples_leaf"   :  min_samples_leaf   }

 # INSTANTIATING the model object without hyperparameters
DT_tuned = DecisionTreeClassifier(random_state = 219)


 # GridSearchCV object
DT_tuned_cv = RandomizedSearchCV(estimator           = DT_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(
                                           roc_auc_score,
                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
DT_tuned_cv.fit(got_copy_data, got_copy_target)




# printing the optimal parameters and best score
print("Tuned Parameters  :", DT_tuned_cv.best_params_)
print("Tuned CV AUC      :", DT_tuned_cv.best_score_.round(4))

Tuned Parameters  : {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 29, 'max_features': 'auto', 'max_depth': 6, 'criterion': 'entropy'}
Tuned CV AUC      : 0.6626


In [12]:




# INSTANTIATING a classification tree object
tree_pruned = DecisionTreeClassifier(max_depth= 5,
                    min_samples_leaf= 1,
                    random_state = 219,
                    splitter = "best",
                    max_features= "sqrt",
                    criterion = "entropy",
                    min_samples_split= 36)


# FITTING the training data
tree_pruned_fit = tree_pruned.fit(x_train, y_train)


# PREDICTING on new data
tree_prune_pred = tree_pruned_fit.predict(x_test)


# SCORING the model
print("PRUNED TREE")
print('Training ACCURACY:', tree_pruned_fit.score(x_test,y_test).round(4))
print('Testing  ACCURACY:', tree_pruned_fit.score(x_train, y_train).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_prune_pred).round(4))


# saving scoring data for future use
pruned_tree_train_score = tree_pruned_fit.score(x_test,y_test).round(4) # accuracy
pruned_tree_test_score  = tree_pruned_fit.score(x_train, y_train).round(4) # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = tree_prune_pred) # auc

# unpacking the confusion matrix
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = tree_prune_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

PRUNED TREE
Training ACCURACY: 0.8
Testing  ACCURACY: 0.7881
AUC Score        : 0.6231

True Negatives : 13
False Positives: 37
False Negatives: 2
True Positives : 143



# Extra Code

In [13]:
#creating a Pearson Corr to check against the response variable
#got_copy_corr = got.corr(method = "pearson").round(2)

#got_copy_corr["isAlive"].sort_values(ascending = True)

#got_copy.isnull().sum().sort_values(ascending = True)


In [14]:
# plotting feature importances
#plot_feature_importances(model = tree_pruned, train = x_train, export = False)

In [15]:
# #Used to plot any feature in order to look for swekness
# mean = round(got_copy["log_popularity"].mean(),3)
# #instantiating the plot
# sns.histplot(data = got_copy, x = "log_popularity",kde = True, color = "red")
# #creating a vertical line drawn at the mean
# plt.axvline(mean,
#             color = "green",
#                 ls = ":")
# #showing the plot
# plt.tight_layout()
# plt.show()

In [16]:
#would be used for gender guesser in the mdoel if it was of high correlation
# lst = []

# for i, col in got.iterrows():

#     # splitting email domain at '@'
#     split_name = got.loc[i, 'name'].split(sep = " ")

#     # appending placeholder_lst with the results
#     lst.append(split_name)


# # converting placeholder_lst into a DataFrame 
# split_name = pd.DataFrame(lst)




# got["split_name"] = split_name[0]

# got["split_name"].head()

# # placeholder list
# placeholder_lst = []


# # looping to guess gender
# for name in got['split_name']:
#     guess = gender.Detector().get_gender(name)
#     placeholder_lst.append(guess)

#  #converting list into a series
# got['gender_guess'] = pd.Series(placeholder_lst)




# Final Model Outputs

In [17]:
#producing results table for model outputs
print(f"""
|     Model Name               Training Accuracy     Testing Accuracy     AUC Score     Confusion Matrix:  TN,FP,FN,TP|
|     ----------               ------------------    -----------------    ----------     -----------------------------|
|Classification Trees - Pruned    {pruned_tree_train_score}              {pruned_tree_test_score}                 {pruned_tree_auc_score.round(3)}          {pruned_tree_tn,pruned_tree_fp,pruned_tree_fn,pruned_tree_tp}             |
|                                                                                                                     |                                        
|---------------------------------------------------------------------------------------------------------------------|
|                                                                                                                     |
|Logistic Regression (FINAL)      {logreg_train_score}              {logreg_test_score}                {logreg_auc_score}       {logreg_tn,logreg_fp,logreg_fn,logreg_tp}             |""")


|     Model Name               Training Accuracy     Testing Accuracy     AUC Score     Confusion Matrix:  TN,FP,FN,TP|
|     ----------               ------------------    -----------------    ----------     -----------------------------|
|Classification Trees - Pruned    0.8              0.7881                 0.623          (13, 37, 2, 143)             |
|                                                                                                                     |                                        
|---------------------------------------------------------------------------------------------------------------------|
|                                                                                                                     |
|Logistic Regression (FINAL)      0.7778              0.8513                0.7297       (24, 26, 3, 142)             |
