In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, plot_confusion_matrix

In [20]:
# Seed passed as `random_state` to every LogisticRegression fit in this notebook.
SEED = 42
# Relative path of the input CSV loaded with pd.read_csv.
CSV_LOCATION = "peace_sys.csv"

In [21]:
# Maps each feature's column code in the CSV to a human-readable description.
# The keys are used to select the feature columns of X; the descriptions are
# not referenced by the cells visible here.
VARS = {'ID1.1Over' : 'Overarching Identity', 'Int2.4Hist' : 'Interconnected Historically', 
        'Int2.2Econ' : 'Interconnected Economically', 'Dep3.3Econ' : 'Interdependent Economically', 
        'Dep3.2Ecol' : 'Interdependent Ecologically', 'NWNorm5.1': 'Non-Warring Norms', 
        'NWVal4.1' : 'Non-Warring Values', 'SymP6': 'Peace Symbols', 'RitP6' : 'Peace Rituals', 
        'CM8.5Peace' : 'Peaceful Conflict Management Overall'}

In [39]:
def mode(x):
    """
    Majority vote over a binary (0/1) array.

    Only entries exactly equal to 1 or exactly equal to 0 are counted; any
    other value (e.g. 0.7 or NaN) is ignored.

    Parameters
    ----------
        x: 1D numpy array, expected to hold 0s and 1s

    Return
    ------
        1 if there are at least as many 1s as 0s, otherwise 0.
        Ties resolve to 1, which also covers an empty array or an array
        that contains neither value.
    """
    ones = np.count_nonzero(x == 1)
    zeros = np.count_nonzero(x == 0)
    return 1 if ones >= zeros else 0

In [31]:
def logistic_sigmoid(x, beta, intercept):
    """
    Logistic function of the linear term `intercept + beta * x`.

    Args:
        x: feature score(s); a scalar or a numpy array (the result then has
            the same shape).
        beta: scalar coefficient extracted from the logreg model.
        intercept: scalar intercept extracted from the logreg model
            (denoted beta_0).
    Returns:
        Probability in [0, 1] for each value in x.
    """
    linear_term = intercept + beta * x
    return 1 / (1 + np.exp(-linear_term))

In [58]:
def train_and_pred_on_feature(X, y):
    """
    Fit a one-feature logistic regression and return its majority predicted class.

    Rows in which either the feature or the label is NaN are dropped before
    fitting. The fitted intercept and coefficient are passed through
    `logistic_sigmoid` to get one probability per remaining row; each
    probability is thresholded at 0.5 into a hard 0/1 label, and the most
    frequent label is returned.

    Parameters
    ----------
    X: dataframe containing 1 column. Shape is (m, 1). m is the number of rows.
        X can contain NaN values.
    y: labels for each observation in X, in the same row order. May be a
        Series, a 1D array, or a single-column dataframe of length m.
        NOTE(review): the thresholded output is 0/1, so labels are assumed to
        be binary and encoded as 0/1 -- confirm against the dataset.

    Returns
    -------
    pred: 0 or 1, the most frequent predicted class over the non-NaN rows
        (ties resolve to 1, see `mode`).

    """
    # store the feature name for reference
    feature_name = X.columns[0]
    label = 'label'
    # Pair feature and label by position (not by index) so that y may be a
    # Series, an array, or an (m, 1) dataframe.
    concat_df = pd.DataFrame({
        feature_name: X[feature_name].to_numpy(),
        label: np.asarray(y).ravel(),
    })
    # drop rows where the feature or the label is missing
    concat_df = concat_df.dropna()
    # feature as an (m', 1) dataframe, labels as a 1D array
    X = concat_df[[feature_name]]
    y = concat_df[label].to_numpy()
    # init and train model
    logreg = LogisticRegression(random_state=SEED)
    logreg.fit(X, y)

    intercept = logreg.intercept_[0]
    beta = logreg.coef_[0][0]
    # per-row probability of the positive class
    probas = logistic_sigmoid(X[feature_name].to_numpy(), beta, intercept)
    # `mode` only counts values that are exactly 0 or 1. Passing it the raw
    # probabilities makes both counts zero, so it always returns 1. Convert
    # to hard labels first (p > 0.5 -> 1, the same boundary as
    # LogisticRegression.predict).
    hard_labels = (probas > 0.5).astype(int)
    # prediction, 0 or 1
    pred = mode(hard_labels)

    return pred


In [50]:
def get_predictions(X, y):
    """
    Gets one prediction per feature (column) in X.

    Each column is wrapped in its own single-column dataframe and passed,
    together with y, to `train_and_pred_on_feature`.

    Parameters
    ---------
    X: dataframe containing all observations. Shape (m, n)
    y: labels for each observation in X; forwarded unchanged to
        `train_and_pred_on_feature` for every column.

    Returns
    -------
    preds: list with n predictions, in the order of X.columns.

    """
    return [
        train_and_pred_on_feature(pd.DataFrame(X[column]), y)
        for column in X.columns
    ]

In [22]:
# Load the raw dataset from CSV_LOCATION into a dataframe.
df = pd.read_csv(CSV_LOCATION)

In [24]:
# Feature matrix: the columns named by the keys of VARS, with every 9 turned into NaN.
# NOTE(review): 9 is presumably the dataset's missing-value code (scores are
# described elsewhere in this notebook as 1-4) -- confirm against the codebook.
X = df[VARS.keys()].replace(9, np.nan)

In [26]:
# Label column used as the target for every single-feature model.
# NOTE(review): assumed to be a binary 0/1 column -- confirm.
y = df["PSysRec"]

In [59]:
# Fit one single-feature logistic regression per column of X and collect
# one prediction per feature.
y_preds = get_predictions(X, y)

In [62]:
# Display the per-feature predictions (one entry per column of X, in column order).
y_preds

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]