In [30]:
import pandas as pd
import numpy as np
import os

from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
import matplotlib.pyplot as plt

import sklearn.preprocessing as pp
from sklearn.neighbors import KNeighborsClassifier


In [31]:
# Load the merged dataset from the CSV file in the working directory.
combined_df = pd.read_csv(filepath_or_buffer='merged.csv')

In [32]:
# Keep only fully populated rows: any row containing a missing value is dropped.
temp = combined_df.dropna(axis=0, how='any')

In [33]:
# Distinct values of the target column.
temp.loc[:, 'BIAS MOTIVATION'].unique()

array(['Anti-Race', 'Anti-Religion', 'Anti-Sexual Orientation',
       'Anti-Disability', 'Anti-Gender'], dtype=object)

In [34]:
class StdScalerByGroup(BaseEstimator, TransformerMixin):
    """
    Standardize columns within groups (per-group z-scores).

    The first column of the input is used as the grouping key; every other
    column is centered and scaled with the mean and standard deviation that
    ``fit`` computed for the row's group.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """
        Compute the per-group mean and standard deviation of every column.

        :param X: table-like input (DataFrame or array); its first column
            holds the group labels.
        :param y: ignored; present for scikit-learn API compatibility.
        :returns: ``self``, with ``grps_`` set to the two-element list
            ``[mean_df, std_df]`` (both indexed by group label).

        :Example:
        >>> cols = {'g': ['A', 'A', 'B', 'B'], 'c1': [1, 2, 2, 2], 'c2': [3, 1, 2, 0]}
        >>> X = pd.DataFrame(cols)
        >>> std = StdScalerByGroup().fit(X)
        >>> std.grps_ is not None
        True
        """
        # X may not be a pandas dataframe (e.g. a np.array)
        df = pd.DataFrame(X)
        grouped = df.groupby(df.columns[0])

        # [means, standard deviations] for each column, for each group.
        self.grps_ = [grouped.mean(), grouped.std()]
        return self

    def transform(self, X, y=None):
        """
        Replace each value by its z-score relative to its group's fitted
        mean and standard deviation.

        :param X: table-like input with the same layout as in ``fit``
            (group labels in the first column).
        :param y: ignored; present for scikit-learn API compatibility.
        :returns: DataFrame indexed by group label, one column per
            non-group input column.
        :raises RuntimeError: if ``fit`` has not been called.

        Rows whose group was not seen during ``fit`` come out as NaN, and a
        group whose fitted standard deviation is 0 or NaN yields NaN/inf,
        because no guard is applied to the division.

        :Example:
        >>> cols = {'g': ['A', 'A', 'B', 'B'], 'c1': [1, 2, 3, 4], 'c2': [1, 2, 3, 4]}
        >>> X = pd.DataFrame(cols)
        >>> std = StdScalerByGroup().fit(X)
        >>> out = std.transform(X)
        >>> out.shape == (4, 2)
        True
        >>> np.isclose(out.abs(), 0.707107, atol=0.001).all().all()
        True
        """
        if not hasattr(self, "grps_"):
            raise RuntimeError("You must fit the transformer before transforming the data!")

        # X may not be a dataframe (e.g. np.array)
        df = pd.DataFrame(X)
        grouped_mean, grouped_std = self.grps_

        # Index the rows by their group label, then expand the per-group
        # statistics to one row per input row so the arithmetic lines up.
        values = df.set_index(df.columns[0])
        row_means = grouped_mean.reindex(values.index)
        row_stds = grouped_std.reindex(values.index)
        return (values - row_means) / row_stds

In [35]:
# Separate the target column from the features, then hold out 25% of the rows
# for evaluation. random_state pins the shuffle so that a kernel restart
# reproduces the same split and the downstream accuracy figures stay comparable.
x_value = temp.drop(columns=['BIAS MOTIVATION'])
y_value = temp['BIAS MOTIVATION']
x_train, x_test, y_train, y_test = train_test_split(
    x_value, y_value, test_size=0.25, random_state=42
)

In [36]:
def rfc(dataset):
    """
    Build and fit a one-hot-encoding + random-forest classification pipeline.

    The pipeline is fitted on the notebook-level ``x_train`` / ``y_train``
    split, so those names must exist before this is called.

    :param dataset: accepted for call compatibility; it is not read.
    :returns: the fitted ``Pipeline`` (steps ``'preprocessor'`` and ``'rfc'``).

    No ``random_state`` is passed to the forest, so repeated calls may
    produce different models.
    """
    # Columns that get one-hot encoded; every other column is passed through
    # to the classifier unchanged.
    # NOTE(review): POPULATION and YEAR are encoded as categories here --
    # confirm that is intended.
    cat_feat = ['AGENCY IDENTIFIER', 'CITY', 'JUDICIAL DISTRICT',
       'LOCATION', 'POPULATION', 'QUARTER', 'REGION', 'STATE', 'YEAR']
    cat_tran = Pipeline([
        # Categories unseen during fit are encoded as all zeros at predict time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # preprocessing pipeline (put them together)
    pre = ColumnTransformer(transformers=[('cat', cat_tran, cat_feat)], remainder='passthrough')

    comb = Pipeline([('preprocessor', pre), ('rfc', RandomForestClassifier(n_estimators=180,min_samples_leaf=3))])
    comb.fit(x_train,y_train)
    return comb

In [37]:
# Fit ten random-forest pipelines.
rfc_list = [rfc(temp) for _ in range(10)]

In [38]:
# Test-set predictions of every fitted forest, in the same order as rfc_list.
preds_list = [fitted.predict(x_test) for fitted in rfc_list]

In [39]:
# Accuracy of each forest's predictions against the held-out labels.
accuracy_rfc = [metrics.accuracy_score(y_test, run_preds) for run_preds in preds_list]

In [40]:
# Average accuracy over however many runs were scored; np.mean avoids a
# hard-coded run count that would silently go stale if the loop size changes.
mean_rfc = np.mean(accuracy_rfc)
mean_rfc

0.6655396835992258

In [41]:
# For each run, the share of test-set predictions that fell in each class.
types_rfc = []
for run_preds in preds_list:
    class_counts = pd.Series(run_preds).value_counts()
    types_rfc.append(class_counts / class_counts.sum())

In [42]:
# Sum the per-run class shares.
# Start from a copy: accumulating with += on types_rfc[0] itself would modify
# that list entry in place, so re-running the cell would keep inflating it.
# fill_value=0 makes a class that some run never predicted count as 0 for that
# run instead of turning the total into NaN.
overall_rfc = types_rfc[0].copy()
for run_shares in types_rfc[1:]:
    overall_rfc = overall_rfc.add(run_shares, fill_value=0)

In [43]:
# Average share per class across the runs (divide by the actual run count).
overall_rfc / len(types_rfc)

Anti-Race                  0.909709
Anti-Religion              0.079621
Anti-Sexual Orientation    0.009903
Anti-Disability            0.000768
dtype: float64

In [44]:
def knc(dataset):
    """
    Build and fit a one-hot-encoding + k-nearest-neighbours classification
    pipeline.

    The pipeline is fitted on the notebook-level ``x_train`` / ``y_train``
    split, so those names must exist before this is called.

    :param dataset: accepted for call compatibility; it is not read.
    :returns: the fitted ``Pipeline`` (steps ``'preprocessor'`` and ``'rfc'``).
    """
    # Columns that get one-hot encoded; every other column is passed through
    # to the classifier unchanged.
    # NOTE(review): POPULATION and YEAR are encoded as categories here --
    # confirm that is intended.
    cat_feat = ['AGENCY IDENTIFIER', 'CITY', 'JUDICIAL DISTRICT',
       'LOCATION', 'POPULATION', 'QUARTER', 'REGION', 'STATE', 'YEAR']
    cat_tran = Pipeline([
        # Categories unseen during fit are encoded as all zeros at predict time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # preprocessing pipeline (put them together)
    pre = ColumnTransformer(transformers=[('cat', cat_tran, cat_feat)], remainder='passthrough')

    # NOTE(review): the final step is still named 'rfc' although it holds a
    # KNeighborsClassifier; the name is kept so existing step lookups by name
    # continue to work.
    comb = Pipeline([('preprocessor', pre), ('rfc', KNeighborsClassifier())])
    comb.fit(x_train,y_train)
    return comb

In [45]:
# Fit ten k-nearest-neighbours pipelines.
knc_list = [knc(temp) for _ in range(10)]

In [46]:
# Test-set predictions of every fitted k-NN pipeline, in knc_list order.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# so the cells that follow have no recorded output.
preds_list2 = [fitted.predict(x_test) for fitted in knc_list]

KeyboardInterrupt: 

In [None]:
# Accuracy of each k-NN pipeline's predictions against the held-out labels.
accuracy_knc = [metrics.accuracy_score(y_test, run_preds) for run_preds in preds_list2]

In [None]:
# Average accuracy over however many runs were scored; np.mean avoids a
# hard-coded run count that would silently go stale if the loop size changes.
mean_knc = np.mean(accuracy_knc)
mean_knc

In [None]:
# For each run, the share of test-set predictions that fell in each class.
types_knc = []
for run_preds in preds_list2:
    class_counts = pd.Series(run_preds).value_counts()
    types_knc.append(class_counts / class_counts.sum())

In [None]:
# Average per-class prediction share across the k-NN runs.
# Start from a copy: accumulating with += on types_knc[0] itself would modify
# that list entry in place, so re-running the cell would keep inflating it.
# fill_value=0 makes a class that some run never predicted count as 0 for that
# run instead of turning the total into NaN.
overall_knc = types_knc[0].copy()
for run_shares in types_knc[1:]:
    overall_knc = overall_knc.add(run_shares, fill_value=0)
overall_knc / len(types_knc)