In [1]:
import pandas as pd
import numpy as np
import os

from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
import matplotlib.pyplot as plt

import sklearn.preprocessing as pp
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Load the merged dataset. The path is relative, so 'merged.csv' must be in
# the directory the notebook kernel is running from.
combined_df = pd.read_csv('merged.csv')

In [3]:
# Keep only the rows that have no missing value in any column
# (dropna with its default arguments); combined_df itself is left unchanged.
temp = combined_df.dropna()

In [4]:
# Display the cleaned frame as this cell's output.
temp

Unnamed: 0,AGENCY IDENTIFIER,BIAS MOTIVATION,CITY,JUDICIAL DISTRICT,LOCATION,POPULATION,QUARTER,REGION,STATE,YEAR
0,AK0010100,Anti-Race,ANCHORAGE,020A,Residence/home,260900,Jul to Sep,West,AK,2000
1,AK0010100,Anti-Race,ANCHORAGE,020A,Residence/home,260900,Jul to Sep,West,AK,2000
2,AK0010100,Anti-Race,ANCHORAGE,020A,Highway/road/alley,260900,Jan to Mar,West,AK,2000
3,AK0010100,Anti-Race,ANCHORAGE,020A,Residence/home,260900,Jan to Mar,West,AK,2000
4,AR0080300,Anti-Race,GREEN FOREST,035W,Residence/home,2785,Jul to Sep,South,AR,2000
...,...,...,...,...,...,...,...,...,...,...
120250,WV0490100,Anti-Race,BUCKHANNON,430N,Bar/nightclub,5662,Oct to Dec,South,WV,2016
120251,WV0540100,Anti-Sexual Orientation,PARKERSBURG,435S,Highway/road/alley,30913,Jan to Mar,South,WV,2016
120252,WY0030100,Anti-Race,GILLETTE,450A,School--elementary/secondary,33218,Apr to Jun,West,WY,2016
120253,WY0030100,Anti-Race,GILLETTE,450A,School--elementary/secondary,33218,Apr to Jun,West,WY,2016


In [5]:
class StdScalerByGroup(BaseEstimator, TransformerMixin):
    """
    Standardize columns within groups.

    The first column of the input holds the group label. Every other column is
    converted to a z-score using the mean and standard deviation of the rows
    that shared the same group label when ``fit`` was called.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """
        Learn the per-group mean and standard deviation of every column.

        :param X: data whose first column is the group label; anything
            ``pd.DataFrame`` accepts (e.g. a dataframe or a np.array).
        :param y: ignored.
        :returns: ``self``, with ``grps_`` set to the list
            ``[per-group means, per-group standard deviations]``.

        :Example:
        >>> cols = {'g': ['A', 'A', 'B', 'B'], 'c1': [1, 2, 2, 2], 'c2': [3, 1, 2, 0]}
        >>> X = pd.DataFrame(cols)
        >>> std = StdScalerByGroup().fit(X)
        >>> std.grps_ is not None
        True
        """
        # X may not be a pandas dataframe (e.g. a np.array)
        df = pd.DataFrame(X)
        grouped = df.groupby(df.columns[0])

        # Two frames indexed by group label: means first, standard deviations
        # second. transform() relies on this order.
        self.grps_ = [grouped.mean(), grouped.std()]
        return self

    def transform(self, X, y=None):
        """
        Convert every non-group column to a z-score relative to its group.

        :param X: data whose first column is the group label; anything
            ``pd.DataFrame`` accepts.
        :param y: ignored.
        :returns: a dataframe indexed by the group label, with one column per
            non-group input column.
        :raises RuntimeError: if ``fit`` has not been called.

        :Example:
        >>> cols = {'g': ['A', 'A', 'B', 'B'], 'c1': [1, 2, 3, 4], 'c2': [1, 2, 3, 4]}
        >>> X = pd.DataFrame(cols)
        >>> std = StdScalerByGroup().fit(X)
        >>> out = std.transform(X)
        >>> out.shape == (4, 2)
        True
        >>> np.isclose(out.abs(), 0.707107, atol=0.001).all().all()
        True
        """
        if not hasattr(self, "grps_"):
            raise RuntimeError("You must fit the transformer before tranforming the data!")

        # X may not be a dataframe (e.g. np.array)
        df = pd.DataFrame(X)
        group_col = df.columns[0]
        grouped_mean, grouped_std = self.grps_

        # Move the group label into the index, then expand the fitted
        # statistics to one row per input row (in input order) so that the
        # arithmetic below lines up row by row.
        values = df.set_index(group_col)
        row_means = grouped_mean.reindex(df[group_col])
        row_stds = grouped_std.reindex(df[group_col])
        return (values - row_means) / row_stds

In [6]:
# Separate the label to predict ('BIAS MOTIVATION') from the feature columns.
x_value = temp.drop(columns=['BIAS MOTIVATION'])
y_value = temp['BIAS MOTIVATION']

# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel
# restarts; without it each run evaluates on a different test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_value, y_value, test_size=0.25, random_state=42
)

In [7]:
def rfc(dataset, X=None, y=None, random_state=None):
    """
    Fit a pipeline that one-hot encodes the feature columns and trains a
    random forest classifier on the result.

    :param dataset: not used by the function; kept so that existing calls
        such as ``rfc(temp)`` keep working.
    :param X: training features containing every column listed in
        ``cat_feat``. When omitted, the notebook-level ``x_train`` is used.
    :param y: training labels aligned with ``X``. When omitted, the
        notebook-level ``y_train`` is used.
    :param random_state: forwarded to ``RandomForestClassifier``. The default
        ``None`` leaves the forest unseeded, as before.
    :returns: the fitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'rfc'``.
    :raises ValueError: if only one of ``X`` and ``y`` is given.
    """
    # Mixing explicit features with the global labels (or vice versa) would
    # silently train on misaligned data, so require both or neither.
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together")
    if X is None:
        X, y = x_train, y_train

    # NOTE(review): POPULATION and YEAR look numeric in the data, but they are
    # one-hot encoded here, i.e. each distinct value becomes its own category.
    # Confirm this is intended.
    cat_feat = ['AGENCY IDENTIFIER', 'CITY', 'JUDICIAL DISTRICT',
       'LOCATION', 'POPULATION', 'QUARTER', 'REGION', 'STATE', 'YEAR']
    # handle_unknown='ignore' lets the pipeline predict on rows whose
    # category values were not present in the training data.
    cat_tran = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # preprocessing pipeline (put them together); columns not listed in
    # cat_feat are passed through unchanged
    pre = ColumnTransformer(transformers=[('cat', cat_tran, cat_feat)], remainder='passthrough')

    forest = RandomForestClassifier(
        n_estimators=180, min_samples_leaf=3, random_state=random_state
    )
    comb = Pipeline([('preprocessor', pre), ('rfc', forest)])
    comb.fit(X, y)
    return comb

In [8]:
# Fit ten separate random-forest pipelines.
rfc_list = [rfc(temp) for _ in range(10)]

In [9]:
# Predict the held-out rows with each fitted forest, in the order of rfc_list.
preds_list = [model.predict(x_test) for model in rfc_list]

In [10]:
# Accuracy of each forest's predictions against the held-out labels.
accuracy_rfc = [
    metrics.accuracy_score(y_test, run_preds) for run_preds in preds_list
]

In [11]:
# Average accuracy over however many runs were recorded, instead of dividing
# by a hard-coded 10 that would go stale if the number of runs changed.
mean_rfc = np.mean(accuracy_rfc)
mean_rfc

0.6662973099259062

In [12]:
# For each run, the share of predictions that went to each label.
types_rfc = []
for run_preds in preds_list:
    label_counts = pd.Series(run_preds).value_counts()
    types_rfc.append(label_counts / label_counts.sum())

In [13]:
# Sum the per-run label shares.
# Start from a copy: accumulating into types_rfc[0] itself would modify that
# list entry, so re-running the cell would keep inflating the totals.
overall_rfc = types_rfc[0].copy()
for shares in types_rfc[1:]:
    # pandas aligns on the label index; fill_value=0 counts a label that a run
    # never predicted as a share of 0 instead of turning the total into NaN.
    overall_rfc = overall_rfc.add(shares, fill_value=0)

In [14]:
# Average share per label across runs; divide by the actual number of runs
# rather than a hard-coded 10.
overall_rfc / len(types_rfc)

Anti-Race                  0.906488
Anti-Religion              0.083462
Anti-Sexual Orientation    0.009248
Anti-Disability            0.000801
dtype: float64

In [15]:
def knc(dataset, X=None, y=None):
    """
    Fit a pipeline that one-hot encodes the feature columns and trains a
    k-nearest-neighbours classifier (default settings) on the result.

    :param dataset: not used by the function; kept so that existing calls
        such as ``knc(temp)`` keep working.
    :param X: training features containing every column listed in
        ``cat_feat``. When omitted, the notebook-level ``x_train`` is used.
    :param y: training labels aligned with ``X``. When omitted, the
        notebook-level ``y_train`` is used.
    :returns: the fitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'rfc'``.
    :raises ValueError: if only one of ``X`` and ``y`` is given.
    """
    # Mixing explicit features with the global labels (or vice versa) would
    # silently train on misaligned data, so require both or neither.
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together")
    if X is None:
        X, y = x_train, y_train

    # NOTE(review): POPULATION and YEAR look numeric in the data, but they are
    # one-hot encoded here, i.e. each distinct value becomes its own category.
    # Confirm this is intended.
    cat_feat = ['AGENCY IDENTIFIER', 'CITY', 'JUDICIAL DISTRICT',
       'LOCATION', 'POPULATION', 'QUARTER', 'REGION', 'STATE', 'YEAR']
    # handle_unknown='ignore' lets the pipeline predict on rows whose
    # category values were not present in the training data.
    cat_tran = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # preprocessing pipeline (put them together); columns not listed in
    # cat_feat are passed through unchanged
    pre = ColumnTransformer(transformers=[('cat', cat_tran, cat_feat)], remainder='passthrough')

    # The final step is still named 'rfc' although it holds a k-NN classifier;
    # the name is kept so that any code looking the step up by name still works.
    comb = Pipeline([('preprocessor', pre), ('rfc', KNeighborsClassifier())])
    comb.fit(X, y)
    return comb

In [16]:
# Fit ten k-NN pipelines.
# NOTE(review): every call trains on the same data and the classifier is given
# no source of randomness here, so the ten models are presumably identical.
# Confirm whether repeating the fit is needed.
knc_list = [knc(temp) for _ in range(10)]

In [17]:
# Predict the held-out rows with each fitted k-NN pipeline, in the order of knc_list.
preds_list2 = [model.predict(x_test) for model in knc_list]

In [18]:
# Accuracy of each k-NN pipeline's predictions against the held-out labels.
accuracy_knc = [
    metrics.accuracy_score(y_test, run_preds) for run_preds in preds_list2
]

In [19]:
# Average accuracy over however many runs were recorded, instead of dividing
# by a hard-coded 10 that would go stale if the number of runs changed.
mean_knc = np.mean(accuracy_knc)
mean_knc

0.633168680328416

In [20]:
# For each run, the share of predictions that went to each label.
types_knc = []
for run_preds in preds_list2:
    label_counts = pd.Series(run_preds).value_counts()
    types_knc.append(label_counts / label_counts.sum())

In [21]:
# Sum the per-run label shares, then average them.
# Start from a copy: accumulating into types_knc[0] itself would modify that
# list entry, so re-running the cell would keep inflating the totals.
overall_knc = types_knc[0].copy()
for shares in types_knc[1:]:
    # pandas aligns on the label index; fill_value=0 counts a label that a run
    # never predicted as a share of 0 instead of turning the total into NaN.
    overall_knc = overall_knc.add(shares, fill_value=0)
# Divide by the actual number of runs rather than a hard-coded 10.
overall_knc / len(types_knc)

Anti-Race                  0.796776
Anti-Religion              0.124491
Anti-Sexual Orientation    0.075529
Anti-Disability            0.002503
Anti-Gender                0.000701
dtype: float64