In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse 
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
import dask.dataframe as dd
from dask.dataframe import from_pandas
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
import seaborn as sns
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from prettytable import PrettyTable

In [4]:
df=pd.read_csv('/content/drive/MyDrive/feature_extracted')

In [5]:
df_scraped=pd.read_csv('/content/drive/MyDrive/scraped_df')

In [6]:
columns=['length',
       'no_of_words', 'avg_word_length', 'caps', 'excl', 'sentence_count',
       'avg_sentence_length', 'sentiment', 'subjectivity', 'noun_count',
       'verb_count', 'adj_count', 'adv_count', 'pron_count']

In [7]:
identity_columns = ['asian',
       'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female',
       'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
       'other_religion', 'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white']

In [8]:
def convert_to_bool(df, col_name):
    df[col_name] = np.where(df[col_name] >= 0.5, 1, 0)
    
def convert_dataframe_to_bool(df):
    bool_df = df.copy()
    for col in ['target'] + identity_columns:
        convert_to_bool(bool_df, col)
    return bool_df

df = convert_dataframe_to_bool(df)

In [9]:
df_non_toxic=df[df['target']==0].sample(frac=0.2)
df_toxic=df[df['target']==1]

In [10]:
df_new=pd.concat([df_toxic,df_non_toxic],axis=0)

In [11]:
Y=df_new['target']
X=df_new.drop(['target'],axis=1)

In [12]:
len(X)

471898

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.35, random_state=42,stratify=Y, shuffle=True)

In [14]:
X_train=X_train[['clean_text','length',
       'no_of_words', 'avg_word_length', 'caps', 'excl', 'sentence_count',
       'avg_sentence_length', 'sentiment', 'subjectivity', 'noun_count',
       'verb_count', 'adj_count', 'adv_count', 'pron_count']]

In [15]:
X_scraped=df_scraped.drop(['Unnamed: 0', 'text', 'target'],axis=1)
Y_scraped=df_scraped['target']

In [16]:
X_train_scrapped=pd.concat([X_train,X_scraped],axis=0)

In [17]:
y_train_scraped=pd.concat([y_train,Y_scraped],axis=0)

In [18]:
vectorizer = TfidfVectorizer()
tfidf_vect = vectorizer.fit(X_train['clean_text'])

In [19]:
vectorizer1 = TfidfVectorizer()
vectorizer1.fit(X_train_scrapped['clean_text'])

TfidfVectorizer()

In [20]:
import pickle
with open('/content/drive/MyDrive/glove.840B.300d.pkl', 'rb') as fp:
    glove = pickle.load(fp)
glove_words=glove.keys()

In [22]:
dictionary=dict(zip(vectorizer.get_feature_names(),list(vectorizer.idf_)))
word_list=set(vectorizer.get_feature_names())
X_train_w2v=[]
count={0}
for sentence in tqdm(X_train['clean_text']):
    vector=np.zeros(300)    
    tfidf_weight=0    
    l=len(sentence.split())
    for word in sentence.split():
        if word in glove_words and word in word_list:
            count.add(word)
            vec=glove[word]               
            tfidf=dictionary[word]*(sentence.count(word)/l)            
            vector+=(vec*tfidf)            
            tfidf_weight+=tfidf
    if tfidf_weight !=0:
        vector/=tfidf_weight
    X_train_w2v.append(vector) 

100%|██████████| 306733/306733 [01:15<00:00, 4047.12it/s]


In [23]:
dictionary=dict(zip(vectorizer.get_feature_names(),list(vectorizer.idf_)))
word_list=set(vectorizer.get_feature_names())
X_test_w2v=[]
count={0}
for sentence in tqdm(X_test['clean_text']):
    vector=np.zeros(300)    
    tfidf_weight=0    
    l=len(sentence.split())
    for word in sentence.split():
        if word in glove_words and word in word_list:
            count.add(word)
            vec=glove[word]               
            tfidf=dictionary[word]*(sentence.count(word)/l)            
            vector+=(vec*tfidf)            
            tfidf_weight+=tfidf
    if tfidf_weight !=0:
        vector/=tfidf_weight
    X_test_w2v.append(vector)

100%|██████████| 165165/165165 [00:40<00:00, 4125.72it/s]


In [24]:
parameters={'max_depth':[4,5,6,7,8]}
clf = HistGradientBoostingClassifier()
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_w2v,y_train)

GridSearchCV(cv=5, estimator=HistGradientBoostingClassifier(),
             param_grid={'max_depth': [4, 5, 6, 7, 8]}, scoring='roc_auc')

In [25]:
clf.best_estimator_

HistGradientBoostingClassifier(max_depth=7)

In [26]:
clf = HistGradientBoostingClassifier(max_depth=7)
clf.fit(X_train_w2v,y_train)
Y_pred_tr=clf.predict_proba(X_train_w2v)

In [27]:
Y_pred_te=clf.predict_proba(X_test_w2v)

In [28]:
roc_auc_score(y_train, Y_pred_tr[:,1])

0.8737458734989059

In [29]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8564822982455768

In [30]:
Y_pred_te=clf.predict_proba(X_test_w2v)

In [31]:
X_te_bias=X_test[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
identity_columns1=['latino','transgender','asian','psychiatric_or_mental_illness','jewish','homosexual_gay_or_lesbian',
                  'black','muslim','white','christian','male','female']

In [21]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup):
    subgroup_examples = df[df[subgroup]==1]
    return compute_auc(subgroup_examples['y'], subgroup_examples['y_pred'])

def compute_bpsn_auc(df, subgroup):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup]==1]
    subgroup_negative_examples = subgroup_negative_examples[subgroup_negative_examples['y']==0]
    non_subgroup_positive_examples = df[df[subgroup]==0]
    non_subgroup_positive_examples = non_subgroup_positive_examples[non_subgroup_positive_examples['y']==1]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples['y'], examples['y_pred'])

def compute_bnsp_auc(df, subgroup):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup]==1]
    subgroup_positive_examples = subgroup_positive_examples[subgroup_positive_examples['y']==1]
    non_subgroup_negative_examples = df[df[subgroup]==0]
    non_subgroup_negative_examples = non_subgroup_negative_examples[non_subgroup_negative_examples['y']==0]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples['y'], examples['y_pred'])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {'subgroup': subgroup,'subgroup_size': len(dataset[dataset[subgroup]==1])}
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_size', ascending=True)

In [33]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,latino,246,0.814362,0.726606,0.906942
1,transgender,336,0.72687,0.745714,0.843739
2,asian,478,0.783865,0.762872,0.873998
3,psychiatric_or_mental_illness,634,0.777998,0.788042,0.850897
4,jewish,888,0.771024,0.700495,0.905522
5,homosexual_gay_or_lesbian,1605,0.728077,0.685156,0.890661
6,black,2332,0.71847,0.645256,0.901859
7,muslim,2799,0.735687,0.644767,0.915201
8,white,3789,0.73989,0.630131,0.919295
9,christian,3819,0.826721,0.825794,0.859672


In [34]:
def score(df,overall_auc):
    l=len(df)
    x=df.drop(['subgroup','subgroup_size'],axis=1)
    x=x.apply((lambda x: x**(-5)),axis=1)
    s=x.sum(axis=0)
    s=s.apply(lambda x: (x/l)**(1/(-5)))
    s=s.sum(axis=0)
    score=0.25*(s+overall_auc)
    return score    

In [35]:
bias_score=score(bias_metrics_df,0.8576)
bias_score

0.8030411746534422

In [36]:
from sklearn.linear_model import LogisticRegression
parameters={'C':[0.0001,0.001,0.01,1,10,100]}
clf = LogisticRegression(solver='saga',random_state=42)
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_w2v,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=42, solver='saga'),
             param_grid={'C': [0.0001, 0.001, 0.01, 1, 10, 100]},
             scoring='roc_auc')

In [37]:
clf.best_estimator_

LogisticRegression(C=1, random_state=42, solver='saga')

In [38]:
clf=LogisticRegression(C=1, random_state=42, solver='saga')
clf.fit(X_train_w2v,y_train)
Y_pred_tr=clf.predict_proba(X_train_w2v)

In [39]:
Y_pred_te=clf.predict_proba(X_test_w2v)

In [40]:
roc_auc_score(y_train, Y_pred_tr[:,1])

0.8547150453437975

In [41]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8536722982629703

In [42]:
Y_pred_te=clf.predict_proba(X_test_w2v)
X_te_bias=X_test[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [43]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,latino,246,0.798314,0.754518,0.881741
1,transgender,336,0.73752,0.750204,0.85061
2,asian,478,0.764138,0.775869,0.848931
3,psychiatric_or_mental_illness,634,0.762848,0.746944,0.868408
4,jewish,888,0.752704,0.709805,0.888251
5,homosexual_gay_or_lesbian,1605,0.735672,0.695151,0.886815
6,black,2332,0.740588,0.68328,0.892797
7,muslim,2799,0.732132,0.683701,0.891541
8,white,3789,0.74344,0.670221,0.901312
9,christian,3819,0.820711,0.847022,0.830273


In [44]:
bias_score=score(bias_metrics_df,0.8521)
bias_score

0.8034781526695195

In [None]:
from sklearn.ensemble import RandomForestClassifier
parameters={'n_estimators':[10,20,30],'max_depth':[3,4,5,6],'max_samples':[0.4,0.5,0.7]}
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_w2v,y_train)

In [24]:
clf.best_estimator_

RandomForestClassifier(max_depth=6, max_samples=0.4, n_estimators=30, n_jobs=-1,
                       random_state=0)

In [25]:
clf=RandomForestClassifier(max_depth=6, max_samples=0.4, n_estimators=30, n_jobs=-1,
                       random_state=0)
clf.fit(X_train_w2v,y_train)
Y_pred_tr=clf.predict_proba(X_train_w2v)

In [26]:
Y_pred_te=clf.predict_proba(X_test_w2v)

In [27]:
roc_auc_score(y_train, Y_pred_tr[:,1])

0.8165089385303973

In [28]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8097501889546995

In [29]:
Y_pred_te=clf.predict_proba(X_test_w2v)
X_te_bias=X_test[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [34]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,latino,245,0.76323,0.668504,0.884667
1,transgender,340,0.727344,0.781446,0.762993
2,asian,514,0.742505,0.733849,0.818869
3,psychiatric_or_mental_illness,639,0.805775,0.830321,0.784712
4,jewish,899,0.732394,0.612696,0.891894
5,homosexual_gay_or_lesbian,1581,0.723626,0.671915,0.854462
6,black,2353,0.703127,0.633883,0.861104
7,muslim,2810,0.709023,0.590273,0.892805
8,white,3736,0.712274,0.597909,0.886143
9,christian,3785,0.797961,0.756562,0.845823


In [35]:
bias_score=score(bias_metrics_df,0.8521)
bias_score

0.7760749490172429

In [23]:
dictionary=dict(zip(vectorizer1.get_feature_names(),list(vectorizer1.idf_)))
word_list=set(vectorizer1.get_feature_names())
X_train_scraped_w2v=[]
count={0}
for sentence in tqdm(X_train_scrapped['clean_text']):
    vector=np.zeros(300)    
    tfidf_weight=0    
    l=len(sentence.split())
    for word in sentence.split():
        if word in glove_words and word in word_list:
            count.add(word)
            vec=glove[word]               
            tfidf=dictionary[word]*(sentence.count(word)/l)            
            vector+=(vec*tfidf)            
            tfidf_weight+=tfidf
    if tfidf_weight !=0:
        vector/=tfidf_weight
    X_train_scraped_w2v.append(vector)  

100%|██████████| 320871/320871 [01:17<00:00, 4148.58it/s]


In [24]:
dictionary=dict(zip(vectorizer1.get_feature_names(),list(vectorizer1.idf_)))
word_list=set(vectorizer1.get_feature_names())
X_test_scraped_w2v=[]
count={0}
for sentence in tqdm(X_test['clean_text']):
    vector=np.zeros(300)    
    tfidf_weight=0    
    l=len(sentence.split())
    for word in sentence.split():
        if word in glove_words and word in word_list:
            count.add(word)
            vec=glove[word]               
            tfidf=dictionary[word]*(sentence.count(word)/l)            
            vector+=(vec*tfidf)            
            tfidf_weight+=tfidf
    if tfidf_weight !=0:
        vector/=tfidf_weight
    X_test_scraped_w2v.append(vector) 

100%|██████████| 165165/165165 [00:39<00:00, 4189.33it/s]


In [None]:
parameters={'max_depth':[4,5,6,7,8]}
clf = HistGradientBoostingClassifier()
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_scraped_w2v,y_train_scraped)

GridSearchCV(cv=5, estimator=HistGradientBoostingClassifier(),
             param_grid={'max_depth': [4, 5, 6, 7, 8]}, scoring='roc_auc')

In [None]:
clf.best_estimator_

HistGradientBoostingClassifier(max_depth=8)

In [None]:
clf = HistGradientBoostingClassifier(max_depth=8)
clf.fit(X_train_scraped_w2v,y_train_scraped)
Y_pred_tr=clf.predict_proba(X_train_scraped_w2v)

In [None]:
Y_pred_te=clf.predict_proba(X_test_scraped_w2v)

In [None]:
roc_auc_score(y_train_scraped, Y_pred_tr[:,1])

0.8827457462430206

In [None]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8520936277352503

In [None]:
X_te_bias=X_test_scraped[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,latino,198,0.759318,0.710047,0.890941
1,transgender,254,0.735246,0.814572,0.783749
2,asian,332,0.795904,0.77959,0.86816
3,psychiatric_or_mental_illness,502,0.75611,0.801848,0.815547
4,jewish,647,0.747204,0.720878,0.874166
5,homosexual_gay_or_lesbian,1339,0.715342,0.733147,0.845361
6,black,2034,0.713714,0.647084,0.894949
7,muslim,2221,0.742333,0.677204,0.895699
9,christian,2554,0.822677,0.850253,0.825847
8,white,3055,0.719175,0.632638,0.903999


In [None]:
bias_score=score(bias_metrics_df,0.852)
bias_score

0.7969099640313553

In [21]:
from sklearn.linear_model import LogisticRegression
parameters={'C':[0.0001,0.001,0.01,1,10,100]}
clf = LogisticRegression(solver='saga',random_state=42)
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_scraped_w2v,y_train_scraped)

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=42, solver='saga'),
             param_grid={'C': [0.0001, 0.001, 0.01, 1, 10, 100]},
             scoring='roc_auc')

In [24]:
clf.best_estimator_

LogisticRegression(C=1, random_state=42, solver='saga')

In [25]:
clf=LogisticRegression(C=1, random_state=42, solver='saga')
clf.fit(X_train_scraped_w2v,y_train_scraped)
Y_pred_tr=clf.predict_proba(X_train_scraped_w2v)

In [26]:
Y_pred_te=clf.predict_proba(X_test_scraped_w2v)

In [27]:
roc_auc_score(y_train_scraped, Y_pred_tr[:,1])

0.8565867798764764

In [28]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8510600862512072

In [29]:
X_te_bias=X_test[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

NameError: ignored

In [35]:
bias_score=score(bias_metrics_df,0.851)
bias_score

0.8102160379072746

In [25]:
from sklearn.ensemble import RandomForestClassifier
parameters={'n_estimators':[10,20,30],'max_depth':[3,4,5,6],'max_samples':[0.4,0.5,0.7]}
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
clf = GridSearchCV(clf, parameters,cv=5,scoring='roc_auc')
clf.fit(X_train_scraped_w2v,y_train_scraped)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             param_grid={'max_depth': [3, 4, 5, 6],
                         'max_samples': [0.4, 0.5, 0.7],
                         'n_estimators': [10, 20, 30]},
             scoring='roc_auc')

In [26]:
clf.best_estimator_

RandomForestClassifier(max_depth=6, max_samples=0.5, n_estimators=30, n_jobs=-1,
                       random_state=0)

In [28]:
clf=RandomForestClassifier(max_depth=6, max_samples=0.5, n_estimators=30, n_jobs=-1,
                       random_state=0)
clf.fit(X_train_scraped_w2v,y_train_scraped)
Y_pred_tr=clf.predict_proba(X_train_scraped_w2v)

In [29]:
Y_pred_te=clf.predict_proba(X_test_scraped_w2v)

In [30]:
roc_auc_score(y_train_scraped, Y_pred_tr[:,1])

0.8224430145048388

In [31]:
roc_auc_score(y_test, Y_pred_te[:,1])

0.8116417374113483

In [41]:
X_te_bias=X_test[identity_columns]
X_te_bias['y_pred']=Y_pred_te[:,1]
X_te_bias['y']=y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [42]:
bias_metrics_df = compute_bias_metrics_for_model(X_te_bias, identity_columns1)
bias_metrics_df

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
0,latino,246,0.77546,0.678902,0.888943
1,transgender,339,0.721327,0.799409,0.743708
2,asian,478,0.776727,0.789906,0.799699
3,psychiatric_or_mental_illness,634,0.763031,0.807152,0.766291
4,jewish,835,0.724776,0.628813,0.882075
5,homosexual_gay_or_lesbian,1595,0.714674,0.7033,0.82867
6,black,2310,0.685484,0.628498,0.85742
7,muslim,2847,0.702236,0.611607,0.881882
8,white,3685,0.721996,0.620978,0.881622
9,christian,3766,0.79755,0.781703,0.826268


In [43]:
bias_score=score(bias_metrics_df,0.81164)
bias_score

0.765481959446413

In [49]:
x = PrettyTable()
x.field_names = ['Feature',"Train AUC", "Test AUC", "Bias_Score"]
x.add_rows(
    [
        ["GBDT", 0.8827, 0.8551, 0.8054],
        ["GBDT with scraped data",  0.8742, 0.8520, 0.7969],    
        ["Logistic Regression", 0.8541, 0.8522, 0.8057],
        ["LR with scraped data",  0.8566, 0.8511, 0.8102],  
        ["Random Forest", 0.8165, 0.8097, 0.7760],
        ["RF with scraped data",  0.8224, 0.8116, 0.7654]        
    ]
)

In [50]:
print(x)

+------------------------+-----------+----------+------------+
|        Feature         | Train AUC | Test AUC | Bias_Score |
+------------------------+-----------+----------+------------+
|          GBDT          |   0.8827  |  0.8551  |   0.8054   |
| GBDT with scraped data |   0.8742  |  0.852   |   0.7969   |
|  Logistic Regression   |   0.8541  |  0.8522  |   0.8057   |
|  LR with scraped data  |   0.8566  |  0.8511  |   0.8102   |
|     Random Forest      |   0.8165  |  0.8097  |   0.776    |
|  RF with scraped data  |   0.8224  |  0.8116  |   0.7654   |
+------------------------+-----------+----------+------------+
