- Prepare the datasets for the models: each one is saved as a .csv file with 'text' and 'label' columns (plus 'pos_tags' and 'rel_tags').
- Optionally, each dataset can also be split into train/val/test sets (80/10/10).

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so the dataset paths used below resolve.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [1]:
import pandas as pd
import numpy as np
import spacy
# Shared spaCy pipeline ("en_core_web_sm"); pos_tagger and dep_tagger below
# both parse their input through this object.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Directory that the processed dataset CSVs and their train/valid/test splits are written to.
data_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"

In [2]:
### POS and dep tagger
def pos_tagger(text):
    """Return the part-of-speech tags of `text` as one space-separated string.

    `text` is parsed with the module-level spaCy pipeline `nlp`, and the
    `pos_` attribute of every token is emitted in token order.
    """
    return ' '.join(token.pos_ for token in nlp(text))

def dep_tagger(text):
    """Return the dependency labels of `text` as one space-separated string.

    `text` is parsed with the module-level spaCy pipeline `nlp`, and the
    `dep_` attribute of every token is emitted in token order.
    """
    parsed = nlp(text)
    labels = (token.dep_ for token in parsed)
    return ' '.join(labels)

# Wiki NPOV

In [None]:
# Raw NPOV pairs. The file is read with header=None, so its first line is
# treated as data and the three columns are named explicitly.
npov_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/ReidPryzant_data/WNC/new_biased_full.csv'
npov_df = pd.read_csv(
    npov_root,
    sep=',',
    header=None,
    names=['id', 'biased', 'unbiased'],
)

In [None]:
# One row per sentence: the 'biased' column becomes text with label 1, the
# 'unbiased' column becomes text with label 0.
df1 = npov_df[['biased']].rename(columns={'biased': 'text'}).assign(label=1)
df2 = npov_df[['unbiased']].rename(columns={'unbiased': 'text'}).assign(label=0)

# Sorting the stacked frames on their original row index puts the two
# sentences of each pair next to each other; the index is then renumbered.
df = pd.concat([df1, df2]).sort_index(kind='merge').reset_index(drop=True)

# Space-separated POS and dependency tag strings for every sentence.
df['pos_tags'] = df['text'].apply(pos_tagger)
df['rel_tags'] = df['text'].apply(dep_tagger)

# Persist the processed dataset.
data_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"
df.to_csv(data_folder + 'NPOV.csv', index=False)

In [None]:
# Reload the processed NPOV dataset from disk; npov_df now refers to the
# processed file rather than the raw pairs.
npov_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/NPOV.csv'
npov_df = pd.read_csv(npov_root, sep=',')

In [None]:
# 80% of the rows are sampled for training (fixed seed); the rows that were
# not picked are halved, again with a fixed seed, into validation and test.
training_data = npov_df.sample(frac=0.8, random_state=25)
leftover_data = npov_df[~npov_df.index.isin(training_data.index)]
validating_data = leftover_data.sample(frac=0.5, random_state=25)
testing_data = leftover_data[~leftover_data.index.isin(validating_data.index)]

# Write the three splits into data_folder.
training_data.to_csv(data_folder + 'NPOV_train.csv', index=False)
validating_data.to_csv(data_folder + 'NPOV_valid.csv', index=False)
testing_data.to_csv(data_folder + 'NPOV_test.csv', index=False)

# Wiki Neutrality Corpus

In [None]:
# Raw neutral sentences. Read with header=None; only the column named 'text'
# is used later, the remaining columns get placeholder names.
wnc_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/ReidPryzant_data/WNC/neutral.csv'
wnc_df = pd.read_csv(
    wnc_root,
    sep=',',
    header=None,
    names=['id', '1', '2', 'text', '4'],
)

In [None]:
# Keep only the sentence text, label every row 0, and discard rows whose
# text is missing before tagging.
df = wnc_df[['text']].assign(label=0).dropna()

# Space-separated POS and dependency tag strings for every sentence.
df['pos_tags'] = df['text'].apply(pos_tagger)
df['rel_tags'] = df['text'].apply(dep_tagger)

# Persist the processed dataset.
data_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"
df.to_csv(data_folder + 'WNC.csv', index=False)

In [None]:
# Reload the processed WNC dataset from disk; wnc_df now refers to the
# processed file rather than the raw one.
wnc_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/WNC.csv'
wnc_df = pd.read_csv(wnc_root, sep=',')

In [None]:
# 80% of the rows are sampled for training (fixed seed); the rows that were
# not picked are halved, again with a fixed seed, into validation and test.
training_data = wnc_df.sample(frac=0.8, random_state=25)
leftover_data = wnc_df[~wnc_df.index.isin(training_data.index)]
validating_data = leftover_data.sample(frac=0.5, random_state=25)
testing_data = leftover_data[~leftover_data.index.isin(validating_data.index)]

# Write the three splits into data_folder.
training_data.to_csv(data_folder + 'WNC_train.csv', index=False)
validating_data.to_csv(data_folder + 'WNC_valid.csv', index=False)
testing_data.to_csv(data_folder + 'WNC_test.csv', index=False)

# CrowS-Pairs

In [None]:
# Raw CrowS-Pairs file. The first line is skipped (presumably the file's own
# header row -- confirm) and the eight columns are named explicitly.
crows_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Other Data/crows_pairs_anonymized.csv'
crows_df = pd.read_csv(
    crows_root,
    sep=',',
    header=None,
    skiprows=1,
    names=['id', 'text1', 'text2', 'type', '4', '5', '6', '7'],
)

In [None]:
# One row per sentence: both sentences of every pair ('text1' and 'text2')
# are kept as text and both receive label 1.
df1 = crows_df[['text1']].rename(columns={'text1': 'text'}).assign(label=1)
df2 = crows_df[['text2']].rename(columns={'text2': 'text'}).assign(label=1)

# Sorting the stacked frames on their original row index puts the two
# sentences of each pair next to each other; the index is then renumbered.
df = pd.concat([df1, df2]).sort_index(kind='merge').reset_index(drop=True)

# Space-separated POS and dependency tag strings for every sentence.
df['pos_tags'] = df['text'].apply(pos_tagger)
df['rel_tags'] = df['text'].apply(dep_tagger)

# Persist the processed dataset.
data_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"
df.to_csv(data_folder + 'CrowS-Pairs.csv', index=False)

In [None]:
# Reload the processed CrowS-Pairs dataset from disk; crows_df now refers to
# the processed file rather than the raw pairs.
crows_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/CrowS-Pairs.csv'
crows_df = pd.read_csv(crows_root, sep=',')

In [None]:
# 80% of the rows are sampled for training (fixed seed); the rows that were
# not picked are halved, again with a fixed seed, into validation and test.
training_data = crows_df.sample(frac=0.8, random_state=25)
leftover_data = crows_df[~crows_df.index.isin(training_data.index)]
validating_data = leftover_data.sample(frac=0.5, random_state=25)
testing_data = leftover_data[~leftover_data.index.isin(validating_data.index)]

# Write the three splits into data_folder.
training_data.to_csv(data_folder + 'CrowS-Pairs_train.csv', index=False)
validating_data.to_csv(data_folder + 'CrowS-Pairs_valid.csv', index=False)
testing_data.to_csv(data_folder + 'CrowS-Pairs_test.csv', index=False)

# Stereotype Dataset

In [None]:
# Raw annotated stereotype data. The first line is skipped (presumably the
# file's own header row -- confirm) and the four columns are named explicitly.
stereo_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Other Data/annotated_data.csv'
stereo_df = pd.read_csv(
    stereo_root,
    sep=',',
    header=None,
    skiprows=1,
    names=['id', 'text', 'explicit', 'implicit'],
)

In [None]:
def labeller(row):
    """Collapse the 'explicit' and 'implicit' annotations of `row` into one label.

    Returns 0 only when both fields are exactly the string 'no'; every other
    combination of values returns 1.
    """
    both_negative = row['explicit'] == 'no' and row['implicit'] == 'no'
    return 0 if both_negative else 1

In [None]:
# Derive the binary label row by row from the 'explicit'/'implicit' columns.
stereo_df['label'] = stereo_df.apply(labeller, axis=1)

In [None]:
# Keep only the sentence text and the derived binary label.
df = stereo_df.loc[:, ['text', 'label']].copy()

# Space-separated POS and dependency tag strings for every sentence.
df['pos_tags'] = df['text'].apply(pos_tagger)
df['rel_tags'] = df['text'].apply(dep_tagger)

# Persist the processed dataset.
data_folder = "/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/"
df.to_csv(data_folder + 'Stereo.csv', index=False)

In [None]:
# Reload the processed Stereo dataset from disk; stereo_df now refers to the
# processed file rather than the raw annotations.
stereo_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/Stereo.csv'
stereo_df = pd.read_csv(stereo_root, sep=',')

In [None]:
# 80% of the rows are sampled for training (fixed seed); the rows that were
# not picked are halved, again with a fixed seed, into validation and test.
training_data = stereo_df.sample(frac=0.8, random_state=25)
leftover_data = stereo_df[~stereo_df.index.isin(training_data.index)]
validating_data = leftover_data.sample(frac=0.5, random_state=25)
testing_data = leftover_data[~leftover_data.index.isin(validating_data.index)]

# Write the three splits into data_folder.
training_data.to_csv(data_folder + 'Stereo_train.csv', index=False)
validating_data.to_csv(data_folder + 'Stereo_valid.csv', index=False)
testing_data.to_csv(data_folder + 'Stereo_test.csv', index=False)

# Mixed Dataset

In [None]:
# Build the mixed dataset from the four processed datasets:
#   - NPOV:        3000 rows with label 1 and 3000 rows with label 0
#   - Stereo:      1000 rows with label 1 and 1000 rows with label 0
#   - CrowS-Pairs: 1000 rows
#   - WNC:         1000 rows
# Every draw is seeded (same seed as the train/val/test splits in this
# notebook) so that re-running the cell regenerates the same Mixed.csv;
# unseeded sampling produced a different dataset on every run.
df1 = npov_df[npov_df['label'] == 1].sample(n=3000, random_state=25)
df2 = npov_df[npov_df['label'] == 0].sample(n=3000, random_state=25)
df3 = stereo_df[stereo_df['label'] == 1].sample(n=1000, random_state=25)
df4 = stereo_df[stereo_df['label'] == 0].sample(n=1000, random_state=25)
df5 = crows_df.sample(n=1000, random_state=25)
df6 = wnc_df.sample(n=1000, random_state=25)

mixed_df = pd.concat([df1, df2, df3, df4, df5, df6]).reset_index(drop=True)
# Shuffle so rows from the different sources are interleaved; seeded for the
# same reproducibility reason as the draws above.
mixed_df = mixed_df.sample(frac=1, random_state=25).reset_index(drop=True)

# save data
mixed_df.to_csv(data_folder + 'Mixed.csv', index=False)

In [None]:
# Reload the mixed dataset from disk.
mixed_root = '/content/drive/MyDrive/Colab Notebooks/Amplifi Project/Data/Processed Datasets/Mixed.csv'
mixed_df = pd.read_csv(mixed_root, sep=',')

In [None]:
# 80% of the rows are sampled for training (fixed seed); the rows that were
# not picked are halved, again with a fixed seed, into validation and test.
training_data = mixed_df.sample(frac=0.8, random_state=25)
leftover_data = mixed_df[~mixed_df.index.isin(training_data.index)]
validating_data = leftover_data.sample(frac=0.5, random_state=25)
testing_data = leftover_data[~leftover_data.index.isin(validating_data.index)]

# Write the three splits into data_folder.
training_data.to_csv(data_folder + 'Mixed_train.csv', index=False)
validating_data.to_csv(data_folder + 'Mixed_valid.csv', index=False)
testing_data.to_csv(data_folder + 'Mixed_test.csv', index=False)