In [8]:
!pip install gender-guesser
!pip install genderize

Collecting genderize
  Downloading Genderize-0.3.1-py3-none-any.whl (5.7 kB)
Installing collected packages: genderize
Successfully installed genderize-0.3.1


In [45]:
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
from genderize import Genderize
import matplotlib.pyplot as plt
import seaborn as sns
import os
import spacy
import warnings
warnings.filterwarnings('ignore')

In [46]:
# Helper functions for name extraction and gender detection


def remove_subsets(names):
    """Drop every name that is a proper prefix or suffix of a longer name.

    Names are processed longest first. For each name that is still kept, all
    of its proper prefixes and proper suffixes (and the empty string) are
    discarded. Matching is character-based, not word-based: with
    "donald trump" present, "donald", "trump" and also "don" are removed.

    Args:
        names: iterable of strings.

    Returns:
        list[str]: the surviving names, de-duplicated, in the order they
        first appear in ``names`` (deterministic across runs, unlike a
        list built directly from a set).
    """
    names = list(names)  # allow any iterable; we need two passes
    by_length = sorted(names, key=len, reverse=True)
    kept = set(by_length)

    for name in by_length:
        # A name already discarded as a fragment must not discard others.
        if name not in kept:
            continue
        prefixes = {name[:i] for i in range(len(name))}
        suffixes = {name[i:] for i in range(1, len(name))}
        kept.difference_update(prefixes | suffixes)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [name for name in dict.fromkeys(names) if name in kept]

# Loaded spaCy pipelines, keyed by model name, so each model is loaded once.
_NLP_PIPELINES = {}

# The only pronouns kept by extract_names.
_GENDERED_PRONOUNS = ('he', 'she', 'her', 'his', 'him', 'himself', 'herself')


def _get_nlp(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_PIPELINES:
        # prefer_gpu must be requested before the pipeline is loaded.
        spacy.prefer_gpu()
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def extract_names(text):
    """Extract person names and gendered pronouns from ``text``.

    Args:
        text: the string to analyse with the "en_core_web_sm" pipeline.

    Returns:
        list[str]: lower-cased PERSON entity texts (filtered through
        ``remove_subsets``) followed by the distinct lower-cased tokens tagged
        PRON that are one of 'he', 'she', 'her', 'his', 'him', 'himself',
        'herself', in order of first appearance.
    """
    doc = _get_nlp()(text)

    # Extract entities recognized as PERSON
    names = remove_subsets([ent.text.lower() for ent in doc.ents if ent.label_ == "PERSON"])

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    distinct_pronouns = dict.fromkeys(
        token.text.lower() for token in doc if token.pos_ == "PRON"
    )
    pronouns = [pronoun for pronoun in distinct_pronouns if pronoun in _GENDERED_PRONOUNS]

    return names + pronouns


# Shared gender-guesser detector, created lazily by _get_detector().
_GENDER_DETECTOR = None


def _get_detector():
    """Return the shared case-insensitive detector, creating it on first use."""
    global _GENDER_DETECTOR
    if _GENDER_DETECTOR is None:
        _GENDER_DETECTOR = gender.Detector(case_sensitive=False)
    return _GENDER_DETECTOR


def detect_gender(name):
    """Return the gender label that gender-guesser assigns to ``name``.

    The detector is case-insensitive and is built once, then reused, instead
    of being constructed again for every name.

    Args:
        name: a single name (one word) to look up.

    Returns:
        Whatever ``Detector.get_gender`` returns for ``name``.
    """
    return _get_detector().get_gender(name)

def decide_gender_for_full_name(full_name):
    """Decide a gender for a whitespace-separated name or a gendered pronoun.

    Each word votes: gendered pronouns vote via a fixed lookup table, every
    other word votes with the label from ``detect_gender``. Only 'male' and
    'female' labels count as votes; any other label is ignored, so an
    unrecognised surname cannot cancel a recognised first name.

    Args:
        full_name: string such as "donald trump" or "she".

    Returns:
        'male' or 'female' when that side has strictly more votes, otherwise
        'unknown' (no votes at all, or a male/female tie).
    """
    pronouns = {
        'he': 'male',
        'she': 'female',
        'her': 'female',
        'his': 'male',
        'him': 'male',
        'himself': 'male',
        'herself': 'female'
    }
    votes = {'male': 0, 'female': 0}

    for part in full_name.split():
        label = pronouns.get(part.lower())
        if label is None:
            label = detect_gender(part)
        # Labels other than 'male'/'female' carry no vote.
        if label in votes:
            votes[label] += 1

    if votes['male'] > votes['female']:
        return 'male'
    if votes['female'] > votes['male']:
        return 'female'
    return 'unknown'

def count_genders(names_list, verbose=False):
    """Return the majority gender over a list of names and pronouns.

    Every entry is classified with ``decide_gender_for_full_name``; entries
    classified as anything other than 'male' or 'female' do not vote.

    Args:
        names_list: iterable of name/pronoun strings.
        verbose: when True, print each entry followed by its label
            (off by default so applying this over a DataFrame does not
            flood the cell output).

    Returns:
        'male' or 'female' when that label has strictly more entries,
        otherwise 'unknown' (nothing gendered found, or an exact tie).
    """
    male_count = 0
    female_count = 0

    for full_name in names_list:
        final_gender = decide_gender_for_full_name(full_name)
        if final_gender == 'male':
            male_count += 1
        elif final_gender == 'female':
            female_count += 1
        else:
            final_gender = 'unknown'

        if verbose:
            print(full_name)
            print(final_gender)

    if male_count > female_count:
        return 'male'
    if female_count > male_count:
        return 'female'
    # Either no gendered entries at all or a tie: do not guess.
    return 'unknown'


In [47]:
# Read the Fake.csv and True.csv files and report each frame's (rows, columns).
fake_csv = "/content/drive/MyDrive/MisleadingContent/Fake.csv"
true_csv = "/content/drive/MyDrive/MisleadingContent/True.csv"
df_fake, df_true = pd.read_csv(fake_csv), pd.read_csv(true_csv)
for frame in (df_fake, df_true):
    print(frame.shape)


(23481, 4)
(21417, 4)


In [48]:
# Keep only articles whose title + text is at most 512 characters, because the
# model's max_length is 512.
# NOTE(review): this limit is applied to characters; if the model's max_length
# is counted in tokens, the threshold should be revisited — confirm.
MAX_CHARS = 512


def keep_short_articles(df, max_chars=MAX_CHARS):
    """Return the rows of ``df`` whose title + text length is <= ``max_chars``.

    A 'length' column (len(title) + len(text), in characters) is added to the
    result. Rows where either field is missing get a NaN length and are
    dropped by the comparison. The returned frame is an independent copy, so
    later column assignments do not write into a slice of the input frame.
    """
    lengths = df['title'].str.len() + df['text'].str.len()
    return df.assign(length=lengths).loc[lengths <= max_chars].copy()


df_fake = keep_short_articles(df_fake)
print(df_fake.shape)

df_true = keep_short_articles(df_true)
print(df_true.shape)

(2175, 5)
(1479, 5)


In [None]:
# Extract names and gendered pronouns from each article, then label each
# article with the majority gender among them.
from tqdm import tqdm
# Registers Series.progress_apply, which shows a progress bar while applying.
tqdm.pandas(desc="My Progress Bar")


df_fake['names_pronouns'] = df_fake['text'].progress_apply(extract_names)
df_true['names_pronouns'] = df_true['text'].progress_apply(extract_names)

# define gender
df_fake['gender'] = df_fake['names_pronouns'].progress_apply(count_genders)
df_true['gender'] = df_true['names_pronouns'].progress_apply(count_genders)

