In [1]:
# Standard DS imports
import pandas as pd
import numpy as np

# Vizualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# NLP imports
import re
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

# Custom imports
import acquire as a
import env

import os
import requests
import random

## Acquire

In [17]:
df = a.process_all_repos()

## Prepare

##### Prepare Steps:
- Lowercase All Text
- Remove accented characters and non-ASCII characters
- Remove special characters
- Tokenize
- Lemmatize
- Remove Stopwords
- Add extra Stopwords

In [3]:
'''def clean(text):
    ‘A simple function to cleanup text data’
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words(‘english’)
    text = (unicodedata.normalize(‘NFKD’, text)
             .encode(‘ascii’, ‘ignore’)
             .decode(‘utf-8’, ‘ignore’)
             .lower())
    text = text.replace(‘/’, ' ‘)
    text = text.replace(‘-’, ' ‘)
    words = re.sub(r”[^a-z0-9’+\s]“, ‘’, text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]'''

"def clean(text):\n    ‘A simple function to cleanup text data’\n    wnl = nltk.stem.WordNetLemmatizer()\n    stopwords = nltk.corpus.stopwords.words(‘english’)\n    text = (unicodedata.normalize(‘NFKD’, text)\n             .encode(‘ascii’, ‘ignore’)\n             .decode(‘utf-8’, ‘ignore’)\n             .lower())\n    text = text.replace(‘/’, ' ‘)\n    text = text.replace(‘-’, ' ‘)\n    words = re.sub(r”[^a-z0-9’+\\s]“, ‘’, text).split()\n    return [wnl.lemmatize(word) for word in words if word not in stopwords]"

In [4]:
def basic_clean(string):
    '''
    This function takes in the original text.
    The text is all lowercased, 
    the text is encoded in ascii and any characters that are not ascii are ignored.
    The text is then decoded in utf-8 and any characters that are not ascii are ignored
    Additionally, special characters are all removed.
    A clean article is then returned
    '''
    #lowercase
    string = string.lower()
    
    #normalize
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    string = string.replace('/',' ')
    string = string.replace('-',' ')
    #remove special characters and replaces it with blank
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    
    return string

In [5]:
def tokenize(string):
    '''
    This function takes in a string
    and returns the string as individual tokens put back into the string
    '''
    #create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()

    #use the tokenizer
    string = tokenizer.tokenize(string, return_str = True)

    return string

In [6]:
def lemmatize(string):
    '''
    This function takes in a string
    and returns the lemmatized word joined back into the string
    '''
    #create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    #look at the article 
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    #join lemmatized words into article
    string = ' '.join(lemmas)

    return string

In [7]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in text, extra words and exclude words
    and returns a list of text with stopword removed
    '''
    #create stopword list
    stopword_list = stopwords.words('english')
    
    #remove excluded words from list
    stopword_list = set(stopword_list) - set(exclude_words)
    
    #add the extra words to the list
    stopword_list = stopword_list.union(set(extra_words))
    
    #split the string into different words
    words = string.split()
    
    #create a list of words that are not in the list
    filtered_words = [word for word in words if word not in stopword_list]
    
    #join the words that are not stopwords (filtered words) back into the string
    string = ' '.join(filtered_words)
    
    return string

In [8]:
def transform_data(df):
    df = df.rename(columns={'readme_contents':'original'})
    # df['clean'] = cleaned and tokenized version with stopwords removed
    df['clean'] = df['original'].apply(basic_clean
                                      ).apply(tokenize
                                             ).apply(remove_stopwords)
    # df['lemmatized'] = lemmatized version of clean data
    df['lematized'] = df['clean'].apply(lemmatize)
    
    for i, x in enumerate(df.lematized):
        df['lematized'][i] = x.split()
    
    return df

In [9]:
def transform_data(df):
    df = df.rename(columns={'readme_contents':'original'})
    # df['clean'] = cleaned and tokenized version with stopwords removed
    df['clean'] = df['original'].apply(basic_clean
                                      ).apply(tokenize
                                             ).apply(remove_stopwords)
    # df['lematized'] = lemmatized version of clean data
    df['lematized'] = df['clean'].apply(lemmatize)
    
    # Split lemmatized strings into lists of words
    df['lematized'] = df['lematized'].apply(lambda x: x.split())
    
    # Drop strings longer than 15 characters
    df['lematized'] = df['lematized'].apply(lambda x: [word for word in x if len(word) <= 15])
    
    return df

In [21]:
def transform_data(df, extra_stopwords= ['repository', 'githubcom', 'host']):
    df = df.rename(columns={'readme_contents':'original'})
    # df['clean'] = cleaned and tokenized version with stopwords removed
    df['clean'] = df['original'].apply(basic_clean
                                      ).apply(tokenize
                                             ).apply(remove_stopwords, extra_words=extra_stopwords)
    # df['lematized'] = lemmatized version of clean data
    df['lematized'] = df['clean'].apply(lemmatize)
    
    return df

In [24]:
def transform_data(df, extra_stopwords= ['repository', 'githubcom', 'host']):
    df = df.rename(columns={'readme_contents':'original'})
    # df['clean'] = cleaned and tokenized version with stopwords removed
    df['clean'] = df['original'].apply(basic_clean
                                      ).apply(tokenize
                                             ).apply(remove_stopwords, extra_words=extra_stopwords)
    # df['lematized'] = lemmatized version of clean data
    df['lematized'] = df['clean'].apply(lemmatize)
    
    # Split lemmatized strings into lists of words
    df['lematized'] = df['lematized'].apply(lambda x: x.split())
    
    # Drop words longer than 15 characters
    df['lematized'] = df['lematized'].apply(lambda x: [word for word in x if len(word) <= 15])
    
    # Join lists of words back into strings
    df['lematized'] = df['lematized'].apply(lambda x: ' '.join(x))
    
    return df

In [25]:
df = transform_data(df)

In [29]:
df.lematized[0].value_counts()

AttributeError: 'str' object has no attribute 'value_counts'

In [None]:
for i, x in enumerate(df.lematized[10]):
    for content in x:
        if len(content) > 15:
            x.replace(content, np.nan)
            df.lematized[10][i].dropna()

In [None]:
type(df.lematized[10])

str

In [None]:
df.lematized[0].split()

['repository',
 'host',
 'code',
 'datasets',
 'relating',
 'responsible',
 'nlp',
 'project',
 'meta',
 'ai',
 'project',
 'holisticbiashttpsgithubcomfacebookresearchresponsiblenlptreemainholisticbias',
 'eric',
 'michael',
 'smith',
 'melissa',
 'hall',
 'melanie',
 'kambadur',
 'eleonora',
 'presani',
 'adina',
 'williams',
 "'",
 'sorry',
 'hear',
 'finding',
 'bias',
 'language',
 'model',
 'holistic',
 'descriptor',
 'dataset',
 '2022httpsarxivorgpdf220509209pdf',
 'code',
 'generate',
 'dataset',
 'holisticbias',
 'consisting',
 'nearly',
 '600',
 'demographic',
 'term',
 '450k',
 'sentence',
 'prompt',
 'code',
 'calculate',
 'likelihood',
 'bias',
 'metric',
 'amount',
 'bias',
 'language',
 'model',
 'defined',
 'holisticbias',
 'demographic',
 'term',
 'fairscorehttpsgithubcomfacebookresearchresponsiblenlptreemainfairscore',
 'rebecca',
 'qian',
 'candace',
 'ross',
 'jude',
 'fernandes',
 'eric',
 'smith',
 'douwe',
 'kiela',
 'adina',
 'williams',
 'perturbation',
 'augmen