In [2]:
#import relevant libraries
import en_core_web_sm
from spacy_langdetect import LanguageDetector
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the en_core_web_sm spaCy pipeline and register the language detector as its
# last component, then read the quotes table (two columns: Quote and Author).
nlp = en_core_web_sm.load()
language_detector = LanguageDetector()
nlp.add_pipe(language_detector, name='language_detector', last=True)

quotes_csv_path = r'C:\Users\MainUser\Desktop\Quotes.csv'
df = pd.read_csv(quotes_csv_path)

In [3]:
# Normalise each quote in one pass: trim surrounding whitespace, then lowercase it.
df['Quote'] = df['Quote'].apply(lambda qt: qt.strip().lower())

In [5]:
#define two functions required to break long quotes into individual sentences
def break_sentences(quote, author, df, index):
    '''
    Splits a single quote into sentences and appends one row per sentence to a dataframe.
    Sentences are identified with the module-level spaCy pipeline `nlp`.

    All rows for the quote are built in one frame and concatenated once, rather than
    concatenating a one-row frame per sentence.

    Args:
        quote (str): a quote to be broken into individual sentences. May be of any length.
        author (str): author to be listed against every sentence of the quote
        df (dataframe): a dataframe with a column for quote and a column for author. May be blank or contain
                        prior quotes already broken into sentences. It is not modified in place.
        index (int): index label to give the first sentence; later sentences count up from it

    Returns:
        df (dataframe): the input dataframe with one new row concatenated per sentence in the input quote
        index (int): starting index for any future quotes to be added to the dataframe
    '''
    sentences = [str(sentence) for sentence in nlp(quote).sents]
    if not sentences:
        # No sentences found: nothing to append, and the running index does not advance.
        return df, index

    next_index = index + len(sentences)
    # The scalar author is broadcast across every sentence row of this quote.
    df_sentences = pd.DataFrame({'Quote': sentences, 'Author': author},
                                index=list(range(index, next_index)))
    return pd.concat([df, df_sentences]), next_index

def break_long_quotes_into_sentences(df):
    '''
    Breaks each quote in the input dataframe into separate quotes, each one sentence long, and
    returns each of these new one-sentence quotes as a separate row in a new dataframe. Sentence
    splitting is delegated to break_sentences.

    Rows are read by position, so the input does not need a default 0..n-1 index (for example
    after filtering). The per-quote results are collected and concatenated once at the end,
    so the accumulated frame is not re-copied for every quote.

    Args:
        df (dataframe): a dataframe with a column for quote and a column for author. Quotes are of any length

    Returns:
        df_updated (dataframe): a dataframe with a column for quote and a column for author, one row per
                                sentence, with index labels counting up from 0.

    '''
    # Empty template: gives the result its columns even when the input has no rows, and
    # serves as the (never mutated) base frame handed to break_sentences for each quote.
    empty = pd.DataFrame(columns = ['Quote','Author'])
    pieces = [empty]
    index = 0
    for quote, author in zip(df['Quote'], df['Author']):
        piece, index = break_sentences(quote, author, empty, index)
        pieces.append(piece)
    df_updated = pd.concat(pieces)
    return df_updated

In [7]:
# Replace the quote-level frame with its sentence-level expansion (one row per sentence).
df = break_long_quotes_into_sentences(df=df)

In [8]:
#function to execute the nlp pipeline to extract relevant information about a single quote
def nlp_dict(quote):
    '''
    Runs a quote through the module-level spaCy pipeline `nlp` and summarises the result.

    Only alphabetic tokens are counted as words. For those tokens this records the text of
    every word, the text of every non-stop word, and the lemma of every non-stop word. It
    also reports the detected language with its score, the number of words, and the share
    of words that are stop words.

    Args:
        quote (str): a sentence-length quote

    Returns:
        nlp_dict (dict): A dictionary containing the following for the quote:
        language (str), language_score (float), sentence_length (int),
        stop_word_pct (float; NaN when the quote has no alphabetic tokens),
        words (list), words_ex_stopwords (list), and lemmas (list)

    '''
    doc = nlp(quote)
    detected = doc._.language

    alpha_tokens = [token for token in doc if token.is_alpha]
    content_tokens = [token for token in alpha_tokens if not token.is_stop]

    sentence_length = len(alpha_tokens)
    # Every alphabetic token is either a stop word or a content word.
    stop_word_count = sentence_length - len(content_tokens)
    stop_word_pct = stop_word_count / sentence_length if sentence_length > 0 else np.nan

    return {
        'language': detected['language'],
        'language_score': detected['score'],
        'sentence_length': sentence_length,
        'stop_word_pct': stop_word_pct,
        'words': [token.text for token in alpha_tokens],
        'words_ex_stopwords': [token.text for token in content_tokens],
        'lemmas': [token.lemma_ for token in content_tokens],
    }

In [9]:
# Run nlp_dict once per quote, then fan each entry of the resulting dictionary out into
# its own column. The temporary 'dict' column is removed at the end.
df['dict'] = df['Quote'].apply(nlp_dict)
for field in ('language', 'language_score', 'sentence_length', 'stop_word_pct',
              'words', 'words_ex_stopwords', 'lemmas'):
    df[field] = df['dict'].apply(lambda info, key=field: info[key])
# Space-separated string form of each lemma list.
df['lemmas_text'] = df['lemmas'].apply(' '.join)
df.drop(columns=['dict'], inplace=True)

In [11]:
# Remove rows whose lemma list is empty, i.e. quotes without a single non-stop word.
has_no_lemmas = df['lemmas'].apply(len) == 0
df.drop(df.index[has_no_lemmas], inplace=True)

In [12]:
# De-duplicate on the stringified word list (lists are not hashable, hence the cast to str),
# keeping the first occurrence, and report how many rows were removed.
rows_before = len(df.index)
index_to_keep = df.astype(str).drop_duplicates(subset=['words'], keep='first').index
rows_kept = len(index_to_keep)
print("Number of Duplicates Dropped: "+ str(rows_before - rows_kept))
print("Fraction Duplicates: "+ str(1-rows_kept/rows_before))
df.drop(df.index.difference(index_to_keep), inplace=True)

Number of Duplicates Dropped: 1720
Fraction Duplicates: 0.13756698392385824


# Removing Non-English Quotes

### After examining Spacy's language classification of quotes, it became clear we needed a better system to clean the quotes than just deleting all non-English quotes.

I found that a better method for language detection was to combine spaCy's language classification with my stop-word percentage metric to identify non-English quotes *(the stop words are all English, so non-English quotes are likely to have a small percentage of stop words)*. Removing non-English quotes is done on a language-by-language basis because of the differences across languages. This process can be followed in the steps below (starting with the least common languages and building up to the most common).

Note that many of these cells have not been executed in the version uploaded to GitHub, so that viewing the code stays manageable. Executing one of these cells returns a dataframe, which is then visually examined to determine which quotes to remove.

In [95]:
# Peek at rows spaCy tagged as something other than English -- many of them are in fact English!
df.loc[df['language'] != 'en'].head()

Unnamed: 0,Quote,Author,language,language_score,sentence_length,stop_word_pct,words,words_ex_stopwords,lemmas,lemmas_text
8,“in vain have i struggled.,Jane Austen,da,0.999993,5,0.6,"[in, vain, have, i, struggled]","[vain, struggled]","[vain, struggle]",vain struggle
14,i require so much!”,Jane Austen,ca,0.571429,4,0.75,"[i, require, so, much]",[require],[require],require
19,it is too long ago.,Jane Austen,tl,0.999996,5,0.6,"[it, is, too, long, ago]","[long, ago]","[long, ago]",long ago
28,“you pierce my soul.,Jane Austen,fr,0.999997,4,0.5,"[you, pierce, my, soul]","[pierce, soul]","[pierce, soul]",pierce soul
29,"i am half agony, half hope...",Jane Austen,cy,0.999996,6,0.333333,"[i, am, half, agony, half, hope]","[half, agony, half, hope]","[half, agony, half, hope]",half agony half hope


In [96]:
# Count quotes per detected language. The least common languages are reviewed all together
# first; the 14 most common are then reviewed on a language-by-language basis.
df.loc[:, 'language'].value_counts()

en    8767
es     609
it     240
fr     144
cy     137
pt     112
af      89
no      67
de      62
da      58
nl      47
id      45
ro      42
tr      42
ca      39
et      38
tl      37
hr      34
so      31
fi      20
pl      15
vi      14
sq      11
el      11
lt      11
cs      10
sl      10
sw       9
hu       8
sv       7
sk       7
ru       5
lv       2
fa       1
bg       1
ja       1
Name: language, dtype: int64

In [None]:
# Change the pandas display setting to show all of the rows, then show only the least
# common languages (everything outside the 14 most common), sorted by stop word percent.
# My assessment: drop the languages other than the top 14 with stop word percent < 22%,
# plus rows 4685, 4667, 9115, 5433, 2268, 3976, 1245. Drops ~20 English quotes.
pd.set_option('display.max_rows', None)
top14_languages = ['en', 'es', 'it', 'fr', 'cy', 'pt', 'af', 'no', 'de', 'da', 'nl', 'id', 'ro', 'tr']
# A single negated isin() replaces fourteen separate "!=" masks AND-ed together.
df[~df['language'].isin(top14_languages)].sort_values(by=['stop_word_pct','language'])


In [None]:
# Assessment for 'tr': drop rows with stop word percent < 30%. Drops ~5 English quotes.
df.loc[df['language'] == 'tr'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'ro': drop rows with stop word percent < 30%, and row 11499. Drops ~5 English quotes.
df.loc[df['language'] == 'ro'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'id': drop rows with stop word percent < 30%. Drops ~2 English quotes.
df.loc[df['language'] == 'id'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'nl': keep every row.
df.loc[df['language'] == 'nl'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'da': keep every row.
df.loc[df['language'] == 'da'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'de': drop rows with stop word percent < 30%. Drops ~5 English quotes.
df.loc[df['language'] == 'de'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'no': keep every row.
df.loc[df['language'] == 'no'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'af': keep every row.
df.loc[df['language'] == 'af'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'pt': drop rows with stop word percent < 30%, and rows 5813, 5827, 5801, 5028.
# Drops ~5 English quotes.
df.loc[df['language'] == 'pt'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'cy': keep every row.
df.loc[df['language'] == 'cy'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'fr': drop rows with stop word percent < 25%. Drops ~10 English quotes.
df.loc[df['language'] == 'fr'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'it': drop rows with stop word percent < 41%, and row 3464. I observed 5 English
# quotes dropped, but I also didn't scroll as carefully through all of these quotes. I expect
# there are < 10 English quotes dropped.
df.loc[df['language'] == 'it'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'es': drop rows with stop word percent < 39%, and rows 2872, 1251. I didn't observe
# any English quotes dropped, but I also didn't scroll as carefully through all of these quotes.
# I expect there are < 5 English quotes dropped.
df.loc[df['language'] == 'es'].sort_values(by='stop_word_pct')

In [None]:
# Assessment for 'en': glancing through, none seem to be misclassified, but let's take a closer
# look at longer sentences with few or no stop words, as they are the most likely to be misclassified.
df.loc[df['language'] == 'en'].sort_values(by='stop_word_pct')

In [None]:
# English-tagged sentences of more than 5 words with under 15% stop words.
# Assessment: should drop row 5660 (it's English, but not really a quote per se), and
# row 7608, which doesn't appear to be English.
is_english = df['language'] == 'en'
few_stop_words = df['stop_word_pct'] < 0.15
long_sentence = df['sentence_length'] > 5
df[is_english & few_stop_words & long_sentence]

In [111]:
# Dropping the non-English quotes identified in the review steps above. Estimated total
# number of English quotes dropped along the way is 60 quotes.

# Per-language thresholds: within each listed language, rows whose stop word percent is
# below the threshold are dropped.
to_drop_lang_pct = [('es',.39),('it',.41),('fr',.25),('pt',.3),('de',.3),('tr',.3),('ro',.3),('id',.3)]
for lang, pct in to_drop_lang_pct:
    below_threshold = (df['language'] == lang) & (df['stop_word_pct'] < pct)
    df.drop(df.index[below_threshold], inplace=True)

# Individually spotted outlier rows, identified by the index labels shown during review.
# This must run before the reset_index below, which renumbers the rows.
# 2872 and 1251 were called out in the 'es' review but were missing from this list.
# NOTE(review): the 'en' review names row 7608, whereas this list has 7680 -- confirm which
# row was intended before relying on the output.
to_drop_index = [4685,4667,9115,5433,2268,3976,1245,11499,5813,5827,5801,5028,3464,2872,1251,5660,7680]
# A label may already be gone (e.g. removed by a threshold above); report it instead of
# failing with a KeyError part-way through the cell.
labels_missing = [label for label in to_drop_index if label not in df.index]
if labels_missing:
    print("Outlier rows not present (already removed?): " + str(labels_missing))
df.drop([label for label in to_drop_index if label in df.index], inplace=True)

# Less-common languages (everything outside the top 14): drop rows with stop word
# percentage less than 22%.
df.reset_index(drop=True,inplace=True)
# Despite the name, these are the 14 most common languages, which this step leaves alone.
to_drop_top14langs = ['en','es','it','fr','cy','pt','af','no','de','da','nl','tr','ro','id']
lang_cond = ~df['language'].isin(to_drop_top14langs)
stop_words_cond = df['stop_word_pct'] < .22
df.drop(df.index[lang_cond & stop_words_cond], inplace=True)

In [157]:
# Write the cleaned quotes dataframe to CSV, leaving out the row index.
cleaned_csv_path = r'C:\Users\MainUser\Desktop\Quotes_Cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)