In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# Baseline percentage that the dominant gender of a suffix must exceed for the suffix to be kept
RATIO_THRESHOLD = 85

# Longest suffix analysed: every suffix length from 1 up to this value (included) is considered
MAX_SUFFIX_LENGTH = 4

# Bound on the number of suffixes collected for display
ROW_COUNT = 100

In [3]:
# Location of the tab-separated list of nouns; the cells below read its
#   "noun", "gender" and "count" columns
PATH_OR_URL = "https://raw.githubusercontent.com/laowantong/Google-French-Nouns-with-Genders/master/french_nouns.tsv"

# Load the whole table once; every later cell starts from this frame
df = pd.read_csv(
    PATH_OR_URL,
    encoding="utf8",
    sep="\t",
)

# Calculating the results

In [4]:
def group_by_suffix_length(df, n):
    """Calculate a table of gender frequencies for all the suffixes of length n.

    Parameters:
        df: a DataFrame with (at least) the columns "noun", "gender" and "count".
            It is only read: no helper column is added to it, even temporarily.
        n: the length of the suffixes. Nouns shorter than n are ignored.

    Returns:
        A DataFrame indexed by (total, suffix, gender), where total is the sum of the
        counts of all the nouns sharing the suffix, with two columns:
        - "freq": the percentage of this total carried by the given gender;
        - "nouns": a tuple of the nouns sharing both suffix and gender, sorted by
          decreasing count (ties broken by decreasing noun).
    """

    # Preliminary step. Work on a filtered and sorted copy, so that the caller's
    #   DataFrame is never modified.
    eligible = df[df["noun"].str.len() >= n] # filter out any word shorter than the desired suffix length
    ordered = eligible.sort_values(by=["count", "noun"], ascending=False) # a groupby keeps the rows of each group in this order
    suffixes = ordered["noun"].str.slice(-n).rename("suffix") # grouping key, sharing the index of ordered

    # First step. Create a left table similar to (example for n == 3):
    #
    #                  freq                                                     nouns
    # suffix gender
    # ...
    # ace    f      6572736 (place, surface, menace, face, race, trace, préface, ...
    #        m      1034050 (espace, face, place, menace, cyberespace, palace, ra...
    # ach    m        23832 (krach, coach, almanach, midrach)
    # ack    m        14987 (feedback, crack, pack, black, snack, biofeedback, back)
    # ...
    #
    # NB: An erroneous masculine gender is also assigned to "face", "place", and "menace".
    #     This corresponds to relatively high-frequency errors in the Google digram dataset.

    grouped = ordered.assign(
            noun=ordered["noun"].astype(object) # plain Python objects, so that a cell can hold a tuple
        ).groupby([
            suffixes, # group by both suffix
            "gender"  #   and gender
        ])
    left = pd.DataFrame({
        "freq": grouped["count"].sum(),    # totalize the frequencies for each couple (suffix, gender)
        "nouns": grouped["noun"].agg(tuple), # nouns of the group, already sorted by decreasing frequency
    })

    # Second step. Create a right table similar to:
    #
    #           total
    # suffix
    # ...
    # ace     7606786
    # ach       23832
    # ack       14987
    # ...

    right = ordered.groupby(suffixes)["count"].sum().rename("total") # totalize the frequencies for each suffix

    # Final step. Join the left and right tables on column "suffix".
    #
    #                         freq                                                    nouns
    # total   suffix gender
    # 7606786 ace    f       86.41 (place, surface, menace, face, race, trace, préface, ...
    #                m       13.59 (espace, face, place, menace, cyberespace, palace, ra...
    # 23832   ach    m      100.00 (krach, coach, almanach, midrach)
    # 14987   ack    m      100.00 (feedback, crack, pack, black, snack, biofeedback, back)

    result = pd.merge(
            left.reset_index(),  # transform the indexes into columns
            right.reset_index(), #   to make them possible to join
            on=["suffix"]
        ).set_index(["total", "suffix", "gender"]) # transform these columns back into indexes
    result["freq"] = 100 * result["freq"] / result.index.get_level_values("total") # frequencies as percentages

    return result

In [5]:
def collect_all_data(df):
    """Calculate a table with ALL suffixes between 1 and MAX_SUFFIX_LENGTH.

    Parameters:
        df: the DataFrame of nouns, passed unchanged to group_by_suffix_length.

    Returns:
        A DataFrame indexed by (suffix, gender) with the columns "freq" and "nouns",
        sorted by decreasing suffix total, then decreasing suffix, then decreasing freq.
    """

    # Call the previous function for all the desired suffix lengths
    #   and concatenate the results in a single table, in one go
    #   (DataFrame.append copies the whole table at each call, and no longer exists in pandas 2)
    tables = [
        group_by_suffix_length(df, length)
        for length in range(1, MAX_SUFFIX_LENGTH + 1)
    ]
    result = pd.concat(tables)

    # Upgrade temporarily two indexes to columns to make them possible to sort
    result = result.assign(
        total_from_index=result.index.get_level_values("total"),
        suffix_from_index=result.index.get_level_values("suffix"),
    )

    # Sort by total, suffix and gender frequency
    result = result.sort_values(by=["total_from_index", "suffix_from_index", "freq"], ascending=False)

    # Drop the temporary columns
    result = result.drop(["total_from_index", "suffix_from_index"], axis=1)

    # Drop the "total" index, which is useless since the table is now sorted by total first
    result = result.reset_index(level=0, drop=True)

    # The result is ordered by decreasing suffix frequencies, suffixes and gender frequencies:
    #
    #                 freq                                                           nouns
    # suffix gender
    # e      f       70.79  (vie, politique, guerre, ville, nature, forme, mise, recher...
    #        m       29.21  (monde, rôle, système, cadre, nombre, problème, texte, doma...
    # n      f       86.63  (fin, question, situation, population, production, région, ...
    #        m       13.37  (plan, terrain, moyen, lien, roman, an, chemin, besoin, len...
    # on     f       96.74  (question, situation, population, production, région, créat...
    #        m        3.26  (ton, don, garçon, bouton, million, son, patron, échantillo...
    # ion    f       99.65  (question, situation, population, production, région, créat...
    #        m        0.35  (million, avion, camion, lion, champion, bastion, pion, esp...
    # tion   f       99.97  (question, situation, population, production, création, for...
    #        m        0.03  (bastion, cation, fonction, question, situation, direction,...
    # ...

    return result

In [6]:
def best_suffixes_to_learn(df, how_many=None):
    """Eliminate the suffixes whose gender repartition is not "interesting" enough.

    Parameters:
        df: the DataFrame of nouns, passed unchanged to collect_all_data.
        how_many: maximal number of suffixes to keep (None or 0: no limit).

    Returns:
        The rows of collect_all_data(df) belonging to the kept suffixes, in the same
        order; an empty DataFrame when no suffix is kept.
    """

    candidates = collect_all_data(df)
    kept_groups = [] # one small table per kept suffix, concatenated once at the end
    seen = dict()    # kept suffix -> frequency of its most common gender

    # Iterate on the suffixes (sorted by decreasing frequency)
    for suffix in candidates.index.get_level_values(0).unique():

        # Create a small dataframe consisting only in the rows of the current suffix
        current_group = candidates.xs(suffix, drop_level=False)

        # The rows being sorted by decreasing frequency, the first one gives
        #   the frequency of the most common gender for this suffix
        freq = float(current_group["freq"].iloc[0])

        # Keep "interesting" suffixes: the frequency to beat is that of the suffix
        #   deprived of its first letter if it has already been kept, RATIO_THRESHOLD
        #   otherwise, plus a certain amount depending on the suffix length - 1
        baseline = seen.get(suffix[1:], RATIO_THRESHOLD)
        if freq > baseline + len(suffix) - 1:
            seen[suffix] = freq # keep it
            kept_groups.append(current_group)
            if how_many and len(seen) >= how_many:
                break # stop as soon as exactly how_many suffixes have been kept

    if not kept_groups:
        return pd.DataFrame() # pd.concat refuses an empty list

    return pd.concat(kept_groups)

In [7]:
# Select the most interesting suffixes, in decreasing order of total frequency,
#   using ROW_COUNT as the bound on the number of suffixes (not of rows) collected
best = best_suffixes_to_learn(df, ROW_COUNT)

# Displaying the results

In [8]:
# Render every float with a thousands separator and two decimals
pd.set_option("display.float_format", "{:,.2f}".format)

# Let each cell show up to 100 characters, so that longer lists of nouns fit in a row
pd.set_option("display.max_colwidth", 100)

In [9]:
def show(*suffixes):
    """A helper function displaying some or all rows of the resulting table.

    Parameters:
        suffixes: the suffixes whose rows must be displayed. Without any argument,
            the whole table is displayed.

    When a requested suffix raises a KeyError on lookup, a hint is printed instead.
    """
    try:
        if suffixes:
            # Label-based selection on the first index level (.ix no longer exists in pandas)
            display(best.loc[list(suffixes)])
        else:
            # Temporarily lift the row limit so that no row is elided
            with pd.option_context('display.max_rows', len(best)):
                display(best)
    except KeyError:
        print("Missing suffix! You may decrease RATIO_THRESHOLD and/or increase ROW_COUNT.")

## Displaying one given suffix

In [10]:
# Display the rows of the table for the single suffix "age" (a hint is printed if it is missing)
show("age")

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,nouns
suffix,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
age,m,90.18,"(passage, langage, personnage, mariage, village, visage, message, voyage, paysage, ouvrage, chôm..."
age,f,9.82,"(image, page, plage, rage, cage, sage, nage, partage, sauvage, langage, passage, mariage, dévisa..."


## Displaying several given suffixes

In [11]:
# Any number of suffixes can be requested in a single call
show("ment", "ure")

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,nouns
suffix,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
ment,m,99.93,"(développement, gouvernement, mouvement, moment, sentiment, traitement, changement, comportement..."
ment,f,0.07,"(jument, recherchedéveloppement, développement, gouvernement, ment, transforment, mouvement, lég..."
ure,f,99.58,"(nature, culture, mesure, structure, littérature, lecture, figure, procédure, peinture, rupture,..."
ure,m,0.42,"(murmure, mercure, chlorure, mesure, jure, rassure, centaure, demeure, parjure, sulfure, figure,..."


## Displaying all suffixes

In [12]:
# Without argument, the whole table is displayed
show()

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,nouns
suffix,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
n,f,86.63,"(fin, question, situation, population, production, région, création, formation, maison, notion, ..."
n,m,13.37,"(plan, terrain, moyen, lien, roman, an, chemin, besoin, lendemain, bien, soutien, maintien, méde..."
on,f,96.74,"(question, situation, population, production, région, création, formation, maison, notion, const..."
on,m,3.26,"(ton, don, garçon, bouton, million, son, patron, échantillon, salon, poisson, rayon, canton, bar..."
ion,f,99.65,"(question, situation, population, production, région, création, formation, notion, construction,..."
ion,m,0.35,"(million, avion, camion, lion, champion, bastion, pion, espion, ion, scorpion, centurion, gangli..."
t,m,88.2,"(droit, fait, développement, gouvernement, point, rapport, projet, mouvement, mot, moment, sujet..."
t,f,11.8,"(part, plupart, mort, nuit, forêt, dot, dent, fait, composent, est, enfant, jument, t, gent, pos..."
é,f,86.93,"(société, réalité, liberté, communauté, volonté, nécessité, qualité, possibilité, vérité, majori..."
é,m,13.07,"(marché, passé, côté, traité, degré, comité, procédé, député, café, curé, clergé, salarié, péché..."
