<a href="https://colab.research.google.com/github/lauramanor/snippits/blob/master/simple_environment_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This is a simple parser for finding minimal pairs and environments

You will need to upload your file to your Google Drive.

When prompted, click on the link to get authentication to allow Google to access your Drive. You should see a screen with “Google Cloud SDK wants to access your Google Account” at the top. After you allow permission, copy the given verification code and paste it in the box in Colab.

Once you have completed verification, go to the CSV file in Google Drive, right-click on it and select “Get shareable link”. The link will be copied to your clipboard. Paste this link into the `link` field in the form below.


In [0]:
#@title Imports

import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from collections import defaultdict

import difflib

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The Colab user is authenticated first; the application-default credentials
# are then attached to PyDrive's GoogleAuth object so that `drive` can be used
# by later cells to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



## Enter the necessary information here:


In [0]:


#@markdown  Enter the shareable link for your spreadsheet (xlsx or csv) here:
link = "https://drive.google.com/open?id=1X-owhxYxF6AhQra8IRAtaqNTVN5u2Lav" #@param {type:"string"}




#@markdown  Enter the name of your file here; it should include any extension
filename = 'ANG-201907-Elicitation Master.csv' #@param {type:"string"}
#@markdown  Enter the sheet name here (for xlsx)
sheetname = '' #@param {type:"string"}
#@markdown  Enter the row that contains the heading info
heading_row = 7 #@param {type:"integer"}
#@markdown  Enter the data column. MUST BE EXACT!!!  (currently supports only one column)
data_column = "Angaite 9/2019" #@param {type:"string"}
#@markdown  Enter the id column(s). MUST BE EXACT!!!  (separate columns with tabs)
header_columns = "Date\tSession\tItem#" #@param {type:"string"}

# Turn the tab-separated form value into a list of column names.
header_columns = header_columns.split("\t")


![](https://drive.google.com/uc?export=view&id=1Mylvy4Yyp0ZFM8vr_SI2H2hlc5dN3IgU)

In [3]:
#@title Upload File

# Extract the Drive file id from the shareable link.
#   .../open?id=<ID>[&...]      -> text after "id=" up to the next "&"
#   .../file/d/<ID>/view?...    -> text after "/d/" up to the next "/" or "?"
# Testing for "id=" (not just "id") keeps "/d/" links whose id happens to
# contain the letters "id" from being routed to the wrong branch.
if "id=" in link:
  file_id = link.split("id=", 1)[1].split("&", 1)[0]
elif "/d/" in link:
  file_id = link.split("/d/", 1)[1].split("/", 1)[0].split("?", 1)[0]
else:
  print("Link not supported")
  raise SystemExit

# Download the Drive file into the local runtime under `filename`.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(filename)

# `heading_row` is 1-based in the form; pandas expects a 0-based header index.
if filename.endswith(".xlsx"):
  # An empty sheet name falls back to the first sheet of the workbook.
  df = pd.read_excel(filename, sheet_name=sheetname or 0, header=heading_row-1)
elif filename.endswith(".csv"):
  df = pd.read_csv(filename, header=heading_row-1)
else:
  # Fail here rather than with a NameError on `df` further down.
  raise ValueError(f"Unsupported file type: {filename!r} (expected .xlsx or .csv)")

print(header_columns)

# Build a unique id per row by joining the id columns with "_".
# Missing values become 0 and every value is rendered as an integer string.
id_parts = [df[col].fillna(0).astype(int).astype(str) for col in header_columns]
uid = pd.concat(id_parts, axis=1).apply("_".join, axis=1)
df["uid"] = uid

['Date', 'Session', 'Item#']



## Check to make sure this looks like the data you want to parse:


In [4]:
# Preview the first 10 rows of the data column next to the generated uid.
df[[data_column, "uid"]].head(10)

Unnamed: 0,Angaite 9/2019,uid
0,(kilkaske?) nepkeˈsek,20190716_1_23
1,(Lea) aklooˈma,20190708_1_80
2,aaˈLa,20190717_2_39
3,aaˈLa aLwasekˈsek,20190710_2_63
4,aaˈLa aLwasekˈsek nentoˈma,20190723_1_43
5,aaLajkoˈ'ok,20190729_0_12
6,aaˈmaj (aˈmaj?),20190708_2_34
7,aaˈnik,20190720_2_42
8,aaˈwa (awa?),20190723_1_26
9,ahanˈkok,20190723_1_54


## Are there any multi-glyph phones we should know about?

For example, if you want "sh" in "ship" to count as a single phone, note that below. Separate each group with a space.


In [0]:
#@title multi-glyph

multi_glyphs = "" #@param {type:"string"}

# Split on runs of whitespace. Unlike split(" "), an empty form field yields
# an empty list (not [""]) and repeated spaces do not create empty entries.
multi_glyphs = multi_glyphs.split()

## Basic Parse

In [0]:
#@title create special dictionary


class ThingDict(object):
  """
    Insertion-ordered index from a word to the rows it was seen in.

    The data lives in three parallel lists that callers may read directly:
    ``keys[i]`` is a word, ``data_idxs[i]`` the adjusted row numbers recorded
    for it and ``uids[i]`` the uids recorded for it.

    ``keys`` is a plain list attribute, not a method. An earlier ``keys()``
    method was shadowed by this attribute on every instance and has been
    removed; read ``thing.keys`` instead.

    Keys must be hashable (they are words in this notebook).
  """

  def __init__(self, idx_adjust, ):
    """Create an empty index; stored row numbers are shifted by idx_adjust + 1."""
    self.keys = []
    self.data_idxs = []
    self.idx_adjust = idx_adjust + 1
    self.uids = []
    # key -> position in the parallel lists, so lookups do not scan `keys`.
    self._positions = {}

  def _position(self, key):
    """Return the list position of `key`; raise ValueError when it is unknown."""
    try:
      return self._positions[key]
    except KeyError:
      # Same exception type that list.index() raised in the list-scan version.
      raise ValueError(f"{key!r} is not in list") from None

  def add(self, key, new_value, uid):
    """Record one occurrence of `key` at row `new_value` with identifier `uid`."""
    adjusted_value = new_value + self.idx_adjust
    idx = self._positions.get(key)
    if idx is None:
      # First sighting: open a new slot at the end of every parallel list.
      self._positions[key] = len(self.keys)
      self.keys.append(key)
      self.data_idxs.append([adjusted_value])
      self.uids.append([uid])
    else:
      self.data_idxs[idx].append(adjusted_value)
      self.uids[idx].append(uid)

  def get_idxs(self, key):
    """Return the list of adjusted row numbers recorded for `key`."""
    return self.data_idxs[self._position(key)]

  def get_uids(self, key):
    """Return the list of uids recorded for `key`."""
    return self.uids[self._position(key)]


In [0]:
#@title do a simple parse for word boundaries, add words to dict, and remove anything in parens  

def split_word_boundary(data_str):
  """Split a transcription cell into words, dropping parenthesised material.

  The cell is split on single spaces. Any token that contains "(" or ")" is
  dropped, and so is every token that sits between an opening and a closing
  parenthesis, so "(a b c) d" yields ["d"]. If a parenthesis is opened but
  never closed, only the tokens that themselves contain a parenthesis are
  dropped, so one stray "(" cannot swallow the rest of the cell.

  Missing cells (None or a float NaN) yield an empty list instead of the
  literal words "None" / "nan".

  Args:
    data_str: the cell value; anything other than a missing value is passed
      through str() first.

  Returns:
    A list of word strings (possibly empty).
  """
  # NaN is the only float that is not equal to itself.
  if data_str is None or (isinstance(data_str, float) and data_str != data_str):
    return []

  tokens = str(data_str).split(" ")

  kept = []
  depth = 0  # number of currently open parentheses
  for token in tokens:
    opens = token.count("(")
    closes = token.count(")")
    if depth == 0 and opens == 0 and closes == 0:
      kept.append(token)
    # A stray ")" must not push the depth below zero.
    depth = max(depth + opens - closes, 0)

  if depth > 0:
    # Unbalanced "(": fall back to per-token filtering only.
    return [token for token in tokens if "(" not in token and ")" not in token]
  return kept

# Tokenise every cell of the data column into its own list of words.
df["split"] = df[data_column].apply(split_word_boundary)

# Index each word by the row it came from and by that row's uid.
words_dict = ThingDict(heading_row)

for row_label, tokens, row_uid in zip(df.index, df["split"], df["uid"]):
    for token in tokens:
        words_dict.add(token, row_label, row_uid)
        # print(words_dict.get_uids(r))

  

## Get close matches

The similarity score is the minimum similarity two words must have to count as a match. It must lie in the interval (0, 1]; higher values require closer matches.

In [0]:
#@title close matches

import pprint
import json
import csv
import io
pp = pprint.PrettyPrinter(indent=4)

def pretty_print_matches(matches):
    """Print each word with its uids, followed by its close matches.

    `matches` maps a word to a dict whose "idxs" entry holds the word's own
    uids and whose other entries map a matching word to that word's uids.
    Matches are listed from most to fewest uids.
    """
    for word, entry in matches.items():
        print(f"{word}: {entry['idxs']}")
        by_size = sorted(entry.items(), key=lambda pair: len(pair[1]), reverse=True)
        for other, other_uids in by_size:
            if other == "idxs":
                continue
            print(f"\t{other}: {other_uids}")

def save_as_csv(matches):
    """Print the matches as CSV text for copy-pasting (no file is written).

    One row is emitted per (word, close match) pair with the columns
    word1, indexes1, word2, indexes2. The "idxs" entry of each word supplies
    indexes1 and is not itself treated as a match.
    """
    buffer = io.StringIO()
    csv_out = csv.writer(buffer, delimiter=",", quotechar='"')

    rows = [["word1", "indexes1", "word2", "indexes2"]]
    for word, entry in matches.items():
        rows.extend(
            [word, entry['idxs'], other, other_uids]
            for other, other_uids in entry.items()
            if other != "idxs"
        )
    csv_out.writerows(rows)

    print(f"\nCOPY AFTER THIS: \n{buffer.getvalue()}\n")

def close_matches(thing_dict, one_word=None, print_me=True, print_csv=False, similarity_score=.8):
    """Find words in `thing_dict` that are similar to each other.

    Args:
        thing_dict: object exposing a `keys` list of words and a
            `get_uids(word)` method.
        one_word: when given (and non-empty), only this word is compared
            against the dictionary; otherwise every word is. The word does
            not have to occur in the dictionary.
        print_me: print a human-readable listing of the result.
        print_csv: print the result as CSV text.
        similarity_score: cutoff passed to difflib.get_close_matches.

    Returns:
        dict mapping each tested word to a dict of
        {matching word: uids of that word}, plus an "idxs" entry holding the
        tested word's own uids (an empty list if the word is not in the
        dictionary). At most 10 matches are collected per word.
    """
    all_words = thing_dict.keys
    test_words = [one_word] if one_word else all_words

    to_return = dict()
    for test in test_words:
        if len(test) == 0:
            # Empty strings carry no information to match on.
            continue

        matches = difflib.get_close_matches(test, all_words, n=10, cutoff=similarity_score)
        match_dict = {match: thing_dict.get_uids(match) for match in matches if match != test}

        # A queried word may be absent from the data; report it with no uids
        # instead of failing after the matches have already been computed.
        try:
            match_dict["idxs"] = thing_dict.get_uids(test)
        except (ValueError, KeyError):
            match_dict["idxs"] = []
        to_return[test] = match_dict

    if print_me:
        pretty_print_matches(to_return)

    if print_csv:
        save_as_csv(to_return)

    return to_return


In [9]:
# Look up close matches for a single word; the listing is printed (print_me
# defaults to True) and the same result is also printed as CSV text.
one_close_match = close_matches(words_dict, one_word="taaˈta", print_csv=True)


taaˈta: ['20190710_2_17']
	taaˈLa: ['20190720_1_1', '20190720_1_2', '20190708_1_1']
	naaˈta: ['20170722_1_11', '20190715_1_20', '20190716_1_14']
	jatanaˈta: ['20190715_2_43', '20190715_2_44', '20190715_2_45']
	taaˈma: ['20190722_2_70', '20190710_2_54']
	jetanaˈta: ['20190722_1_12']

COPY AFTER THIS: 
word1,indexes1,word2,indexes2
taaˈta,['20190710_2_17'],taaˈma,"['20190722_2_70', '20190710_2_54']"
taaˈta,['20190710_2_17'],taaˈLa,"['20190720_1_1', '20190720_1_2', '20190708_1_1']"
taaˈta,['20190710_2_17'],naaˈta,"['20170722_1_11', '20190715_1_20', '20190716_1_14']"
taaˈta,['20190710_2_17'],jetanaˈta,['20190722_1_12']
taaˈta,['20190710_2_17'],jatanaˈta,"['20190715_2_43', '20190715_2_44', '20190715_2_45']"




In [0]:

# Compute close matches for every word in the dictionary without printing anything.
all_close_matches = close_matches(words_dict, print_me=False, print_csv=False)

## Now, let's find some environments


In [0]:
#@title some stuff



def pretty_print_envs(envs, max_examples=100):
    """Print the environments collected for each pattern.

    Args:
        envs: dict of pattern -> {position name -> {environment -> examples}},
            where each example is a (word, uids) tuple.
        max_examples: maximum number of examples printed per environment.

    Environments are listed from most to fewest examples, and examples from
    most to fewest uids. The input is only read: examples are ordered on a
    sorted copy, so printing never reorders the caller's lists.
    """
    for pattern, positions in envs.items():
        for position, contexts in positions.items():
            print(f"{position.upper()} environments for {pattern} (envs={len(contexts)})")
            for env in sorted(contexts, key=lambda k: len(contexts[k]), reverse=True):
                examples = contexts[env]
                print(f"\t {env} (examples={len(examples)})")
                ranked = sorted(examples, key=lambda t: len(t[1]), reverse=True)
                for example in ranked[:max_examples]:
                    print(f"\t\t{example}")

def prep(ngram, pattern):
    """Return `ngram` with every occurrence of `pattern` replaced by "_"."""
    placeholder = "_"
    masked = ngram.replace(pattern, placeholder)
    return masked

def find_environments(words_dict,patterns,left=True, right=True, n=2,print_me=False):
    """Collect the character environments in which each pattern occurs.

    Every word containing a pattern is cut into overlapping windows of
    len(pattern) + n characters (a word no longer than that is used whole).
    In each window the pattern is replaced by "_" to form the environment.

    Args:
        words_dict: object exposing a `keys` list of words and a
            `get_uids(word)` method.
        patterns: iterable of pattern strings to look for.
        left, right: accepted for interface compatibility; not consulted by
            the current implementation.
        n: number of context characters added to the pattern length to get
            the window size.
        print_me: print the result with at most 4 examples per environment.

    Returns:
        dict of pattern -> dict with the keys "all", "start", "end", "right"
        and "left", each mapping an environment string to a list of
        (word, uids) tuples:
          all   - every window that contains the pattern
          start - first window of words that start with the pattern
          end   - last window of words that end with the pattern
          right - windows that start with the pattern
          left  - windows that end with the pattern
    """
    patterns_dict = dict((p, {}) for p in patterns)

    for pattern in patterns:
        starts = defaultdict(list)
        ends = defaultdict(list)
        lefts = defaultdict(list)
        rights = defaultdict(list)
        alls = defaultdict(list)

        window = len(pattern) + n
        for word in words_dict.keys:
            if pattern not in word:
                continue

            if window < len(word):
                ngrams = [word[i:i + window] for i in range(len(word) - window + 1)]
            else:
                ngrams = [word]

            example = (word, words_dict.get_uids(word))

            if word.startswith(pattern):
                starts[prep(ngrams[0], pattern)].append(example)

            if word.endswith(pattern):
                ends[prep(ngrams[-1], pattern)].append(example)

            for ngram in ngrams:
                if pattern not in ngram:
                    continue
                env = prep(ngram, pattern)
                alls[env].append(example)
                if ngram.endswith(pattern):
                    lefts[env].append(example)
                if ngram.startswith(pattern):
                    rights[env].append(example)

        patterns_dict[pattern]["all"] = dict(alls)
        patterns_dict[pattern]["start"] = dict(starts)
        patterns_dict[pattern]["end"] = dict(ends)
        patterns_dict[pattern]["right"] = dict(rights)
        patterns_dict[pattern]["left"] = dict(lefts)

    if print_me:
        # Print the collected result, not the bare list of pattern strings.
        pretty_print_envs(patterns_dict, max_examples=4)

    return(patterns_dict)




In [12]:

# Collect the environments around "nepk" silently, then print them with the
# default cap on examples per environment.
envs = find_environments(words_dict, patterns=["nepk"], print_me=False )
pretty_print_envs(envs)

ALL environments for nepk (envs=2)
	 _eˈ (examples=1)
		('nepkeˈsek', ['20190716_1_23', '20190716_1_27', '20190716_1_57', '20190716_1_58', '20190708_3_45', '20190722_2_35', '20190722_2_37', '20190708_3_44', '20190716_1_30'])
	 _ee (examples=1)
		('nepkeeˈsek', ['20190719_1_35', '20190708_3_43', '20190708_3_43'])
START environments for nepk (envs=2)
	 _eˈ (examples=1)
		('nepkeˈsek', ['20190716_1_23', '20190716_1_27', '20190716_1_57', '20190716_1_58', '20190708_3_45', '20190722_2_35', '20190722_2_37', '20190708_3_44', '20190716_1_30'])
	 _ee (examples=1)
		('nepkeeˈsek', ['20190719_1_35', '20190708_3_43', '20190708_3_43'])
END environments for nepk (envs=0)
RIGHT environments for nepk (envs=2)
	 _eˈ (examples=1)
		('nepkeˈsek', ['20190716_1_23', '20190716_1_27', '20190716_1_57', '20190716_1_58', '20190708_3_45', '20190722_2_35', '20190722_2_37', '20190708_3_44', '20190716_1_30'])
	 _ee (examples=1)
		('nepkeeˈsek', ['20190719_1_35', '20190708_3_43', '20190708_3_43'])
LEFT environments f