# Getting the Git

In [1]:
import os
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib as mpl
os.environ['PATH'] += ";C\Program Files\Git\\bin"

In [2]:
%%bash
rm -rf fec-data
git clone git@github.com:jbisbee1/fec-data.git

Cloning into 'fec-data'...


In [4]:
%%bash
cd ./fec-data/; git log

commit 4cd451b67957990f506074a7f38e200ceb053862
Author: jbisbee1 <jimbisbee@yahoo.com>
Date:   Thu Jul 21 16:37:22 2016 -0400

    Updated the readme.md file to remind myself of fuzzyfuzzy.

commit 3f7beb7944abf2f86654fc6b9c15fcfdb61f4091
Author: jbisbee1 <jimbisbee@yahoo.com>
Date:   Thu Jul 21 12:15:12 2016 -0400

    Updated the readme.md file with a brief description of the purpose for this repository.


In [3]:
%%bash
cd ./fec-data
git add fec-string-prep.ipynb
git commit -m "Implementing fuzzy wuzzy and using it to create Qualtrics surveys for manual checking."
git push
git status

On branch master
Your branch is up-to-date with 'origin/master'.
nothing to commit, working tree clean
On branch master
Your branch is up-to-date with 'origin/master'.
nothing to commit, working tree clean


fatal: pathspec 'fec-string-prep.ipynb' did not match any files
Everything up-to-date


# Opening some text data

In [13]:
recipients = pd.read_csv("../earmarks/recipient.csv")
recipients.head()

Unnamed: 0,id,earmark_id,raw_recipient,standardized_recipient
0,1,6,Ocanit,
1,3,12,Rescue Technolgies Corporation,
2,4,18,APHIS Hawaii,
3,5,25,University of Hawaii - University Affliated Re...,
4,6,32,Alaka'i Consulting and Engineering,


# Cleaning the data

In [14]:
recraw = recipients['raw_recipient']
rectokens=recraw.str.split()
totaltokens = len(rectokens)

In [15]:
recclean = recraw.str.lower()
recclean = recclean.str.replace(',',' ',case=False)
recclean = recclean.str.replace('\&',' and ',case=False)
recclean = recclean.str.replace('-',' ',case=False)
recclean = recclean.str.replace('\'','',case=False)
recclean = recclean.str.replace('.','')
recclean = recclean.str.replace('[ ]{2,}',' ',case=False)
recclean = recclean.str.normalize('NFC')

# Trying to Estimate Text Similarities

In [16]:
test = recclean.str.split(expand = True)
type(test[0])
#test['universities'] = test[0].apply(lambda x: x.count('university'))
#test.head()

pandas.core.series.Series

In [17]:
recclean.str.extractall('(university)|(inc)')[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
3,0,university,
3,1,university,
5,0,,inc
6,0,university,
7,0,,inc


In [18]:
recclean.str.count(" university").sum() #This is strange...two similar methods yield slightly different counts

725.0

In [19]:
recclean.str.contains(' university').sum()

724

In [20]:
rectokens = recclean.str.split()

# It seems there are packages already developed

In [21]:
%%bash
pip install fuzzywuzzy



In [23]:
%%bash
easy_install python-Levenshtein

Searching for python-Levenshtein
Reading https://pypi.python.org/simple/python-Levenshtein/
Best match: python-Levenshtein 0.12.0
Downloading https://pypi.python.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz#md5=e8cde197d6d304bbdc3adae66fec99fb
Processing python-Levenshtein-0.12.0.tar.gz
Writing C:\Users\jhb362\AppData\Local\Temp\easy_install-nu_74o3v\python-Levenshtein-0.12.0\setup.cfg
Running python-Levenshtein-0.12.0\setup.py -q bdist_egg --dist-dir C:\Users\jhb362\AppData\Local\Temp\easy_install-nu_74o3v\python-Levenshtein-0.12.0\egg-dist-tmp-6etqmn8w


error: Setup script exited with error: Unable to find vcvarsall.bat


In [35]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [24]:
fuzz.ratio("this is a test","this is also a test")

85

In [25]:
fuzz.token_sort_ratio("test this is one",'this is one test')

100

In [26]:
fuzz.token_set_ratio("this is a test","this is also a test")

100

In [27]:
recclean.head()

0                                               ocanit
1                       rescue technolgies corporation
2                                         aphis hawaii
3    university of hawaii university affliated rese...
4                    alakai consulting and engineering
Name: raw_recipient, dtype: object

In [36]:
choices = ["atlanta falcons","new york jets","new york giants","dallas cowboys"]
process.extract("new york jets",choices,limit=2)

[('new york jets', 100), ('new york giants', 79)]

In [183]:
uniques = recclean.drop_duplicates()
select = [isinstance(e,str) for e in uniques]
uniques2 = uniques[select]

select = [isinstance(e,str) for e in recclean]
recclean2 = recclean[select]
select[:3]

[True, True, True]

In [217]:
test = process.extractBests(recclean2[0],uniques2,scorer = fuzz.token_set_ratio,score_cutoff = 80)
select = [test[i][1] != 100 for i in range(len(test))]
test2 = [i for i in test if i[1] != 100]
print(test,test2)

[('ocanit', 100, 0), ('oceanit', 92, 4779)] [('oceanit', 92, 4779)]


In [150]:
pairings={}
for rec in recclean2[:20]:
    pairings[rec]=process.extract(rec,uniques2,scorer=fuzz.token_set_ratio,limit=2)

In [151]:
dict(list(pairings.items())[:3])

{'rescue technolgies corporation': [('rescue technolgies corporation', 100, 1),
  ('c9 corporation', 88, 598)],
 'summa technology inc': [('summa technology inc', 100, 5),
  ('arc technology', 83, 1270)],
 'university of hawaii university affliated research center': [('university of hawaii university affliated research center',
   100,
   3),
  ('university of hawaii', 100, 4581)]}

# This is getting somewhere now. Here I explore how to write a .txt file in the format for importing to Qualtrics

In [None]:
survey_file = open("survey-test.txt","w")
survey_file.write("Testing: %s" % test[1][0])
survey_file.close()