## Score the Organizations Using the Name, Alternative Name, and Previous Name Fields

Scores the matches between the PatentsView and OpenCorporates data sets using the file generated by the Prepare OpenCorporates Data with No States script. The resulting total scores are rescaled to confidence levels bounded between 1 and 10. If duplicate records exist for an assignee/organization pair, they are dropped and the highest-scoring record is kept as the representative record.

In [1]:
### import the libraries used to process the PatentsView and OC data.
import pandas as pd
import numpy as np
import time
import os
import re
import string
import warnings
warnings.filterwarnings('ignore')

### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
input_file = "OcResults1000PreparedForScoring.csv"
a_full=os.path.join(res_folder,input_file)

### read the prepared scoring input into a dataframe
OCScoringDF=pd.read_csv(a_full)

### end timer and print total time (in minutes)
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset.
### info() prints its report and returns None, so it is called on its own
### instead of being handed to display(), which would render a stray "None".
### show_counts is the replacement for the null_counts keyword, which newer
### pandas releases no longer accept.
OCScoringDF.info(show_counts=True)
display(OCScoringDF.head())

Total time is 0.000166 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1516 entries, 0 to 1515
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  1516 non-null   int64  
 1   assignee_id         1516 non-null   object 
 2   location_id         1516 non-null   object 
 3   organization        1516 non-null   object 
 4   city                1516 non-null   object 
 5   state               1516 non-null   object 
 6   city_latitude       1472 non-null   float64
 7   city_longitude      1472 non-null   float64
 8   dateOfFirstPat      1516 non-null   object 
 9   nameScores          1516 non-null   float64
 10  matchNames          1516 non-null   object 
 11  subJurisCode        1516 non-null   object 
 12  subCntlEntity       848 non-null    object 
 13  stateMatch          1516 non-null   float64
 14  bchStatus           1516 non-null   float64
 15  minIncDate          1516 no

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,-79.9602,,,,,Va,0.0,0.0,,3.11
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,-71.4282,,,,,"Ri,De",0.0,49.5,,1.78
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,,"Ri,De",,,,1.78
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,,De,,,,4.41
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89


## Scoring Algorithm

In [2]:
def _name_points(name_score, org_name):
    """Return the points earned by the fuzzy name match of one record.

    The fuzzy-match percentage selects a band (exactly 100, 95 up to 100,
    90 up to 95, below 90) and the length of the organization name selects
    the value inside the band: shorter names are scored lower than longer
    names, and the shortest names are penalized more as the match
    percentage drops.

    Returns 0 when ``name_score`` falls in no band (missing or above 100),
    so a record never inherits the name points of the previous record.
    """
    if name_score == 100:
        points_by_length = (0, 2, 4, 5)

    ### visual inspection of the data indicates a discrete group between 95%
    ### and 100%, resulting in the next band
    elif 95 <= name_score < 100:
        points_by_length = (-1, 1, 2, 3)

    ### many of the fuzzy matches in this range are wrong, but there are a
    ### couple correct that should not be discounted
    elif 90 <= name_score < 95:
        points_by_length = (-2, 1, 2, 3)

    elif name_score < 90:
        points_by_length = (-3, 0, 1, 2)

    else:
        return 0

    ### name-length buckets: <5, 5-9, 10-14, 15 or more characters
    name_length = len(org_name)
    if name_length < 5:
        return points_by_length[0]
    if name_length < 10:
        return points_by_length[1]
    if name_length < 15:
        return points_by_length[2]
    return points_by_length[3]


def _distance_points(distances):
    """Return the points for the city-to-location distances of one record.

    ``distances`` holds the record's cityToAddrDistance, cityToAgtDistance
    and cityToDataDistance values. The tiers are tested from nearest to
    farthest and the first tier that contains any of the distances decides
    the points. Missing (NaN) distances match no tier; when no distance
    matches any tier the result is 0.
    """
    if any(dist == 0 for dist in distances):
        return 5
    if any(0 < dist < 10 for dist in distances):
        return 4
    if any(10 <= dist < 50 for dist in distances):
        return 3
    if any(50 <= dist < 100 for dist in distances):
        return 2
    if any(100 <= dist < 200 for dist in distances):
        return 1
    if any(dist >= 200 for dist in distances):
        return -2
    return 0


def _date_points(date_diff):
    """Return the points for the record's dateDiff value.

    Smaller values earn more points: up to 5 -> 5, up to 10 -> 4,
    up to 15 -> 3, up to 20 -> 2, above 20 -> 1. A missing (NaN) value
    earns 0.
    """
    if date_diff <= 5:
        return 5
    if date_diff <= 10:
        return 4
    if date_diff <= 15:
        return 3
    if date_diff <= 20:
        return 2
    if date_diff > 20:
        return 1
    return 0


### one total per record, in the row order of OCScoringDF
totalScore=[]

### columns are addressed by name rather than by position so that a reordered
### input file cannot silently score the wrong fields
for row in OCScoringDF.itertuples(index=False):

    ### score the fuzzy match percentage, weighted by organization name length
    points = _name_points(row.nameScores, row.organization)

    ### stateMatch category: 0 earns the most, then 1, then 2; any other value
    ### earns nothing.
    ### NOTE(review): the summary printout later in this notebook labels these
    ### categories state-to-state, state-to-non state and state-DE -- confirm
    ### against the preparation script
    if row.stateMatch == 0:
        points += 5
    elif row.stateMatch == 1:
        points += 3
    elif row.stateMatch == 2:
        points += 1

    ### bchStatus of 1 or 2 earns a bonus
    ### NOTE(review): the meaning of the bchStatus codes is not visible here
    if row.bchStatus == 1 or row.bchStatus == 2:
        points += 5

    ### both the PatentsView state and the jurisdiction state are 'De'
    if row.state == 'De' and row.subJurisCode == 'De':
        points += 5

    ### PatentsView state equals the address, agent or data state
    if any(row.state == other for other in
           (row.address_state, row.agent_state, row.data_state)):
        points += 5

    ### PatentsView city equals the address, agent or data city
    if any(row.city == other for other in
           (row.address_city, row.agent_city, row.data_city)):
        points += 5

    ### distance between the PatentsView city and the address/agent/data location
    points += _distance_points((row.cityToAddrDistance,
                                row.cityToAgtDistance,
                                row.cityToDataDistance))

    ### dateDiff: smaller differences earn more points
    points += _date_points(row.dateDiff)

    totalScore.append(points)

In [3]:
### attach the totals from the scoring loop as a new column (one value per
### record, in row order)
OCScoringDF['totalScore'] = totalScore

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
OCScoringDF.info()
display(OCScoringDF.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1516 entries, 0 to 1515
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  1516 non-null   int64  
 1   assignee_id         1516 non-null   object 
 2   location_id         1516 non-null   object 
 3   organization        1516 non-null   object 
 4   city                1516 non-null   object 
 5   state               1516 non-null   object 
 6   city_latitude       1472 non-null   float64
 7   city_longitude      1472 non-null   float64
 8   dateOfFirstPat      1516 non-null   object 
 9   nameScores          1516 non-null   float64
 10  matchNames          1516 non-null   object 
 11  subJurisCode        1516 non-null   object 
 12  subCntlEntity       848 non-null    object 
 13  stateMatch          1516 non-null   float64
 14  bchStatus           1516 non-null   float64
 15  minIncDate          1516 non-null   object 
 16  addres

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,,,,,Va,0.0,0.0,,3.11,30
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,"Ri,De",0.0,49.5,,1.78,32
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,"Ri,De",,,,1.78,10
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,De,,,,4.41,8
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89,24


In [4]:
### order the records alphabetically by organization and, within each
### organization, from the highest totalScore to the lowest; then renumber
### the rows
OCScoringDF1 = (
    OCScoringDF
    .sort_values(by=['organization', 'totalScore'], ascending=[True, False])
    .reset_index(drop=True)
)

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
OCScoringDF1.info()
display(OCScoringDF1.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1516 entries, 0 to 1515
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  1516 non-null   int64  
 1   assignee_id         1516 non-null   object 
 2   location_id         1516 non-null   object 
 3   organization        1516 non-null   object 
 4   city                1516 non-null   object 
 5   state               1516 non-null   object 
 6   city_latitude       1472 non-null   float64
 7   city_longitude      1472 non-null   float64
 8   dateOfFirstPat      1516 non-null   object 
 9   nameScores          1516 non-null   float64
 10  matchNames          1516 non-null   object 
 11  subJurisCode        1516 non-null   object 
 12  subCntlEntity       848 non-null    object 
 13  stateMatch          1516 non-null   float64
 14  bchStatus           1516 non-null   float64
 15  minIncDate          1516 non-null   object 
 16  addres

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore
0,167963,7cf429dc-109b-429b-9742-7c9b6325a68a,f204a38d-cb90-11eb-9615-121df0c29c1e,5381 Partners,Fairfield,Ct,41.1412,-73.2637,2010-09-28,100.0,...,,,,,De,,,,10.62,8
1,38944,158f69f1-ed59-45d2-a9f8-90cf0ed28d20,ecedeef2-cb8f-11eb-9615-121df0c29c1e,700 Solar Club,Perrysburg,Oh,41.5356,-83.646,1992-06-30,100.0,...,,,,,"Oh,De",,,,1.29,19
2,38944,158f69f1-ed59-45d2-a9f8-90cf0ed28d20,ecedeef2-cb8f-11eb-9615-121df0c29c1e,700 Solar Club,Perrysburg,Oh,41.5356,-83.646,1992-06-30,100.0,...,,,,,"Oh,De",,,,1.29,10
3,222842,3fda41e2-c3e8-463a-af82-71dfe215a574,f93ac3c6-cb8e-11eb-9615-121df0c29c1e,APL,South Bend,In,41.6779,-86.2594,1999-09-07,100.0,...,,,,,Ny,614.8,,,0.24,6
4,139335,9ce56c89-7d33-4eb1-89a5-243a2ed15578,95daa592-cb8f-11eb-9615-121df0c29c1e,AY Mcdonald Manufacturing Company,Dubuque,Ia,42.5006,-90.6648,1976-12-14,100.0,...,,,,,Ok,,,,50.21,14


In [5]:
### keep one record per (assignee_id, organization) pair. keep='first' retains
### the first row of each pair in OCScoringDF1's row order; the survivors are
### then re-sorted by ID and renumbered.
OCScoringDF2 = (
    OCScoringDF1
    .drop_duplicates(subset=['assignee_id', 'organization'], keep='first')
    .sort_values(by=['ID'])
    .reset_index(drop=True)
)

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
OCScoringDF2.info()
display(OCScoringDF2.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  384 non-null    int64  
 1   assignee_id         384 non-null    object 
 2   location_id         384 non-null    object 
 3   organization        384 non-null    object 
 4   city                384 non-null    object 
 5   state               384 non-null    object 
 6   city_latitude       379 non-null    float64
 7   city_longitude      379 non-null    float64
 8   dateOfFirstPat      384 non-null    object 
 9   nameScores          384 non-null    float64
 10  matchNames          384 non-null    object 
 11  subJurisCode        384 non-null    object 
 12  subCntlEntity       137 non-null    object 
 13  stateMatch          384 non-null    float64
 14  bchStatus           384 non-null    float64
 15  minIncDate          384 non-null    object 
 16  address_

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,,,,,Va,0.0,0.0,,3.11,30
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,"Ri,De",0.0,49.5,,1.78,32
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,De,,,,4.41,8
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89,24
4,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,2016-07-26,100.0,...,,Dc,,,"De,Tn",,,,3.56,20


In [6]:
### number of distinct (non-missing) assignee ids in the de-duplicated frame
OCScoringDF2['assignee_id'].nunique(dropna=True)

384

In [7]:
### tally the records in each stateMatch category. Counting with a boolean
### mask yields 0 for a category that is absent, whereas indexing
### value_counts() by label raises a KeyError in that case; it also avoids
### recomputing the counts once per line.
state_match = OCScoringDF2['stateMatch']

print("The number of state-to-state matches is:",(state_match == 0).sum())
print("The number of state-to-non state matches is:",(state_match == 1).sum())
print("The number of state-DE matches is:",(state_match == 2).sum())

The number of state-to-state matches is: 103
The number of state-to-non state matches is: 212
The number of state-DE matches is: 69


In [8]:
### rescale totalScore linearly onto the interval [1, 10] (min-max scaling).
### Series.min()/max() are used instead of the builtins so the bounds are
### computed once and an empty frame does not raise.
total_scores = OCScoringDF2['totalScore']
lowest_total = total_scores.min()
score_range = total_scores.max() - lowest_total

### when every record has the same totalScore the range is 0 and the division
### below would turn every confidence score into NaN; dividing by 1 instead
### places all records at the lower bound (1)
if score_range == 0:
    score_range = 1

scaled_scores = ((10-1)*((total_scores-lowest_total)/score_range))+1

### round to two decimals, value by value, with the built-in round()
OCScoringDF2['confidenceScore']=[round(num1, 2) for num1 in scaled_scores]

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
OCScoringDF2.info()
display(OCScoringDF2.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  384 non-null    int64  
 1   assignee_id         384 non-null    object 
 2   location_id         384 non-null    object 
 3   organization        384 non-null    object 
 4   city                384 non-null    object 
 5   state               384 non-null    object 
 6   city_latitude       379 non-null    float64
 7   city_longitude      379 non-null    float64
 8   dateOfFirstPat      384 non-null    object 
 9   nameScores          384 non-null    float64
 10  matchNames          384 non-null    object 
 11  subJurisCode        384 non-null    object 
 12  subCntlEntity       137 non-null    object 
 13  stateMatch          384 non-null    float64
 14  bchStatus           384 non-null    float64
 15  minIncDate          384 non-null    object 
 16  address_

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore,confidenceScore
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,,,,Va,0.0,0.0,,3.11,30,8.59
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,"Ri,De",0.0,49.5,,1.78,32,9.16
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,De,,,,4.41,8,2.41
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89,24,6.91
4,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,2016-07-26,100.0,...,Dc,,,"De,Tn",,,,3.56,20,5.78


In [9]:
col         = 'confidenceScore'
confidence  = OCScoringDF2[col]

### one condition per integer bucket: [1,2) -> 1, [2,3) -> 2, ..., [9,10) -> 9,
### and exactly 10 -> 10. A value that matches no bucket (e.g. NaN) receives
### the default, NaN.
conditions  = [ (confidence >= low) & (confidence < low + 1) for low in range(1, 10) ]
conditions.append(confidence == 10)
choices     = list(range(1, 11))

OCScoringDF2["score"] = np.select(conditions, choices, default=np.nan)

### build the next frame without dropping confidenceScore from OCScoringDF2 in
### place: the in-place drop made this cell raise a KeyError when it was run
### a second time. The resulting frame has the same columns, in the same
### order, as before.
OCScoringDF3 = (
    OCScoringDF2
    .drop(columns=[col])
    .sort_values(by=['ID'])
    .reset_index(drop=True)
)

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
OCScoringDF3.info()
display(OCScoringDF3.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  384 non-null    int64  
 1   assignee_id         384 non-null    object 
 2   location_id         384 non-null    object 
 3   organization        384 non-null    object 
 4   city                384 non-null    object 
 5   state               384 non-null    object 
 6   city_latitude       379 non-null    float64
 7   city_longitude      379 non-null    float64
 8   dateOfFirstPat      384 non-null    object 
 9   nameScores          384 non-null    float64
 10  matchNames          384 non-null    object 
 11  subJurisCode        384 non-null    object 
 12  subCntlEntity       137 non-null    object 
 13  stateMatch          384 non-null    float64
 14  bchStatus           384 non-null    float64
 15  minIncDate          384 non-null    object 
 16  address_

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore,score
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,,,,Va,0.0,0.0,,3.11,30,8.0
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,"Ri,De",0.0,49.5,,1.78,32,9.0
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,De,,,,4.41,8,2.0
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89,24,6.0
4,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,2016-07-26,100.0,...,Dc,,,"De,Tn",,,,3.56,20,5.0


In [10]:
### columns kept in the final output, in output order. They are selected by
### name instead of by position so the selection stays correct if the column
### layout of the input ever changes (a missing column raises a KeyError
### rather than silently picking a different field).
final_columns = ['ID','assignee_id','location_id','organization','city','state',
                 'dateOfFirstPat','nameScores','matchNames','subJurisCode',
                 'subCntlEntity','stateMatch','bchStatus','minIncDate',
                 'address_city','address_state','agent_city','agent_state',
                 'data_city','data_state','orgLoc','totalScore','score']

finalScoresWoStates=OCScoringDF3.loc[:,final_columns].sort_values(by=['ID'])

### info() prints its report and returns None, so it is called directly
### rather than through display(), which would render a stray "None"
finalScoresWoStates.info()
display(finalScoresWoStates.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 384 entries, 0 to 383
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              384 non-null    int64  
 1   assignee_id     384 non-null    object 
 2   location_id     384 non-null    object 
 3   organization    384 non-null    object 
 4   city            384 non-null    object 
 5   state           384 non-null    object 
 6   dateOfFirstPat  384 non-null    object 
 7   nameScores      384 non-null    float64
 8   matchNames      384 non-null    object 
 9   subJurisCode    384 non-null    object 
 10  subCntlEntity   137 non-null    object 
 11  stateMatch      384 non-null    float64
 12  bchStatus       384 non-null    float64
 13  minIncDate      384 non-null    object 
 14  address_city    201 non-null    object 
 15  address_state   198 non-null    object 
 16  agent_city      53 non-null     object 
 17  agent_state     52 non-null     obj

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,matchNames,subJurisCode,...,minIncDate,address_city,address_state,agent_city,agent_state,data_city,data_state,orgLoc,totalScore,score
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,2001-01-30,100.0,The Egg Factory,Va,...,1997-12-22,Roanoke,Va,Roanoke,Va,,,Va,30,8.0
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,2018-10-16,100.0,Tank Vision,Ri,...,2017-01-03,Vineyard Haven,Ma,Providence,Ri,,,"Ri,De",32,9.0
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,2017-06-27,100.0,Infinibox,De,...,2013-01-31,,,,,,,De,8,2.0
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,1983-03-15,100.0,Agricultural Aviation Engineering Company,Ca,...,1958-05-01,Las Vegas,Nv,,,Las Vegas,Nv,Ca,24,6.0
4,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,2016-07-26,100.0,Fanamana,Tn,...,2013-01-02,,,,,,Dc,"De,Tn",20,5.0


In [11]:
### number of distinct (non-missing) assignee ids in the final output frame
finalScoresWoStates['assignee_id'].nunique(dropna=True)

384

In [12]:
### number of records at each final score, listed from the highest score down
score_counts = finalScoresWoStates['score'].value_counts()
score_counts.sort_index(ascending=False)

10.0     4
9.0     38
8.0     62
7.0     17
6.0     16
5.0     25
4.0     34
3.0     59
2.0     92
1.0     37
Name: score, dtype: int64

In [13]:
### checkpoint: write the scored records to a CSV file, leaving out the
### dataframe index
outpt_file = "OcResults1000ScoredData.csv"
res_folder = "../csvResults/"
a_full = os.path.join(res_folder, outpt_file)

finalScoresWoStates.to_csv(path_or_buf=a_full, index=False)