Emad Siddiq

University of California Berkeley 

emadsiddiq@berkeley.edu 

Undergraduate Research Apprentice Program

# <center>Trademark Dataset Analysis</center>

### Notes:
• Analysis on the USPTO dataset obtained from the [USPTO website](https://www.uspto.gov/learning-and-resources/electronic-data-products/trademark-case-files-dataset-0).

• SEC dataset obtained from the Haas School of Business through Dr. Su Li. The file can be found in the Dropbox folder.

• TF-IDF methodology referenced from [this link](https://bergvca.github.io/2017/10/14/super-fast-string-matching.html) 
and adapted to match the USPTO dataset with the help of [this Stack Overflow post](https://stackoverflow.com/questions/53827339/string-matching-using-tf-idf-ngrams-and-cosine-similarity-in-python).

### Methodology:

1) Download owner, case_file and prior_mark files from USPTO website.

2) Merge owner, relevant columns from case_file, and prior_mark documents on their serial number column.

3) Count the prior marks for each serial number and add the total as a new column (`prior_marks`) to the USPTO dataset.

4) Use a string-matching algorithm (TF-IDF) to find companies from the SEC list that are part of the USPTO dataset.



## Part 1: Downloading data

In [1]:
import pandas as pd

In [2]:
# Load the three raw USPTO tables.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the output left under this cell looks
# like the tail of a mixed-dtype warning, which this setting avoids — confirm.
owner = pd.read_csv('owner.csv', low_memory=False)
case_file = pd.read_csv('case_file.csv', low_memory=False)
prior_mark = pd.read_csv('prior_mark.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Keep only the owner fields used later: address, city, entity/type codes,
# owner name, postal code and the serial number that acts as the join key.
owner_columns = [
    'own_addr_1',
    'own_addr_city',
    'own_entity_cd',
    'own_name',
    'own_type_cd',
    'own_addr_postal',
    'serial_no',
]
owner = owner[owner_columns]

In [4]:
# Keep only the case-file fields used later: the serial number (join key) plus
# the status/date columns listed below.
case_file_columns = [
    'serial_no',
    'abandon_dt',
    'amend_reg_dt',
    'reg_cancel_cd',
    'reg_cancel_dt',
    'file_location',
    'filing_dt',
    'registration_dt',
    'renewal_dt',
]
case_file = case_file[case_file_columns]

In [5]:
# Collapse prior_mark to one row per serial number; count() tallies the
# non-null entries of every other column within each serial_no group.
prior_mark = (
    prior_mark
    .groupby('serial_no')
    .count()
    .reset_index()
)

In [6]:
prior_mark.columns

Index(['serial_no', 'prior_no', 'prior_type_cd', 'rec_error'], dtype='object')

In [7]:
# Only one count column is needed: discard the other two and give the
# remaining one a readable name.
prior_mark = (
    prior_mark
    .drop(columns=['rec_error', 'prior_type_cd'])
    .rename(columns={'prior_no': 'prior_marks'})
)

In [8]:
#merge owner and casefile
# The join key is named once (it was previously listed twice in `on`), and the
# default inner join is spelled out so the row-dropping behaviour is visible.
merged = owner.merge(case_file, on='serial_no', how='inner')

In [9]:
#merge prior_mark, owner and case_file
# The join key is named once (it was previously listed twice in `on`).
# NOTE(review): this is an inner join, so every serial number that has no row in
# prior_mark is dropped from the dataset. If trademarks without prior marks
# should be kept (with a count of 0), use how='left' and fill the missing
# prior_marks with 0 — confirm which is intended.
merged = merged.merge(prior_mark, on='serial_no', how='inner')

In [10]:
#rename to trademark because merged is an awkward name to have
trademark = merged

In [11]:
#export to a csv file
# index=False: the row index carries no information here, and writing it would
# add a spurious unnamed first column when the file is read back in Part 2.
trademark.to_csv('USPTO_clean_data_1870_to_2019.csv', index=False)

# Part 2: Use TF-IDF to extract company names

In [1]:
import pandas as pd
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn.sparse_dot_topn as ct
from scipy.sparse import csr_matrix
from sparse_dot_topn import awesome_cossim_topn
import numpy as np

In [13]:
# Reload the SEC company list and the cleaned USPTO table written in Part 1.
# low_memory=False makes pandas infer each column's dtype from the whole file.
# NOTE(review): the output left under this cell looks like the tail of a
# mixed-dtype warning, which this setting avoids — confirm.
sec = pd.read_csv('sec_list.csv', low_memory=False)
trademark = pd.read_csv('USPTO_clean_data_1870_to_2019.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
#Regex cleaning of both trademark['own_name'] and sec['companyName']

#lowercase, then strip surrounding whitespace
trademark['own_name_clean'] = trademark['own_name'].str.lower().str.strip()
sec['company_clean'] = sec['companyName'].str.lower().str.strip()


#remove anything that is not a letter, a digit (\d), or a whitespace character (\s)
re_remove = r'[^a-zA-Z\d\s]'

#remove " inc" and anything that follows it
# NOTE(review): after re_remove the punctuation is already gone, so this (like the
# ltd/corp patterns below) truncates at ANY word that starts with the suffix,
# e.g. " incorporated" but also " income ..." — confirm that is acceptable.
remove_inc = r' inc.*'

#remove " ltd" and anything that follows it
remove_ltd = r' ltd.*'

#remove " corp" and anything that follows it
remove_corp = r' corp.*'

#matches " co." -- defined for reference but NOT applied below
remove_co = r' co\.'

#match 'the' at the beginning or the end of the name -- defined but NOT applied below
remove_the_end = r'the$'
remove_the_beg = r'^the'

# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would turn every replacement below into
# a no-op. The order matters: punctuation is stripped first, then the suffixes.
for pattern in (re_remove, remove_inc, remove_ltd, remove_corp):
    sec['company_clean'] = sec['company_clean'].str.replace(pattern, '', regex=True)
    trademark['own_name_clean'] = trademark['own_name_clean'].str.replace(pattern, '', regex=True)

In [15]:
"""
Function to create ngrams from string entry. 
"An n-gram is a contiguous sequence of n items from a given sample of text or speech."
"""

def ngrams(string, n=3):
    """Split *string* into its overlapping character n-grams.

    Commas, hyphens, periods, slashes and any whitespace-prefixed "BD" are
    removed first; the n-grams are then taken from the remaining text.

    Args:
        string: Text to split.
        n: Length of each n-gram (default 3).

    Returns:
        List of n-character substrings in order of appearance; empty when the
        cleaned text is shorter than ``n``.
    """
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    # Zipping the text against copies of itself shifted by 0..n-1 characters
    # lines up every run of n consecutive characters.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

In [16]:
"""
Use term frequency inverse document frequency to get matches. 
Read more on this link:
https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

"""

def get_matches_df(sparse_matrix, A, B, top=100):
    """Unpack a sparse similarity matrix into a table of matched pairs.

    Args:
        sparse_matrix: SciPy sparse matrix whose entry (i, j) is the similarity
            between the i-th item of ``A`` and the j-th item of ``B``.
        A: Sequence (list, array or pandas Series) of row labels, addressed by
            position.
        B: Sequence of column labels, addressed by position.
        top: Maximum number of matches to return. A falsy value (0 or None)
            returns every non-zero entry. Values larger than the number of
            available matches are clamped rather than raising IndexError.

    Returns:
        DataFrame with columns ``left_side``, ``right_side`` and
        ``similairity`` (the spelling is kept because callers rename it).
    """
    # COO form exposes row, column and value arrays that are aligned entry by
    # entry, so a score can never be paired with the wrong coordinates.
    coo = sparse_matrix.tocoo()
    # Same filter nonzero() applies: ignore explicitly stored zeros.
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    nr_matches = min(top, rows.size) if top else rows.size

    # Plain object arrays give positional lookup; indexing a pandas Series with
    # an integer would be label-based and break on a non-default index.
    left_side = np.asarray(A, dtype=object)[rows[:nr_matches]]
    right_side = np.asarray(B, dtype=object)[cols[:nr_matches]]
    similairity = scores[:nr_matches].astype(float)

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

In [17]:
# Build the TF-IDF vocabulary from the cleaned SEC names only; `ngrams` is
# passed as the analyzer, so each name is tokenised by that function.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(sec['company_clean'])
# The trademark owner names are projected onto that same fitted vocabulary
# (transform, not fit_transform), so both matrices share their columns.
tf_idf_matrix_dirty = vectorizer.transform(trademark['own_name_clean'])

In [18]:
# Time the sparse similarity search between the trademark-owner matrix and the
# transposed SEC matrix.
t1 = time.time()
# NOTE(review): the positional 1 and 0 are presumably ntop (matches kept per
# row) and lower_bound (minimum similarity) — confirm against sparse_dot_topn.
matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)
t = time.time()-t1
# Elapsed wall-clock time in seconds.
print("SELFTIMED:", t)

SELFTIMED: 93.67338681221008


In [19]:
matches_df = get_matches_df(matches, trademark['own_name_clean'], sec['company_clean'], top=0)

In [20]:
#Rename for more clarity
matches_df = matches_df.rename(columns = {"left_side": "own_name_clean",
                                          "right_side": "company_name_clean",
                                          "similairity": "similarity"})

In [21]:
# Report how many matched pairs were produced (leading space added so the
# count and the message no longer run together).
print(str(len(matches_df)) + " is the current length of the dataset")

3055199is the current length of the dataset


In [22]:
#Drop duplicates based on own_name_clean since they are redundant
matches_df = matches_df.drop_duplicates(subset = 'own_name_clean')

In [23]:
#Export resulting dataframe to csv for safekeeping
matches_df.to_csv('Matches of SEC list and Trademark data.csv')

### Merge trademark with our matches and the SEC companies list

In [24]:
# Attach the best SEC match and its similarity to every trademark row that
# shares the cleaned owner name. The join key is named once (it was previously
# listed twice in `on`); the default inner join is kept and made explicit.
trademark = trademark.merge(matches_df, on='own_name_clean', how='inner')

In [25]:
# Attach the SEC rows whose cleaned name equals the matched cleaned name.
# NOTE(review): sec['company_clean'] is not de-duplicated in the visible code,
# so a trademark row is repeated once per SEC row sharing that cleaned name —
# confirm this is intended.
trademark = trademark.merge(sec, left_on = 'company_name_clean', right_on = 'company_clean')

In [26]:
trademark = trademark.drop(columns = ['company_clean'])

In [27]:
trademark = trademark.rename(columns = {"companyName": "SEC company name"})

In [28]:
#export trademark to csv
trademark.to_csv('USPTO_Trademark_Similarity.csv')

In [29]:
## Now that we have our matches, we match them with our original dataset and choose a similarity threshold by inspecting random samples to make sure they really match

In [30]:
company_output = trademark[['own_name', 'SEC company name', 'similarity']]

In [31]:
company_output = company_output.drop_duplicates(subset = ['own_name'])

In [32]:
company_output = company_output.reset_index(drop = True)

In [33]:
company_output.to_csv('Company List matched.csv')

In [34]:
over_85_percent = company_output.loc[company_output['similarity'] > 0.85].drop_duplicates()

In [35]:
over_85_percent = over_85_percent.sort_values(by = 'similarity', ascending = True)

In [36]:
over_85_percent = over_85_percent.reset_index(drop = True)

In [37]:
len(over_85_percent['own_name'])

8626

In [38]:
over_85_percent.to_csv('company_list_over_85percent.csv', index = False)

In [39]:
over_85_percent.sample(30)

Unnamed: 0,own_name,SEC company name,similarity
5619,"CenturyLink, Inc.","Centurylink, Inc",1.0
6997,TransAlta Corporation,TRANSALTA CORP,1.0
3110,Thoratec Corporation,Thoratec Corp,1.0
4909,"MERU NETWORKS, INC.",Meru Networks Inc,1.0
5657,"LG Display Co., Ltd.","LG Display Co., Ltd.",1.0
1970,Hewitt Associates LLC,HEWITT ASSOCIATES INC,0.938469
4134,"PRIMERICA, INC.","Primerica, Inc.",1.0
3746,Buckeye Technologies Inc.,BUCKEYE TECHNOLOGIES INC,1.0
8136,Rally Software Development Corporation,Rally Software Development Corp,1.0
6008,"NL INDUSTRIES, INC.",Nl Industries Inc,1.0


In [40]:
over = over_85_percent.groupby('SEC company name')

In [41]:
over = over[['SEC company name', 'own_name', 'similarity']]

In [42]:
over.apply(pd.DataFrame.sort_values, 'similarity').reset_index(
    drop = True).drop_duplicates().to_csv(
    'List_for_manual_matching.csv')

In [3]:
manu_match = pd.read_csv('manually_matched.csv')

In [9]:
manu_match.loc[manu_match['SEC company name'].str.contains('Ltd') & manu_match['own_name'].str.contains('Inc'), 'Keep? 0 or 1'] = 0

In [12]:
manu_match.loc[manu_match['SEC company name'].str.contains('Inc') & manu_match['own_name'].str.contains('Ltd'), 'Keep? 0 or 1'] = 0

In [16]:
final_company_list = manu_match.loc[manu_match['Keep? 0 or 1'] == 1]

In [18]:
final_company_list = final_company_list.drop(columns = ['Unnamed: 0', 'Unnamed: 5'])

In [21]:
final_company_list.to_csv('final_company_list.csv')

## The next part of the data analysis includes downloading the daily XML files from the USPTO database


The USPTO publishes its Daily Trademark Application Files in an online database, stored in bulk as XML files grouped by year. The purpose of accessing the daily XML files was to gather the case-file statements, in the hope of getting more insight into why a company files trademark applications. To extract the case-file statements, each XML file had to be downloaded individually, and the relevant data then extracted and converted to CSV format. The code that follows does that. Note that the script was run on a cloud server to avoid interruption.

In [1]:
import io,re,os
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import zipfile
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
import time


def get_zip_links(tm_url):
    """Collect the download URL of every ``.zip`` file linked from a page.

    Args:
        tm_url: URL of the HTML listing page to scan.

    Returns:
        List of strings, each the download host prefix followed by an anchor's
        ``href`` ending in ``.zip``. Empty when the page could not be fetched.
    """
    dl_url = 'http://trademarks.reedtech.com/'
    r = rget(tm_url)
    if r == -1:
        # rget reports a failed request with -1 (and has already printed the
        # error); without this guard r.content would raise AttributeError.
        return []
    soup = BeautifulSoup(r.content,'html5lib')
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError in the lookup below.
    links = soup.findAll('a', href=True)
    return [dl_url + link['href'] for link in links if link['href'].endswith('.zip')]

def download(link):
    """Download a zip archive and extract it into the current directory.

    The XML file is assumed to carry the archive's name with an ``xml``
    extension. If a file of that name already exists in the current directory,
    nothing is downloaded.

    Args:
        link: URL of the ``.zip`` file.

    Returns:
        Path of the XML file (current working directory + "/" + name), or -1
        when the request failed or the response was not a valid zip archive.
    """
    file_name = link.split('/')[-1]
    # Swap the trailing "zip" for "xml".
    xml_name = file_name[:-3] + "xml"
    xml_path = str(os.getcwd()) + "/" + xml_name
    if os.path.isfile(xml_name):
        return xml_path
    r = rget(link)
    if r == -1:
        return -1
    try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall()
    except zipfile.BadZipFile as e:
        # e.g. an HTML error page served in place of the archive; report it
        # with the same -1 sentinel used for failed requests.
        print(e)
        return -1
    return xml_path


def rget(link, delay=3, timeout=60):
    """Fetch *link* with ``requests.get`` after a short pause.

    Args:
        link: URL to request.
        delay: Seconds to sleep before the request (default 3, as before).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker indefinitely.

    Returns:
        The ``requests`` response object, or -1 if the request raised a
        ``RequestException`` (the exception is printed).
    """
    time.sleep(delay)
    try:
        r = requests.get(link, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(e)
        return -1
    return r

def parse_xml(xmlfile):
    """Parse *xmlfile* with ElementTree and return the document's root element."""
    return ET.parse(xmlfile).getroot()


def extract(root):
    """Collect the serial number and first statement text of every case file.

    Looks under ``application-information/file-segments/action-keys`` for
    ``case-file`` elements and, for each one, reads ``serial-number`` and the
    ``text`` of its first ``case-file-statements/case-file-statement``.

    Case files missing either element are skipped instead of raising
    AttributeError, and an empty first statement no longer causes the whole
    document to be discarded.

    Args:
        root: Root element of a parsed XML document.

    Returns:
        List of ``[serial_number, statement_text]`` pairs; empty when the
        expected container element is absent.
    """
    action_keys = root.find('application-information/file-segments/action-keys')
    if action_keys is None:
        return []

    trademark = []
    for case_file in action_keys.findall('case-file'):
        serial = case_file.find('serial-number')
        statement = case_file.find('case-file-statements/case-file-statement/text')
        if serial is None or statement is None:
            # Nothing to record for a case file without a serial or statement.
            continue
        trademark.append([serial.text, statement.text])
    return trademark

def delete_file(file_dir):
    """Delete the file at path *file_dir*."""
    os.unlink(file_dir)


def d_and_e(link):
    """Download one archive, extract its case-file statements, then clean up.

    Args:
        link: URL of the ``.zip`` file to process.

    Returns:
        The list produced by ``extract`` for this file, or an empty list when
        the download failed.
    """
    start_time = time.time()
    file_name = link.split('/')[-1]
    directory = download(link)
    if directory == -1:
        # download() signals failure with -1; passing that on to the XML
        # parser would only raise a confusing error.
        print("Skipping " + file_name + ": download failed")
        return []
    root = parse_xml(directory)
    trademark_data = extract(root)
    delete_file(directory)
    # Elapsed time is end minus start (the operands were reversed before,
    # which printed a negative duration).
    elapsed = time.time() - start_time
    print("Done with " + file_name + " in "  + str(elapsed/60) + " minutes")
    return trademark_data

In [None]:
#Main process to download the files using the above functions
links = get_zip_links('http://trademarks.reedtech.com/tmappxml.php#2019/')
# De-duplicate and sort so the files are submitted in a stable order.
links = sorted(set(links))


def _links_for_year(all_links, prefix):
    """Return the links whose file name starts with *prefix* (e.g. "apc19")."""
    return [url for url in all_links if url.split('/')[-1].startswith(prefix)]


def _collect_reasons(year_links, max_workers, csv_name):
    """Run d_and_e over *year_links* in a process pool and save the rows.

    A file that raises is reported and skipped, so one bad download or
    malformed XML no longer throws away the results of the whole year.
    Returns the DataFrame that was written to *csv_name*.
    """
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(d_and_e, url): url for url in year_links}
        for future in as_completed(futures):
            try:
                results += future.result()
            except Exception as e:
                print("Failed on " + futures[future] + ": " + repr(e))
    reasons = pd.DataFrame(results, columns = ["serial_number", "reason"])
    reasons.to_csv(csv_name)
    return reasons


links2019 = _links_for_year(links, "apc19")
links2018 = _links_for_year(links, "apc18")
links2017 = _links_for_year(links, "apc17")

# Worker counts are the ones used originally (6, 6 and 4).
Reasons2019 = _collect_reasons(links2019, 6, "Reasons2019.csv")
Reasons2018 = _collect_reasons(links2018, 6, "Reasons2018.csv")
Reasons2017 = _collect_reasons(links2017, 4, "Reasons2017.csv")


The case-file statements were compiled into one file but never analyzed. All the files can be found in the Dropbox folder.