In [26]:
import pandas as pd
from rapidfuzz import process, fuzz

# Read both company spreadsheets and echo each one as it is loaded.
df1 = pd.read_excel("data/companies_data_1.xlsx")
print("Dataframe 1:", df1, sep="\n")

df2 = pd.read_excel("data/companies_data_2.xlsx")
print("Dataframe 2:", df2, sep="\n")

# Stack the two frames vertically; ignore_index renumbers the rows 0..n-1.
companies = pd.concat(objs=[df1, df2], ignore_index=True)
print("Merged Comapnies:", companies, sep="\n")

# From here on `companies` is a plain list of the non-missing values in the
# "Company" column rather than a DataFrame.
company_column = companies["Company"]
companies = company_column.dropna().tolist()

threshold = 60  # minimum token_sort_ratio score for two names to be grouped
deduped = []

# Greedy clustering: take the first remaining name, pull every remaining name
# that scores at or above the threshold into its group, keep one
# representative for the group, and repeat until no names are left.
while companies:
    name = companies.pop(0)
    similar = [name]

    # limit=None is required here: process.extract returns at most 5 results
    # by default, which would leave any further near-duplicates in the list
    # to be emitted later as separate "unique" companies.
    matches = process.extract(
        name,
        companies,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
        limit=None,
    )

    # Each result is a (choice, score, index) tuple, sorted by score.
    similar.extend(match[0] for match in matches)

    # Drop the grouped names by position in a single pass instead of calling
    # list.remove (a linear scan by value) once per match.
    matched_positions = {match[2] for match in matches}
    companies = [
        candidate
        for position, candidate in enumerate(companies)
        if position not in matched_positions
    ]

    # Keep the longest name as the most complete spelling; on a tie max()
    # keeps the earliest entry in `similar`.
    best_name = max(similar, key=len)
    deduped.append(best_name)

# Wrap the surviving names in a single-column frame, show it, and write it
# to an Excel workbook without the row index.
cleaned_df = pd.DataFrame(data={"Company": deduped})
print("Cleaned Dataframe:", cleaned_df, sep="\n")

cleaned_df.to_excel(excel_writer="data/cleaned_companies.xlsx", index=False)

print("✅ Deduplicated companies saved to data/cleaned_companies.xlsx")

Dataframe 1:
           Company
0       Apple Inc.
1       Google LLC
2  Microsoft Corp.
3       Amazon.com
4     Tesla Motors
Dataframe 2:
              Company
0  Apple Incorporated
1   Alphabet (Google)
2           Microsoft
3              Amazon
4           Tesla Inc
Merged Comapnies:
              Company
0          Apple Inc.
1          Google LLC
2     Microsoft Corp.
3          Amazon.com
4        Tesla Motors
5  Apple Incorporated
6   Alphabet (Google)
7           Microsoft
8              Amazon
9           Tesla Inc
Cleaned Dataframe:
              Company
0  Apple Incorporated
1          Google LLC
2     Microsoft Corp.
3          Amazon.com
4        Tesla Motors
5   Alphabet (Google)
6           Tesla Inc
✅ Deduplicated companies saved to data/cleaned_companies.xlsx


Deduplicated Company Names:
['Google Inc.', 'Alphabet Inc.', 'Microsoft Corporation', 'Apple']
