In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally; the English list
# from it is what the TF-IDF step filters on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/joao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the company records from data.csv (path is relative to the working directory).
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the
# CSV was saved with its index; consider index_col=0 if that column is not wanted
# downstream — confirm before changing, since it alters the exported columns.
df = pd.read_csv('data.csv')

In [3]:
# Display the loaded frame to inspect its columns and a sample of its rows.
df

Unnamed: 0,name,description,employees,total_funding,city,subcountry,lat,lng
0,fitin,fitin is the latest fit tech platform that pro...,1.0,,toronto,ontario,43.700109999999995,-79.4163
1,quinditech,our services include independent & collaborati...,10.0,,montreal,quebec,45.50884,-73.58781
2,botfirst,chatlanding is a digital chatbot platform wher...,1.0,,toronto,ontario,43.700109999999995,-79.4163
3,bravver,ai-powered army of health professionals in you...,1.0,,quebec,quebec,46.8259601,-71.23522259999999
4,stylify,stylify is a mobile platform that connects peo...,1.0,,toronto,ontario,43.700109999999995,-79.4163
...,...,...,...,...,...,...,...,...
9569,understoodit,understoodit is a simple web-based tool that a...,1.0,,toronto,ontario,43.700109999999995,-79.4163
9570,managinglife,chronic pain affects 1 in 5 people and drives ...,1.0,,toronto,ontario,43.700109999999995,-79.4163
9571,venuespace,finding an ideal venue to host an event is an ...,1.0,,toronto,ontario,43.700109999999995,-79.4163
9572,fanzine by picmile network,fanzine is a photo saving and sharing platform...,1.0,,montreal,quebec,45.50884,-73.58781


In [4]:
# Preprocess the description text: lower-case it, then run unidecode to
# transliterate special (non-ASCII) characters.
# Rows without a description are dropped first, in place, so every remaining
# value can be treated as text.
df.dropna(subset=["description"], inplace=True)
description_lower = df["description"].str.lower()
df["description"] = description_lower.map(unidecode)

In [5]:
# Reference phrases used to compute similarity: the strings requested in the
# assignment document, with stop words already removed.
strings_water_treatment = [
    "water treatment", "waste water", "water quality", "water use",
    "water contamination", "water human consumption", "water resources",
]

In [6]:
# Remove English stop words and TF-IDF-vectorize every description together
# with the reference phrases.
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)

# The reference phrases go first so they occupy the leading rows of the
# fitted matrix; the descriptions follow in frame order.
all_texts = [*strings_water_treatment, *df["description"]]
tfidf_matrix = vectorizer.fit_transform(all_texts)

In [7]:
# Compute the similarity between the two groups of vectors.
# The first rows of the TF-IDF matrix belong to the reference phrases and the
# remaining rows to the descriptions, so one boundary splits them.
n_reference_strings = len(strings_water_treatment)
strings_water_tfidf = tfidf_matrix[:n_reference_strings]
description_tfidf = tfidf_matrix[n_reference_strings:]

# Result: one row per description, one column per reference phrase.
similarities = cosine_similarity(description_tfidf, strings_water_tfidf)

In [8]:
# Similarity cut-off. The value is low because the score being compared is a
# mean over all reference phrases rather than a single best match. A
# description that is similar only to "water treatment" could be matching
# merely on the word "treatment"; averaging over every phrase is meant to
# ensure the description is in line with what the challenge asked for.
similarity_threshold = 0.2

In [9]:
# Collapse each row of the similarity matrix to its mean, giving a single
# averaged score per row.
avg_similarities = np.mean(similarities, axis=1)

In [10]:
# Boolean mask (despite the name, not integer indices): True where the
# averaged score is strictly above the threshold.
similar_indices = np.greater(avg_similarities, similarity_threshold)

# Keep only the rows of df selected by the mask.
similar_descriptions = df.loc[similar_indices]

In [11]:
# Display the rows that passed the similarity filter.
similar_descriptions

Unnamed: 0,name,description,employees,total_funding,city,subcountry,lat,lng
47,abanban,for people on the go who want to have easy acc...,1.0,,toronto,ontario,43.70011,-79.4163
369,aqua air 247,"o waste r/o out door water vending, electronic...",1.0,,kelowna,british columbia,49.88307,-119.48568
608,taphax,taphax is a real time water consumption tool t...,0.0,,kitchener,ontario,43.42537,-80.5112
1588,ontario safety standards,"water purification services, heating & air con...",0.0,,ottawa,ontario,45.41117,-75.69811999999997
2130,z3 controls,z3 controls inc.is an advanced energy technolo...,0.0,,markham,ontario,43.86682,-79.2663
2163,canadian comfort home services,"hvac, water heaters, water filtration, air pur...",50.0,,toronto,ontario,43.70011,-79.4163
3836,noddis water security,this is a custom designed water treatment plan...,0.0,,calgary,alberta,51.05011,-114.08529
4284,alert labs,alert labsaEUR(tm) insight series protects our...,10.0,,kitchener,ontario,43.42537,-80.5112
4958,steam,steam puts the smart into water heating. water...,1.0,,montreal,quebec,45.50884,-73.58781
4992,aquatic informatics,aquatic informatics provides leading software ...,0.0,,vancouver,british columbia,49.24966,-123.11934


In [12]:
# Print the name and the full description of every selected company, with a
# separator line after each one, so the matches can be read without the
# truncation applied by the table display.
for company_name, company_description in zip(
    similar_descriptions["name"], similar_descriptions["description"]
):
    print("Company Name:", company_name)
    print("Description:", company_description)
    print("-----------------------------------")

Company Name: abanban
Description: for people on the go who want to have easy access to small items like pens and earbuds, the kombine by abanban is a great product. compared to a regular water bottle, it has the benefit of keeping the things you need close to you, in one place that is easily accessible. the kombine is a water bottle with an easy-to-access storage compartment that can hold 650ml of water, pencils, pens, erasers, earbuds, snacks like granola bars, coins, sticky notes, lip balm, keys and other small items. on top of that, the kombine is easy to carry around with its sturdy handle and convenient size. there is no water bottle on the market that has a storage compartment that is as convenient to access and is able to store so many different items.
-----------------------------------
Company Name: aqua air 247 
Description: o waste r/o out door water vending, electronically controlled, environmentally friendly. provides pure r/o and alkiline water direct to the consumer in 

In [13]:
# Write the filtered rows to a CSV file in the working directory; index=False
# leaves the DataFrame index out of the file.
# NOTE(review): the "Unnamed: 0" column shown in the displayed frame is a regular
# column, so it is presumably still written — confirm that is intended.
similar_descriptions.to_csv("water_companies_by_similarities.csv", index=False)