In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Show full cell contents in DataFrame output instead of truncating long text columns.
pd.options.display.max_colwidth = None

In [30]:
# Load the dataset catalogue from datasets.csv (path is relative to the working directory).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like a
# previously saved index — consider index_col=0 if nothing downstream needs it; confirm first.
datasets = pd.read_csv("datasets.csv")

In [31]:
# Preview the first rows to check the columns that were loaded.
datasets.head()

Unnamed: 0,Dataset Title,Overview,id
0,Agricultural Crop Yield Analysis,"A dataset containing historical crop yield data for various agricultural crops., agriculture, crop yield, agriculture data, agriculture statistics",1
1,Farm Equipment Usage Statistics,"Data on the usage of farm equipment and machinery in agriculture., agriculture, farm equipment, machinery, agriculture statistics",2
2,Soil Quality Measurements,"A collection of soil quality measurements and analysis results from agricultural fields., agriculture, soil quality, agriculture data, agriculture research",3
3,Irrigation Water Usage Trends,"Data on the trends in irrigation water usage in agriculture over the years., agriculture, irrigation, water usage, agriculture statistics",4
4,Crop Pest and Disease Records,"Records of pest and disease incidents in agricultural crops., agriculture, crop pests, diseases, agriculture data",5


In [32]:
# Replace missing "Overview" values with an empty string so every row holds text
# before the column is passed to the vectorizer.
datasets["Overview"] = datasets["Overview"].fillna('')

In [83]:
# Build the TF-IDF vectorizer: word-level 1- to 3-grams with accents stripped and
# English stop words removed; min_df=3 keeps only terms found in at least 3 documents.
tfv = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 3),
    stop_words="english",
    strip_accents="unicode",
    min_df=3,
    max_features=None,
)

In [84]:
# Fit the vectorizer on the "Overview" column and transform that text into a
# sparse TF-IDF matrix with one row per dataset.
tfv_matrix = tfv.fit_transform(datasets["Overview"])

In [87]:
# Inspect the matrix summary (shape, dtype, number of stored elements).
tfv_matrix

<40x33 sparse matrix of type '<class 'numpy.float64'>'
	with 210 stored elements in Compressed Sparse Row format>

In [85]:
# Compute the cosine similarity between every pair of datasets.
# Called with a single matrix, cosine_similarity compares its rows against themselves.
cos_sim = cosine_similarity(tfv_matrix)

In [86]:
# Similarity of the dataset in row 39 (the last of the 40 rows) to every dataset;
# the entry at position 39 is its similarity to itself.
cos_sim[39]

array([0.02800006, 0.20016712, 0.01472482, 0.33171022, 0.0203072 ,
       0.        , 0.02075624, 0.03328244, 0.        , 0.02553401,
       0.03169988, 0.        , 0.01391844, 0.03290229, 0.02639258,
       0.01758212, 0.        , 0.01182928, 0.02345602, 0.35919182,
       0.4341347 , 0.49043308, 0.54287725, 0.31868186, 0.42479788,
       0.22317823, 0.34351843, 0.52265197, 0.38113554, 0.44810864,
       0.3382512 , 0.4510665 , 0.31249892, 0.35817201, 0.72368689,
       0.37160245, 0.20529618, 0.27100939, 0.13365889, 1.        ])

In [96]:
# Create a reverse mapping from dataset title to the row index in `datasets`.
indices = pd.Series(datasets.index, index=datasets["Dataset Title"])
# Series.drop_duplicates() compares the *values* (the row index labels, which are already
# unique), so it never removed repeated titles. De-duplicate on the title labels instead,
# keeping the first occurrence, so that indices[title] always yields a single scalar that
# can be used to select one row of cos_sim.
indices = indices[~indices.index.duplicated(keep="first")]

In [102]:
# Preview the first entries of the title -> row index mapping.
# (`indices1` was never defined in this notebook; the mapping is named `indices`.)
indices.head()

Dataset Title
Agricultural Crop Yield Analysis    0
Farm Equipment Usage Statistics     1
Soil Quality Measurements           2
Irrigation Water Usage Trends       3
Crop Pest and Disease Records       4
dtype: int64

In [88]:
# Look up the row index of one dataset by its title.
indices["Technology Trends Analysis"]

19

In [89]:
# Similarity of row 19 to every dataset; 19 is the row index that the
# indices["Technology Trends Analysis"] lookup returned.
cos_sim[19]

array([0.16206983, 0.07131406, 0.12149777, 0.06166652, 0.0206611 ,
       0.15076719, 0.02111797, 0.03386246, 0.10530433, 0.025979  ,
       0.08369286, 0.13179012, 0.014161  , 0.03347569, 0.02685253,
       0.01788852, 0.        , 0.05042702, 0.02386479, 1.        ,
       0.16269137, 0.49897995, 0.3503026 , 0.60226098, 0.43220091,
       0.38393567, 0.34950499, 0.37218836, 0.70137839, 0.45591792,
       0.0177611 , 0.07109581, 0.01640888, 0.        , 0.25045639,
       0.03819408, 0.        , 0.01423032, 0.07223632, 0.35919182])

In [95]:
# Pair each row number with its similarity to "Technology Trends Analysis".
[
    (row, score)
    for row, score in enumerate(cos_sim[indices["Technology Trends Analysis"]])
]

[(0, 0.1620698267863877),
 (1, 0.07131405517640607),
 (2, 0.12149776554622396),
 (3, 0.06166652268117996),
 (4, 0.020661098498155277),
 (5, 0.15076719344836415),
 (6, 0.02111796640647431),
 (7, 0.03386245782743031),
 (8, 0.10530432622822877),
 (9, 0.025979000707892615),
 (10, 0.08369286042114904),
 (11, 0.13179011816948807),
 (12, 0.014160997611208564),
 (13, 0.033475688453915915),
 (14, 0.02685253074309168),
 (15, 0.017888521961657602),
 (16, 0.0),
 (17, 0.05042702019089328),
 (18, 0.02386479070040818),
 (19, 1.0),
 (20, 0.16269137468329353),
 (21, 0.4989799468920756),
 (22, 0.35030259833118255),
 (23, 0.602260984188812),
 (24, 0.4322009112527446),
 (25, 0.38393567432982956),
 (26, 0.34950498978947414),
 (27, 0.37218836241287784),
 (28, 0.701378387422296),
 (29, 0.4559179185848733),
 (30, 0.01776109543811648),
 (31, 0.07109581478947077),
 (32, 0.01640887923155652),
 (33, 0.0),
 (34, 0.25045638903940726),
 (35, 0.038194075519841275),
 (36, 0.0),
 (37, 0.014230322569203716),
 (38, 0.072

In [93]:
# Rank every dataset by its similarity to "Technology Trends Analysis", most similar first.
# The top entry is the query dataset itself, with a similarity of 1.0.
sorted(
    enumerate(cos_sim[indices["Technology Trends Analysis"]]),
    key=lambda pair: pair[1],
    reverse=True,
)

[(19, 1.0),
 (28, 0.701378387422296),
 (23, 0.602260984188812),
 (21, 0.4989799468920756),
 (29, 0.4559179185848733),
 (24, 0.4322009112527446),
 (25, 0.38393567432982956),
 (27, 0.37218836241287784),
 (39, 0.3591918152736984),
 (22, 0.35030259833118255),
 (26, 0.34950498978947414),
 (34, 0.25045638903940726),
 (20, 0.16269137468329353),
 (0, 0.1620698267863877),
 (5, 0.15076719344836415),
 (11, 0.13179011816948807),
 (2, 0.12149776554622396),
 (8, 0.10530432622822877),
 (10, 0.08369286042114904),
 (38, 0.07223632482851024),
 (1, 0.07131405517640607),
 (31, 0.07109581478947077),
 (3, 0.06166652268117996),
 (17, 0.05042702019089328),
 (35, 0.038194075519841275),
 (7, 0.03386245782743031),
 (13, 0.033475688453915915),
 (14, 0.02685253074309168),
 (9, 0.025979000707892615),
 (18, 0.02386479070040818),
 (6, 0.02111796640647431),
 (4, 0.020661098498155277),
 (15, 0.017888521961657602),
 (30, 0.01776109543811648),
 (32, 0.01640887923155652),
 (37, 0.014230322569203716),
 (12, 0.0141609976112