In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# pd.set_option('display.max_colwidth', None)

# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the stop-word lists, and the WordNet corpus. Each call reports its
# status on stderr/stdout and returns a bool (the last one is shown as the
# cell result).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/kasra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kasra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kasra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the freelance-projects dataset and drop every row that has a missing
# value in any column. The index is then reset so row labels are contiguous
# (0..n-1) and coincide with positional row numbers; without this, positions
# returned by array operations (e.g. argsort on a matrix built from this frame)
# would not match the frame's index labels. Building the frame in one chained
# expression also keeps the cell idempotent on re-run.
df = (
    pd.read_csv('./datasets/freelance-projects.csv')
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Shared NLTK helpers, built once so preprocess_text does not recreate them
# on every call.
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens found
    in ``stop_words`` are discarded; each remaining token is Porter-stemmed
    and the stem is then passed through the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw text to normalize (must support ``.lower()``).

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    normalized = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        # Stem first, then lemmatize the stem (same order as applied per token).
        normalized.append(lemmatizer.lemmatize(stemmer.stem(word)))

    return ' '.join(normalized)


# Replace every description with its preprocessed form. NOTE: this overwrites
# the column it reads from, so re-running the cell would feed already-processed
# text through preprocess_text a second time; reload df before re-running.
df['Description'] = df['Description'].apply(preprocess_text)

In [5]:
# Learn the TF-IDF vocabulary and IDF weights from the descriptions and encode
# them in a single pass. Calling fit() and then transform() on the same data
# gives the same matrix but analyzes every document twice.
vectorizer = TfidfVectorizer()
description_vector = vectorizer.fit_transform(df['Description'])

In [6]:
# Pairwise cosine similarity between every pair of description vectors.
description_similarity_matrix = cosine_similarity(description_vector)

query_index = 0   # row whose nearest neighbours are reported
n_closest = 10    # how many neighbours to keep

# All row positions ordered from most to least similar to the query row.
# These are positions into description_vector (use df.iloc, not df.loc).
related_indices = description_similarity_matrix[query_index].argsort()[::-1]

# Drop the query row explicitly rather than assuming it is ranked first:
# another row with an identical description ties with it at the top score, and
# argsort makes no promise about which of the tied rows comes first.
closest_indices = related_indices[related_indices != query_index][:n_closest]

print(f"Closest to index {query_index}: {closest_indices}")

Closest to index 0: [444 581 579 132 530 510 509 705 480 759]
