In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing necessary libraries

In [None]:
import pandas as pd
import re
import nltk
import math
import numpy as np
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer




## Loading the datasets

In [None]:
# Load the train and test datasets from CSV files stored on the mounted Google Drive
train_df =pd.read_csv("/content/drive/MyDrive/Data3/train.csv")
test_df =pd.read_csv("/content/drive/MyDrive/Data3/test.csv")

In [None]:
# Print the first rows (head) of the train and test datasets
print(train_df.head())
print(test_df.head())


                     title                                        description
0                   Tsotsi  south african hoodlum named tsotsi presley chw...
1  Abducted in Plain Sight  year old girl abducted small church going comm...
2        My Life Is Murder  private investigator alexa crow always fight g...
3                   Empire  hip hop artist ceo empire entertainment luciou...
4              Latter Days  aaron davis steve sandvoss young mormon arrive...
                   title                                        description
0  Splitting Up Together  ellen degeneres serf executive producer comedy...
1               The Trip  collage film image ambient dance sound jacques...
2   The Kids Are Alright  set ensemble comedy follows traditional irish ...
3               Breeders  breeder explores paradox experienced nearly pa...
4   Let the Right One In  life mark daughter eleanor forever changed yea...


## Data preprocessing

In [None]:
# Download the NLTK resources needed below: the stopword list and the WordNet
# data used by the lemmatizer.
# NOTE(review): 'punkt' is downloaded but the visible code tokenizes with
# str.split(), so it appears unused here - confirm before removing.

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set of English stopwords (a set gives fast membership tests in preprocess_text)
stop_words = set(stopwords.words('english'))
# Initialize the lemmatizer (shared by every preprocess_text call)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmatized terms.

    Non-string input (e.g. NaN from pandas) is treated as an empty string.
    The text is lower-cased, non-word characters become spaces, digit runs
    are dropped, whitespace is collapsed, English stopwords are removed and
    every remaining token is lemmatized.

    Args:
        text: Raw text, or any non-string value.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    cleaned = text.lower() if isinstance(text, str) else ""

    # Applied in order: punctuation -> space, digits -> removed, whitespace -> single space.
    substitutions = ((r'\W', ' '), (r'\d+', ''), (r'\s+', ' '))
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = cleaned.strip().split()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    return ' '.join(lemmas)


# Apply preprocessing to the 'description' column in the train and test datasets.
# NOTE: the raw descriptions are overwritten in place; re-run the CSV loading
# cell to get the original text back.

train_df['description'] = train_df['description'].apply(preprocess_text)
test_df['description'] = test_df['description'].apply(preprocess_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Print the first few rows of the preprocessed train and test dataframes
print(train_df.head())
print(test_df.head())


                     title                                        description
0                   Tsotsi  south african hoodlum named tsotsi presley chw...
1  Abducted in Plain Sight  year old girl abducted small church going comm...
2        My Life Is Murder  private investigator alexa crow always fight g...
3                   Empire  hip hop artist ceo empire entertainment luciou...
4              Latter Days  aaron davis steve sandvoss young mormon arrive...
                   title                                        description
0  Splitting Up Together  ellen degeneres serf executive producer comedy...
1               The Trip  collage film image ambient dance sound jacques...
2   The Kids Are Alright  set ensemble comedy follows traditional irish ...
3               Breeders  breeder explores paradox experienced nearly pa...
4   Let the Right One In  life mark daughter eleanor forever changed yea...


## Building the inverted index

In [None]:
# inverted_index: term -> list of positional row numbers (0-based, in train_df
# order) of the training descriptions that contain the term.
inverted_index = defaultdict(list)
# document_frequencies: term -> number of training descriptions containing it.
document_frequencies = defaultdict(int)
total_documents = len(train_df)

for idx, description in enumerate(train_df['description']):
    # A set is used so each document is recorded at most once per term.
    unique_terms = set(description.split())
    for term in unique_terms:
        inverted_index[term].append(idx)
        document_frequencies[term] += 1

## Calculating TF-IDF scores

In [None]:
# Calculate TF-IDF scores for each document
def calculate_tf_idf(term, document, document_frequencies, total_documents):
    """Compute the TF-IDF weight of ``term`` within ``document``.

    The document is tokenized on whitespace before counting, so the term
    frequency is based on whole tokens and normalised by the number of tokens.
    Counting directly on the raw string would match substrings ("car" inside
    "scar") and divide by the number of characters.

    Args:
        term: The token to score.
        document: The document, either as a whitespace-separated string or as
            an iterable of tokens.
        document_frequencies: Mapping of term -> number of documents that
            contain it. Missing terms count as 0.
        total_documents: Number of documents in the collection.

    Returns:
        ``tf * idf`` where ``tf = count / number_of_tokens`` and
        ``idf = log(total_documents / (1 + df))``; ``0`` when the term does
        not occur in the document. The IDF is slightly negative for a term
        that occurs in every document because of the ``1 + df`` smoothing.
    """
    tokens = document.split() if isinstance(document, str) else list(document)
    term_frequency = tokens.count(term)
    if term_frequency == 0:
        return 0
    tf = term_frequency / len(tokens)
    # .get() avoids inserting unseen terms into a defaultdict and also works
    # with a plain dict.
    idf = math.log(total_documents / (1 + document_frequencies.get(term, 0)))
    return tf * idf

# One {term: tf-idf weight} dictionary per training description, in row order.
tf_idf_scores = [
    {
        term: calculate_tf_idf(term, description, document_frequencies, total_documents)
        for term in description.split()
    }
    for description in train_df['description']
]


## Showing the first 5 terms of the inverted index

In [None]:
# Sanity check: show the posting lists of the first five indexed terms
print("Inverted Index (first 5 terms):")
for term, doc_ids in list(inverted_index.items())[:5]:
    print(f"{term}: {doc_ids}")

Inverted Index (first 5 terms):
thug: [0, 196, 879, 1531, 1856, 2217, 2823, 3222, 3351, 3362, 3739, 4005, 4172, 4345, 4413, 4495, 4774, 5146, 5232, 5598, 5720, 6076]
johannesburg: [0, 3015, 6099]
want: [0, 5, 19, 29, 36, 48, 102, 138, 163, 173, 246, 288, 316, 345, 349, 419, 438, 527, 537, 638, 644, 667, 681, 700, 805, 833, 892, 934, 974, 976, 997, 1125, 1197, 1240, 1250, 1262, 1310, 1344, 1353, 1440, 1474, 1515, 1521, 1556, 1561, 1704, 1767, 1776, 1813, 1892, 1960, 1984, 1991, 1998, 2099, 2136, 2155, 2183, 2193, 2226, 2351, 2383, 2405, 2446, 2478, 2625, 2663, 2687, 2769, 2817, 2831, 2845, 2850, 2947, 2952, 2992, 3017, 3083, 3120, 3150, 3154, 3260, 3301, 3322, 3332, 3396, 3443, 3482, 3510, 3561, 3603, 3605, 3704, 3718, 3727, 3735, 3799, 3844, 3947, 4002, 4040, 4050, 4072, 4137, 4180, 4233, 4270, 4291, 4298, 4360, 4365, 4394, 4410, 4456, 4534, 4535, 4567, 4580, 4598, 4635, 4644, 4672, 4696, 4758, 4762, 4772, 4776, 4827, 4835, 4843, 4855, 4856, 4898, 4950, 4952, 4984, 4999, 5043, 5047, 50

## Showing TF-IDF scores

In [None]:
# Sanity check: show the term weights computed for the first training document
print("\nTF-IDF Scores (first document):", tf_idf_scores[0], sep="\n")


TF-IDF Scores (first document):
{'south': 0.013774338554758635, 'african': 0.017458641131847193, 'hoodlum': 0.022064270238557795, 'named': 0.011701959641926598, 'tsotsi': 0.02666989934526839, 'presley': 0.02250789612767581, 'chweneyagae': 0.02666989934526839, 'life': 0.005011602707592661, 'code': 0.019370149918902214, 'violence': 0.016477032451136776, 'gang': 0.013774338554758635, 'thug': 0.03711157918642247, 'prowl': 0.025322839185440607, 'street': 0.01306742571662354, 'johannesburg': 0.024367084791913095, 'day': 0.009767585939180739, 'night': 0.012484857984138953, 'attacking': 0.02362574408987253, 'fail': 0.02132292953651723, 'give': 0.013123263319003545, 'want': 0.011795026207467042, 'casually': 0.02666989934526839, 'shooting': 0.01997586937668944, 'woman': 0.008064344201459303, 'stealing': 0.020717210078730005, 'car': 0.02754867710951727, 'discovers': 0.012029636065320709, 'baby': 0.01591017026831032, 'back': 0.010079108063607681, 'seat': 0.021006284420888244, 'instead': 0.0164770

## Searching and ranking with cosine similarity

In [None]:
def preprocess_query(query):
    """Normalise a raw query with the same pipeline used for the descriptions.

    Args:
        query: Raw query text.

    Returns:
        The result of ``preprocess_text(query)``.
    """
    normalised_query = preprocess_text(query)
    return normalised_query

def calculate_query_tf_idf(query_terms, document_frequencies, total_documents):
    """Build the TF-IDF vector of a preprocessed query.

    The query is tokenized once and each distinct term is counted in a single
    pass, instead of re-splitting and re-counting the whole query for every
    token.

    Args:
        query_terms: Preprocessed query as a whitespace-separated string.
        document_frequencies: Mapping of term -> number of documents that
            contain it. Terms missing from the mapping count as 0.
        total_documents: Number of documents in the collection.

    Returns:
        Dict mapping each distinct query term (in first-occurrence order) to
        ``tf * idf`` with ``tf = count / number_of_query_tokens`` and
        ``idf = log(total_documents / (1 + df))``. Empty dict for an empty
        query.
    """
    tokens = query_terms.split()
    if not tokens:
        return {}

    # Single counting pass; dict insertion order keeps first-occurrence order.
    term_counts = {}
    for token in tokens:
        term_counts[token] = term_counts.get(token, 0) + 1

    token_total = len(tokens)
    query_tf_idf = {}
    for term, count in term_counts.items():
        tf = count / token_total
        idf = math.log(total_documents / (1 + document_frequencies.get(term, 0)))
        query_tf_idf[term] = tf * idf
    return query_tf_idf

# Function for cosine similarity
def calculate_cosine_similarity(doc_vector, query_vector):
    """Return the cosine similarity between two sparse term-weight vectors.

    Args:
        doc_vector: Dict of term -> weight for the document.
        query_vector: Dict of term -> weight for the query.

    Returns:
        The dot product over the query's terms divided by the product of the
        two Euclidean norms, or ``0.0`` when either vector has zero norm.
    """
    doc_norm = np.sqrt(sum(weight ** 2 for weight in doc_vector.values()))
    query_norm = np.sqrt(sum(weight ** 2 for weight in query_vector.values()))

    # A zero-length vector has no direction, so similarity is defined as 0.
    if not (doc_norm != 0 and query_norm != 0):
        return 0.0

    # Only terms present in the query can contribute to the dot product.
    dot_product = sum(
        doc_vector.get(term, 0) * query_weight
        for term, query_weight in query_vector.items()
    )
    return dot_product / (doc_norm * query_norm)

# Function to search top matches
def search(query, inverted_index, tf_idf_scores, document_frequencies, total_documents):
    """Rank indexed documents by cosine similarity to ``query``.

    Each candidate document is scored exactly once. Adding the full cosine
    similarity once per matching query-term occurrence would multiply a
    document's score by its number of matches and push scores above 1.

    Args:
        query: Raw query text; it is preprocessed with ``preprocess_query``.
        inverted_index: Mapping of term -> list of document ids containing it.
        tf_idf_scores: Sequence indexed by document id, each item a dict of
            term -> TF-IDF weight.
        document_frequencies: Mapping of term -> document frequency.
        total_documents: Number of documents in the collection.

    Returns:
        List of ``(doc_id, cosine_similarity)`` tuples for every document that
        shares at least one term with the query, sorted by descending score.
    """
    query_terms = preprocess_query(query)
    query_tf_idf = calculate_query_tf_idf(query_terms, document_frequencies, total_documents)

    # Collect each candidate once; dict.fromkeys keeps first-seen order so
    # that ties are ordered deterministically by the stable sort below.
    candidate_ids = dict.fromkeys(
        doc_id
        for term in query_tf_idf
        for doc_id in inverted_index.get(term, [])
    )

    document_scores = {
        doc_id: calculate_cosine_similarity(tf_idf_scores[doc_id], query_tf_idf)
        for doc_id in candidate_ids
    }

    ranked_docs = sorted(document_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_docs






## Finding the top 3 movies / shows

In [None]:
# Find top 3 matches for each description in the test set
top_matches = [
    search(test_description, inverted_index, tf_idf_scores, document_frequencies, total_documents)[:3]
    for test_description in test_df['description']
]

# Print top 3 matches for each test description
for i, matches in enumerate(top_matches):
    # Each description is already a plain string, so it is printed directly:
    # ' '.join(<str>) iterates characters and would emit "e l l e n ...".
    print(f"Test description {i+1}: {test_df['description'].iloc[i]}\n")
    for rank, (doc_id, score) in enumerate(matches):
        print(f"Rank {rank + 1}: Movie/Show ID: {doc_id}, Score: {score}")
        # doc_id is the positional row number used when the index was built.
        print(f"Description: {train_df['description'].iloc[doc_id]}\n")



Test description 1: e l l e n   d e g e n e r e s   s e r f   e x e c u t i v e   p r o d u c e r   c o m e d y   b a s e d   d a n i s h   s e r i e s   n a m e   l e n a   m a r t i n   c e r t a i n   e n o u g h   m a r r i a g e   m a k e   o f f i c i a l   g o   c o m p l i c a t e d   u n t a n g l i n g   i n v o l v e d   f i l i n g   d i v o r c e   l a s t   t h i n g   e x p e c t e d   b r o u g h t   b a c k   t o g e t h e r   e x p e r i e n c e   f i n d   r e l a t i o n s h i p   s t r a n g e l y   r e i g n i t e d   e x p e r i e n c e   t o g e t h e r   n a v i g a t e   e v o l v i n g   r e l a t i o n s h i p   o p e n   m i n d   n e w l y   r e o p e n e d   h e a r t

Rank 1: Movie/Show ID: 1802, Score: 1.4449450199859808
Description: f r a n   d r e s c h e r   s t a r   c o m e d y   s e r i e s   b a s e d   r e a l   l i f e   e x p e r i e n c e   d r e s c h e r   p l a y   l o s   a n g e l e s   f l o r i s t   f r a n   w h o s e   m a r r i a g