AP22110010128 | Krishna Sharma

In [2]:
import math
import re
from collections import defaultdict, Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Toy corpus of eight short sentences. The list position of each string is used
# as its document id by the retrieval and clustering cells further down.
documents = [
    "machine learning algorithms use neural networks to process data and make predictions",
    "artificial intelligence involves deep learning models trained on large datasets", 
    "computer vision algorithms analyze images using convolutional neural networks",
    "natural language processing uses transformers and neural networks for text analysis",
    "data science combines statistics machine learning and programming to extract insights",
    "deep learning models require extensive training data and computational resources",
    "supervised learning algorithms learn from labeled training data to make predictions",
    "unsupervised learning finds patterns in data without labeled examples"
]

Binary Independence Model

In [4]:
# Free-text query, reduced to its set of distinct lower-cased word tokens.
query = "neural networks deep learning"
query_terms = {token for token in re.findall(r'\b\w+\b', query.lower())}

# Inverted index: term -> ids of the documents that contain it at least once.
inverted_index = defaultdict(set)
for doc_id, doc in enumerate(documents):
    # Posting lists are sets, so a token repeated inside one document is
    # recorded only once.
    for term in re.findall(r'\b\w+\b', doc.lower()):
        inverted_index[term].add(doc_id)

In [5]:
# Score every document against the query with the Binary Independence Model.
N = len(documents)

# Per-term weight log((N - n_t + 0.5) / (n_t + 0.5)), where n_t is the number
# of documents containing the term. The weight depends only on the term, so it
# is computed once here instead of once per (document, term) pair.
#
# The document frequency is held in `doc_freq`, not `df`: the BM25 section
# binds `df` to a dict, and rebinding it to an int here would break the BM25
# scoring cell whenever this cell is re-run after it.
#
# NOTE: the weight is negative for any term that occurs in more than half of
# the documents, so matching such a term lowers a document's score.
term_weights = {}
for term in query_terms:
    postings = inverted_index.get(term)
    if postings:
        doc_freq = len(postings)
        term_weights[term] = math.log((N - doc_freq + 0.5) / (doc_freq + 0.5))

# A document's score is the sum of the weights of the query terms it contains;
# a document matching no query term keeps a score of 0.
bim_scores = {}
for doc_id in range(N):
    bim_scores[doc_id] = sum(
        weight
        for term, weight in term_weights.items()
        if doc_id in inverted_index[term]
    )

In [6]:
# (doc_id, score) pairs ordered from highest to lowest BIM score. The sort is
# stable, so documents with equal scores stay in document-id order.
bim_ranked = [
    (doc_id, bim_scores[doc_id])
    for doc_id in sorted(bim_scores, key=bim_scores.get, reverse=True)
]

print(f"Query: '{query}'\n")
# Show only the three best-scoring documents.
for rank, (doc_id, score) in zip(range(1, 4), bim_ranked):
    print(
        f"Rank {rank} (Score: {score:.4f}):",
        f"  Document {doc_id}: {documents[doc_id]}\n",
        sep="\n",
    )

Query: 'neural networks deep learning'

Rank 1 (Score: 0.9040):
  Document 2: computer vision algorithms analyze images using convolutional neural networks

Rank 2 (Score: 0.9040):
  Document 3: natural language processing uses transformers and neural networks for text analysis

Rank 3 (Score: 0.0000):
  Document 1: artificial intelligence involves deep learning models trained on large datasets



Okapi BM25

In [7]:
# Query for the BM25 demo. The terms stay in a list (not a set), so a word
# repeated in the query would be scored once per occurrence.
query2 = "data analysis machine learning"
query2_terms = re.findall(r'\b\w+\b', query2.lower())

# BM25 parameters: k1 appears in the term-frequency part of the score and b in
# the document-length part.
k1, b = 1.5, 0.75

# Lower-cased token list for every document, in corpus order.
doc_terms = []
for text in documents:
    doc_terms.append(re.findall(r'\b\w+\b', text.lower()))

# Mean document length in tokens (N is the corpus size set in the BIM section).
avg_dl = sum(map(len, doc_terms)) / N

# Document frequency: the number of documents in which each term occurs.
df = defaultdict(
    int,
    Counter(term for tokens in doc_terms for term in set(tokens)),
)

In [8]:
# BM25 score of every document for the terms of the second query.
bm25_scores = {}
for doc_id, tokens in enumerate(doc_terms):
    counts = Counter(tokens)
    # Length normalisation: the same value for every term of this document.
    length_norm = k1 * (1 - b + b * (len(tokens) / avg_dl))

    total = 0
    for term in query2_terms:
        tf = counts.get(term)
        if tf is None:
            # Query term absent from this document: it contributes nothing.
            continue

        df_term = df[term]
        # The "+ 1.0" inside the log keeps the idf positive even for terms
        # that occur in most documents.
        idf = math.log((N - df_term + 0.5) / (df_term + 0.5) + 1.0)
        total += idf * ((tf * (k1 + 1)) / (tf + length_norm))

    bm25_scores[doc_id] = total

In [9]:
# (doc_id, score) pairs ordered from highest to lowest BM25 score. The sort is
# stable, so documents with equal scores stay in document-id order.
bm25_ranked = [
    (doc_id, bm25_scores[doc_id])
    for doc_id in sorted(bm25_scores, key=bm25_scores.get, reverse=True)
]

print(f"Query: '{query2}'\n")
# Show only the three best-scoring documents.
for rank, (doc_id, score) in zip(range(1, 4), bm25_ranked):
    print(
        f"Rank {rank} (Score: {score:.4f}):",
        f"  Document {doc_id}: {documents[doc_id]}\n",
        sep="\n",
    )

Query: 'data analysis machine learning'

Rank 1 (Score: 2.0434):
  Document 4: data science combines statistics machine learning and programming to extract insights

Rank 2 (Score: 1.9606):
  Document 0: machine learning algorithms use neural networks to process data and make predictions

Rank 3 (Score: 1.7445):
  Document 3: natural language processing uses transformers and neural networks for text analysis



Text Classification

In [10]:
# Labelled example sentences for the classifier. They come in three consecutive
# groups of five, matching the class ids assigned in `labels` in the next cell.
texts = [
    "machine learning algorithms use neural networks",
    "deep learning models trained on datasets",
    "neural networks process data efficiently",
    "convolutional networks analyze images",
    "supervised learning uses labeled data",
    "python programming language for data science",
    "javascript web development framework",
    "java object oriented programming",
    "c++ systems programming language",
    "ruby on rails web application",
    "data visualization with matplotlib",
    "statistical analysis using python",
    "database query optimization techniques",
    "sql joins and aggregations",
    "nosql document databases"
]

In [11]:
# Display name of each class; the list index is the class id.
label_names = ['Machine Learning', 'Programming', 'Data/Database']
# One class id per entry of `texts`: five examples for each of the three classes.
labels = [0] * 5 + [1] * 5 + [2] * 5

# Hold out 30% of the examples for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.3,
)

# Fit the TF-IDF vocabulary on the training texts only, then apply the same
# fitted vectorizer to the held-out texts.
vectorizer = TfidfVectorizer(max_features=50)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [12]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# evaluate it on the held-out split.
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

y_pred = classifier.predict(X_test_vec)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print(
    classification_report(
        y_test,
        y_pred,
        # List every class id explicitly: with a test split this small a class
        # can be missing from it, and target_names must line up one-to-one
        # with the labels being reported.
        labels=list(range(len(label_names))),
        target_names=label_names,
        # A class that is never predicted has an undefined precision; report
        # it as 0 explicitly instead of through an UndefinedMetricWarning.
        zero_division=0,
    )
)

Accuracy: 0.4000

                  precision    recall  f1-score   support

Machine Learning       0.25      1.00      0.40         1
     Programming       1.00      0.50      0.67         2
   Data/Database       0.00      0.00      0.00         2

        accuracy                           0.40         5
       macro avg       0.42      0.50      0.36         5
    weighted avg       0.45      0.40      0.35         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# Classify a few unseen sentences with the trained model.
test_docs = [
    "neural networks for deep learning",
    "python programming tutorial",
    "database management system"
]

# Reuse the already-fitted vectorizer so the features share the training vocabulary.
test_vec = vectorizer.transform(test_docs)
predictions = classifier.predict(test_vec)

# One output line per sentence, mapping the predicted class id to its name.
prediction_lines = [
    f"  '{doc}' -> {label_names[pred]}"
    for doc, pred in zip(test_docs, predictions)
]

print("\nPredictions on new documents:")
print("\n".join(prediction_lines))


Predictions on new documents:
  'neural networks for deep learning' -> Machine Learning
  'python programming tutorial' -> Programming
  'database management system' -> Data/Database


Text Clustering

In [16]:
# Number of groups requested from k-means.
n_clusters = 3

# TF-IDF matrix of the corpus, built with a vectorizer separate from the
# classifier's.
vectorizer2 = TfidfVectorizer(max_features=50)
X = vectorizer2.fit_transform(documents)

# Fixed seed and a fixed number of initialisations keep the clustering reproducible.
kmeans = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=42,
)
kmeans.fit(X)
# Cluster id assigned to each document, in corpus order.
clusters = kmeans.labels_

print(f"Number of clusters: {n_clusters}")

Number of clusters: 3


In [18]:
# Group the documents by the cluster id k-means assigned to them.
cluster_docs = defaultdict(list)
for doc_id, (cluster_id, text) in enumerate(zip(clusters, documents)):
    cluster_docs[cluster_id].append((doc_id, text))

# Print the clusters in ascending id order, one member document per line.
for cluster_id in sorted(cluster_docs):
    print(f"Cluster {cluster_id}:")
    members = cluster_docs[cluster_id]
    for doc_id, doc in members:
        print(f"  Doc {doc_id}: {doc}")

Cluster 0:
  Doc 2: computer vision algorithms analyze images using convolutional neural networks
  Doc 3: natural language processing uses transformers and neural networks for text analysis
Cluster 1:
  Doc 0: machine learning algorithms use neural networks to process data and make predictions
  Doc 4: data science combines statistics machine learning and programming to extract insights
  Doc 6: supervised learning algorithms learn from labeled training data to make predictions
  Doc 7: unsupervised learning finds patterns in data without labeled examples
Cluster 2:
  Doc 1: artificial intelligence involves deep learning models trained on large datasets
  Doc 5: deep learning models require extensive training data and computational resources


In [19]:
# Vocabulary of the clustering vectorizer, indexed by feature column.
terms = vectorizer2.get_feature_names_out()
# For each centroid, the feature indices sorted from largest to smallest weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top terms per cluster:")
for cluster_id, ranked_features in enumerate(order_centroids[:n_clusters]):
    # Keep the five highest-weighted terms of this centroid.
    top_terms = [terms[ind] for ind in ranked_features[:5]]
    print(f"  Cluster {cluster_id}: {', '.join(top_terms)}")

Top terms per cluster:
  Cluster 0: neural, networks, convolutional, computer, analyze
  Cluster 1: data, to, learning, labeled, make
  Cluster 2: deep, models, computational, extensive, require


AP22110010128 | Krishna Sharma