In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def process_links_and_extract_features(links):
    features = []
    for link in links:
        try:
            # Add your image extraction logic here
            print(f"Processing link: {link}")
            # Example: feature = extract_feature_from_link(link)
            feature = np.random.rand(4096)  # Placeholder for actual feature extraction
            features.append(feature)
        except Exception as e:
            print(f"Invalid or failed extraction for: {link}, Error: {e}")
            # Append a zero vector to ensure consistent shape
            features.append(np.zeros(4096))
    return np.array(features)

# === MAIN PIPELINE ===
# Step 1: Load dataset
CSV_FILE = 'movie_data_chunk_1.csv'
IMAGE_COLUMN = 'Poster'
OUTPUT_FEATURES_FILE = 'output_features.npy'
OUTPUT_CSV_FILE = 'output_results.csv'

df = pd.read_csv(CSV_FILE)
image_links = df[IMAGE_COLUMN].tolist()

# Step 2: Extract features
print("Extracting features...")
features = process_links_and_extract_features(image_links)

# Step 3: Save features for future use
np.save(OUTPUT_FEATURES_FILE, features)
print(f"Features saved to {OUTPUT_FEATURES_FILE}")

# Step 4: Compute pairwise similarity
print("Computing similarity...")
similarity_matrix = cosine_similarity(features)

# Step 5: Save similarity results
print("Saving results...")
results = []
for i in range(len(image_links)):
    similar_indices = np.argsort(-similarity_matrix[i])[:5]  # Top 5 similar posters
    similar_movies = [{"movie_id": idx, "similarity": similarity_matrix[i, idx]} for idx in similar_indices]
    results.append(similar_movies)

# Add results to the dataframe
df["similar_movies"] = results
df.to_csv(OUTPUT_CSV_FILE, index=False)

print("Process completed! Results saved to:", OUTPUT_CSV_FILE)

Extracting features...
Processing link: https://image.tmdb.org/t/p/w500/arSooSSFyjTfh9DU1jy8MnoxrCz.jpg
Processing link: https://image.tmdb.org/t/p/w500/69FC9DA5vuIiVsxdTmXAxj7v9O5.jpg
Processing link: https://image.tmdb.org/t/p/w500/SNEoUInCa5fAgwuEBMIMBGvkkh.jpg
Processing link: https://image.tmdb.org/t/p/w500/JHgkTjcrA8R0FI3RLK9BWm3QwS.jpg
Processing link: https://image.tmdb.org/t/p/w500/vQsMhL00lUOIhUa24xVOgVPjwVs.jpg
Processing link: nan
Processing link: https://image.tmdb.org/t/p/w500/l52k9R7FbTjAlboh4AQ8Tw5DOeZ.jpg
Processing link: https://image.tmdb.org/t/p/w500/4bKlTeOUr5AKrLky8mwWvlQqyVd.jpg
Processing link: nan
Processing link: https://image.tmdb.org/t/p/w500/k8fmhzkofNo6p2JNDjJdaFx3Eu.jpg
Processing link: https://image.tmdb.org/t/p/w500/cUT2d0Czgh3HjGrKL4J8Wo5VMw9.jpg
Processing link: https://image.tmdb.org/t/p/w500/sHweRz8RKrudkpNWru0rTJmEIAO.jpg
Processing link: https://image.tmdb.org/t/p/w500/hiBRNl2AnfbY8llQQP8KIB9kLJ0.jpg
Processing link: https://image.tmdb.org/t/p/w50

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def process_links_and_extract_features(links):
    features = []
    for link in links:
        try:
            # Add your image extraction logic here
            print(f"Processing link: {link}")
            # Example: feature = extract_feature_from_link(link)
            feature = np.random.rand(4096)  # Placeholder for actual feature extraction
            features.append(feature)
        except Exception as e:
            print(f"Invalid or failed extraction for: {link}, Error: {e}")
            # Append a zero vector to ensure consistent shape
            features.append(np.zeros(4096))
    return np.array(features)

# === MAIN PIPELINE ===
# Step 1: Load dataset
CSV_FILE = 'movie_data_chunk_1.csv'
IMAGE_COLUMN = 'Poster'
MOVIE_NAME_COLUMN = 'Title'
OUTPUT_FEATURES_FILE = 'output_features.npy'
OUTPUT_CSV_FILE = 'output_results.csv'

df = pd.read_csv(CSV_FILE)
image_links = df[IMAGE_COLUMN].tolist()
movie_names = df[MOVIE_NAME_COLUMN].tolist()

# Step 2: Extract features
print("Extracting features...")
features = process_links_and_extract_features(image_links)

# Step 3: Save features for future use
np.save(OUTPUT_FEATURES_FILE, features)
print(f"Features saved to {OUTPUT_FEATURES_FILE}")

# Step 4: Compute pairwise similarity
print("Computing similarity...")
similarity_matrix = cosine_similarity(features)

# Step 5: Save similarity results
print("Saving results...")
results = []
for i in range(len(image_links)):
    similar_indices = np.argsort(-similarity_matrix[i])[:5]  # Top 5 similar posters
    similar_movies = [{"movie_name": movie_names[idx], "similarity": similarity_matrix[i, idx]} for idx in similar_indices]
    results.append(similar_movies)

# Add results to the dataframe
df["similar_movies"] = results
df.to_csv(OUTPUT_CSV_FILE, index=False)

print("Process completed! Results saved to:", OUTPUT_CSV_FILE)

Extracting features...
Processing link: https://image.tmdb.org/t/p/w500/arSooSSFyjTfh9DU1jy8MnoxrCz.jpg
Processing link: https://image.tmdb.org/t/p/w500/69FC9DA5vuIiVsxdTmXAxj7v9O5.jpg
Processing link: https://image.tmdb.org/t/p/w500/SNEoUInCa5fAgwuEBMIMBGvkkh.jpg
Processing link: https://image.tmdb.org/t/p/w500/JHgkTjcrA8R0FI3RLK9BWm3QwS.jpg
Processing link: https://image.tmdb.org/t/p/w500/vQsMhL00lUOIhUa24xVOgVPjwVs.jpg
Processing link: nan
Processing link: https://image.tmdb.org/t/p/w500/l52k9R7FbTjAlboh4AQ8Tw5DOeZ.jpg
Processing link: https://image.tmdb.org/t/p/w500/4bKlTeOUr5AKrLky8mwWvlQqyVd.jpg
Processing link: nan
Processing link: https://image.tmdb.org/t/p/w500/k8fmhzkofNo6p2JNDjJdaFx3Eu.jpg
Processing link: https://image.tmdb.org/t/p/w500/cUT2d0Czgh3HjGrKL4J8Wo5VMw9.jpg
Processing link: https://image.tmdb.org/t/p/w500/sHweRz8RKrudkpNWru0rTJmEIAO.jpg
Processing link: https://image.tmdb.org/t/p/w500/hiBRNl2AnfbY8llQQP8KIB9kLJ0.jpg
Processing link: https://image.tmdb.org/t/p/w50