In [1]:
import pandas as pd
import numpy as np
import re
from indicnlp.tokenize import indic_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import joblib

# ========================
# 1. DATA PREPROCESSING
# ========================

def enhanced_preprocessing(df):
    """Clean and prepare the movie review data"""
    
    # Basic cleaning
    df = df.drop(columns=['ReviewId'], errors='ignore')
    
    # Enhanced movie name extraction
    def extract_movie_name(text):
        text = text.replace("<NEWLINE>", " ").replace("\n", " ").replace("\r", " ")
        text = re.sub(r'\s+', ' ', text).strip()
        
        patterns = [
            r'[“"]([^"”]+)[”"]\s*படத்த(ின்|ை)',
            r'படம(்|்)\s*[“"]([^"”]+)[”"]',
            r'[“"]([^"”]+)[”"]\s*திரைப்படம்',
            r'(?:திரைப்பட|பட)மான\s*[“"]([^"”]+)[”"]',
            r'[“"]([^"”]{4,}?)[”"](?:\s*என(?:ப்படும்)?\s*திரைப்படம்)'
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                movie_name = next((g for g in match.groups() if g), None)
                if movie_name and movie_name != "NEWLINE":
                    return movie_name
        
        words = indic_tokenize.trivial_tokenize(text, lang='ta')
        proper_nouns = [word for word in words if len(word) > 3 and word[0].isupper()]
        if proper_nouns:
            for word, _ in Counter(proper_nouns).most_common():
                if word != "NEWLINE":
                    return word
        return "Unknown"
    
    df['MovieName'] = df['ReviewInTamil'].apply(extract_movie_name)
    
    # Text cleaning
    tamil_stopwords = set(["மற்றும்", "ஒரு", "என்று", "போன்ற", "இது", "அது"])
    
    def clean_text(text):
        text = re.sub(r'[^\w\s\u0B80-\u0BFF]', ' ', text)
        tokens = indic_tokenize.trivial_tokenize(text, lang='ta')
        tokens = [token for token in tokens if token not in tamil_stopwords and len(token) > 2]
        return ' '.join(tokens)
    
    df['processed_text'] = df['ReviewInTamil'].apply(clean_text)
    return df

# ========================
# 2. TRAINING PHASE
# ========================

# Load and preprocess training data
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
train_df = enhanced_preprocessing(train_df)

# Train TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 2))
train_tfidf = vectorizer.fit_transform(train_df['processed_text'])

# Compute similarity matrix
train_similarity = cosine_similarity(train_tfidf)

# Save trained artifacts
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(train_similarity, 'train_similarity.joblib')

# ========================
# 3. TESTING PHASE
# ========================

# Load test data
test_df = pd.read_csv("tamil_movie_reviews_test.csv")
test_df = enhanced_preprocessing(test_df)

# Load saved models
vectorizer = joblib.load('tfidf_vectorizer.joblib')
train_similarity = joblib.load('train_similarity.joblib')

# Transform test data
test_tfidf = vectorizer.transform(test_df['processed_text'])

# Compute test-train similarity
test_similarity = cosine_similarity(test_tfidf, train_tfidf)

# Recommendation function
def get_recommendations(test_index, top_n=3, min_score=0.5):
    """Get recommendations for a test review"""
    scores = list(enumerate(test_similarity[test_index]))
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    recommendations = []
    for train_idx, score in sorted_scores:
        if score < min_score or test_index == train_idx:
            continue
        if len(recommendations) >= top_n:
            break
        recommendations.append({
            'movie': train_df.iloc[train_idx]['MovieName'],
            'score': score,
            'review_snippet': train_df.iloc[train_idx]['processed_text'][:100] + '...'
        })
    
    return recommendations

# ========================
# 4. EVALUATION
# ========================

# Generate recommendations for all test reviews
results = []
for i in range(len(test_df)):
    rec = get_recommendations(i)
    results.append({
        'test_movie': test_df.iloc[i]['MovieName'],
        'test_review': test_df.iloc[i]['processed_text'][:100] + '...',
        'recommendations': rec
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

del results_df['test_review']
# Save results
results_df.to_csv('movie_recommendations_results.csv', index=False)

# Print sample results
print("\nSample Recommendations:")
for i, row in results_df.head(3).iterrows():
    print(f"\nTest Movie: {row['test_movie']}")
    print(f"Review: {row['test_review']}")
    print("Top Recommendations:")
    for rec in row['recommendations']:
        print(f"- {rec['movie']} (score: {rec['score']:.3f})")


Sample Recommendations:

Test Movie: Unknown


KeyError: 'test_review'