In [46]:
import pandas as pd
import numpy as np
from indicnlp.tokenize import indic_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Load data
df = pd.read_csv("tamil_movie_reviews_train.csv")

# Preprocessing improvements
def enhanced_preprocessing(df):
    # Drop unnecessary columns
    df = df.drop(columns=['ReviewId'], errors='ignore')
    def extract_movie_name(text):
        """
        Enhanced movie name extraction that properly handles NEWLINE issues
        """
        # First clean the text by removing problematic patterns
        text = text.replace("<NEWLINE>", " ").replace("\n", " ").replace("\r", " ")
        text = text.replace("Read", " ").replace("\n", " ").replace("\r", " ")
        text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
        
        # Common patterns for Tamil movie names
        patterns = [
            r'[“"]([^"”]+)[”"]\s*படத்த(ின்|ை)',  # "Movie" படத்தின்
            r'படம(்|்)\s*[“"]([^"”]+)[”"]',      # படம் "Movie"
            r'[“"]([^"”]+)[”"]\s*திரைப்படம்',    # "Movie" திரைப்படம்
            r'(?:திரைப்பட|பட)மான\s*[“"]([^"”]+)[”"]',
            r'[“"]([^"”]{4,}?)[”"](?:\s*என(?:ப்படும்)?\s*திரைப்படம்)'
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                # Get the first non-None group
                movie_name = next((g for g in match.groups() if g), None)
                if movie_name and movie_name != "NEWLINE":
                    return movie_name
        
        # Fallback: Find capitalized Tamil words (potential proper nouns)
        words = indic_tokenize.trivial_tokenize(text, lang='ta')
        proper_nouns = [word for word in words if len(word) > 3 and word[0].isupper()]
        
        if proper_nouns:
            # Get most frequent proper noun that isn't "NEWLINE"
            for word, _ in Counter(proper_nouns).most_common():
                if word != "NEWLINE":
                    return word
        
        return "Unknown"
    
    df['MovieName'] = df['ReviewInTamil'].apply(extract_movie_name)
    

    # Enhanced tokenization and stopword removal
    tamil_stopwords = set(["மற்றும்", "ஒரு", "என்று", "போன்ற", "இது", "அது", 
                          "இவர்", "அவர்", "என்", "உன்", "தான்", "ஆனால்"])
    
    def clean_text(text):
        # Remove special characters but keep Tamil punctuation
        text = re.sub(r'[^\w\s\u0B80-\u0BFF]', ' ', text)
        tokens = indic_tokenize.trivial_tokenize(text, lang='ta')
        tokens = [token for token in tokens if token not in tamil_stopwords and len(token) > 2]
        return ' '.join(tokens)
    
    df['processed_text'] = df['ReviewInTamil'].apply(clean_text)
    return df

df = enhanced_preprocessing(df)

vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=3,
    max_df=0.85,
    ngram_range=(1, 2) ) # Include bigrams

tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

def enhanced_similarity(matrix):
    # Apply sigmoid transformation to emphasize higher similarities
    return 1 / (1 + np.exp(-matrix * 5))

similarity_matrix = enhanced_similarity(cosine_similarity(tfidf_matrix))

def recommend_movies(movie_name, top_n=5, similarity_threshold=0.3):
    """Enhanced recommendation with threshold filtering"""
    if movie_name not in df['MovieName'].values:
        return "Movie not found in dataset"
    
    movie_indices = df.index[df['MovieName'] == movie_name].tolist()
    all_recommendations = []
    
    for idx in movie_indices:
        similar_scores = list(enumerate(similarity_matrix[idx]))
        sorted_movies = sorted(similar_scores, key=lambda x: x[1], reverse=True)
        
        # Filter by threshold and exclude self
        filtered = [(i, score) for i, score in sorted_movies 
                   if i != idx and score > similarity_threshold][:top_n]
        
        for i, score in filtered:
            all_recommendations.append({
                'movie': df.iloc[i]['MovieName'],
                'similarity': score,
                'review': df.iloc[i]['ReviewInTamil'][:100] + '...'
            })
    
    recommendations = pd.DataFrame(all_recommendations)
    if not recommendations.empty:
        recommendations = recommendations.sort_values('similarity', ascending=False)
        return recommendations.drop_duplicates('movie').head(top_n)
    return "No similar movies found above similarity threshold"

print(recommend_movies("ரெமோ ", top_n=3))

         movie  similarity                                             review
0         REMO    0.958048  ரெமோ பூஜை போட்ட அன்றே எதிர்ப்பார்ப்பு விண்ணை ம...
1  வேலைக்காரன்    0.906164  கரு: தன் குப்பத்து மக்களை தவறாக வழிநடத்தும் தா...
2         Work    0.905127  Read Velaikkaran Review in English<NEWLINE>தமி...
