In [1]:
import pandas as pd
# Directory holding the MovieLens CSV files. Every path below is derived from
# this single constant, so relocating the dataset means editing one line.
# The leading '~' is left unexpanded in these strings.
DATA_DIR = '~/Course_Python/data_splitting/Movie-Recommendation-System-ML/archive'

movies_path = f'{DATA_DIR}/movie.csv'
ratings_path = f'{DATA_DIR}/rating.csv'
tags_path = f'{DATA_DIR}/tag.csv'
links_path = f'{DATA_DIR}/link.csv'
genome_tags_path = f'{DATA_DIR}/genome_tags.csv'
genome_scores_path = f'{DATA_DIR}/genome_scores.csv'

In [2]:
# Load the movie catalogue (movieId, title, genres) and preview its first five rows.
movies = pd.read_csv(movies_path)
print(movies.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [6]:
# Per-user ratings table: userId, movieId, rating, timestamp.
ratings = pd.read_csv(ratings_path)
print(ratings.head(5))

# Attach each movie's title and genres to its rating rows.
# Inner join on 'movieId': a rating whose movie is absent from `movies` is dropped.
df = ratings.merge(movies, how='inner', on='movieId')

# Preview the merged frame.
print(df.head(5))

   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40
   userId  movieId  rating            timestamp  \
0       1        2     3.5  2005-04-02 23:53:47   
1       1       29     3.5  2005-04-02 23:31:16   
2       1       32     3.5  2005-04-02 23:33:39   
3       1       47     3.5  2005-04-02 23:32:07   
4       1       50     3.5  2005-04-02 23:29:40   

                                               title  \
0                                     Jumanji (1995)   
1  City of Lost Children, The (Cité des enfants p...   
2          Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
3                        Seven (a.k.a. Se7en) (1995)   
4                         Usual Suspects, The (1995)   

                                   genres  
0              Adventure

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Text feature: the movie title attached to each rating row of the merged frame.
reviews = df['title']  # Or you could use df['genres']

# Binary label: 1 for positive (rating >= 3.5), 0 for negative.
# The label is read from the SAME merged frame as the text so that every title
# is paired with its own rating. Reading it from `ratings` instead only lines
# up if the inner merge dropped and reordered no rows; otherwise titles would
# be silently matched with the wrong labels.
sentiments = (df['rating'] >= 3.5).astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(reviews, sentiments, test_size=0.2, random_state=24)

# Convert text into numerical features using TF-IDF over 1- to 3-word n-grams.
# The vocabulary is learned from the training split only, then applied to the test split.
tf_idf = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = tf_idf.fit_transform(X_train)
X_test_tfidf = tf_idf.transform(X_test)

# Train Multinomial Naive Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_tfidf, y_train)

# Make predictions on the held-out rows
preds = mnb_model.predict(X_test_tfidf)

# Evaluate the model on the held-out rows
print(f"Accuracy: {accuracy_score(y_test, preds):.3f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, preds))

# Optionally, train on the entire dataset.
# NOTE: this refits `tf_idf` and `mnb_model` in place, so after this cell both
# objects are fitted on ALL rows (and on titles, not genres). The score printed
# below is measured on the very rows the model was fitted on, i.e. it is a
# training accuracy and not an estimate of performance on unseen data.
X_tfidf = tf_idf.fit_transform(reviews)
mnb_model.fit(X_tfidf, sentiments)
preds_all = mnb_model.predict(X_tfidf)
print(f"Accuracy on all data: {accuracy_score(sentiments, preds_all):.3f}")
print("Confusion Matrix (all data):")
print(confusion_matrix(sentiments, preds_all))


Accuracy: 0.687
Confusion Matrix:
[[ 894859  664110]
 [ 587375 1853709]]
Accuracy on all data: 0.688
Confusion Matrix (all data):
[[4491543 3313154]
 [2924195 9271371]]


In [8]:
def recommend_by_model(genre_keyword, model, tfidf_vectorizer, movies_df, top_n=10, text_column='genres'):
    """Return up to ``top_n`` movies of a genre that ``model`` predicts as positive.

    Parameters
    ----------
    genre_keyword : str
        Pattern matched case-insensitively against the ``genres`` column via
        ``Series.str.contains`` (so ``"Drama|Thriller"`` matches either genre).
    model : object
        Fitted classifier exposing ``predict``; a prediction of 1 means positive.
    tfidf_vectorizer : object
        Fitted vectorizer exposing ``transform``.
    movies_df : pandas.DataFrame
        Must contain ``movieId``, ``title`` and ``genres`` columns. Not modified.
    top_n : int, default 10
        Maximum number of rows to return.
    text_column : str, default 'genres'
        Column whose text is vectorized and scored. It should be the same kind
        of text the vectorizer/model were fitted on (e.g. ``'title'`` for a
        model trained on titles); the default keeps the historical behaviour.

    Returns
    -------
    pandas.DataFrame
        ``title`` and ``genres`` of the first ``top_n`` positively predicted
        movies, in the row order of ``movies_df``. The result is not ranked by
        score. Empty when no movie matches ``genre_keyword``.
    """
    # Keep only movies whose genre string matches; rows with missing genres never match.
    genre_mask = movies_df['genres'].str.contains(genre_keyword, case=False, na=False)

    # One row per movieId; copy so the caller's frame is never written to.
    candidates = movies_df[genre_mask].drop_duplicates(subset='movieId').copy()

    # Nothing matched: return an empty result instead of handing zero rows to
    # the model, which scikit-learn estimators reject with a ValueError.
    if candidates.empty:
        return candidates[['title', 'genres']]

    # Vectorize the chosen text column (missing text is treated as empty).
    features = tfidf_vectorizer.transform(candidates[text_column].fillna(''))

    # Predict sentiment (1 = positive, 0 = negative) and keep the positives.
    candidates['predicted_sentiment'] = model.predict(features)
    positives = candidates[candidates['predicted_sentiment'] == 1]

    return positives[['title', 'genres']].head(top_n)


In [11]:
# Ask for up to five Drama or Thriller titles, scored with the fitted
# vectorizer/model pair; `movies` is the original catalogue (movieId, title, genres).
recommendations = recommend_by_model(
    "Drama|Thriller",
    mnb_model,
    tf_idf,
    movies,
    top_n=5,
)

print(recommendations)

                            title                            genres
20              Get Shorty (1995)             Comedy|Crime|Thriller
46    Seven (a.k.a. Se7en) (1995)                  Mystery|Thriller
130                   Jade (1995)                          Thriller
162  Devil in a Blue Dress (1995)  Crime|Film-Noir|Mystery|Thriller
181           Mute Witness (1994)            Comedy|Horror|Thriller
