In [1]:
import pandas as pd


class MovieRatingProcessor:
    """Load movie ratings and titles, clean them, and rank recent favourites.

    The methods are meant to be called in order: ``load_ratings()``,
    ``load_movies()``, ``clean_ratings()`` and finally ``display_results()``
    (or ``get_top_movies()`` directly).
    """

    # Column order of the header-less rating files.
    _RATING_COLUMNS = ["movie_id", "user_id", "date", "rating"]
    # Column order of the DataFrame built by load_movies().
    _MOVIE_COLUMNS = ["movie_id", "movie_name", "movie_year"]

    def __init__(self, rating_paths, movies_path):
        """Store the input locations; nothing is read until load_* is called.

        Args:
            rating_paths: Iterable of paths to comma-separated rating files
                with rows ``movie_id,user_id,date,rating`` and no header.
            movies_path: Path to the titles file with rows
                ``movie_id,year,title`` (the title may itself contain commas).
        """
        self.rating_paths = rating_paths
        self.movies_path = movies_path
        # Populated by load_ratings(); None until then.
        self.ratings = None
        # Populated by load_movies(); None until then.
        self.movies = None

    def load_ratings(self):
        """Load every rating file and concatenate them into ``self.ratings``.

        The ``date`` column is parsed as ``%Y-%m-%d`` timestamps and the
        resulting frame gets a fresh 0..n-1 index.

        Raises:
            ValueError: If ``rating_paths`` is empty (raised by ``pd.concat``)
                or a date does not match the expected format.
        """
        frames = [
            pd.read_csv(path, delimiter=",", header=None, names=self._RATING_COLUMNS)
            .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'))
            for path in self.rating_paths
        ]
        self.ratings = pd.concat(frames, ignore_index=True)

    @staticmethod
    def _is_missing(field):
        """Return True if *field* is blank or the literal ``NULL`` (any case)."""
        text = field.strip()
        return not text or text.upper() == "NULL"

    def load_movies(self):
        """Parse the titles file into ``self.movies``.

        Each line is ``movie_id,year,title``. Only the first two commas are
        treated as separators because titles may contain commas themselves.
        Lines that are blank, have fewer than three fields, contain an
        empty/``NULL`` field, or have a non-integer id or year are skipped.

        The resulting frame always has the columns ``movie_id``,
        ``movie_name`` and ``movie_year``, even when no line was usable.
        """
        movies = []
        with open(self.movies_path, encoding='ISO-8859-1') as file:
            for line in file:
                # maxsplit=2 keeps commas inside the title intact.
                fields = line.strip().split(",", 2)
                if len(fields) < 3:
                    # Blank or truncated line: nothing to unpack safely.
                    continue
                if any(self._is_missing(field) for field in fields):
                    continue

                movie_id, prod_year, movie_name = fields
                try:
                    movie = {
                        "movie_id": int(movie_id),
                        "movie_name": movie_name,
                        "movie_year": int(prod_year)
                    }
                except ValueError:
                    # Non-numeric id or year: treat the row as malformed.
                    continue

                movies.append(movie)

        self.movies = pd.DataFrame(movies, columns=self._MOVIE_COLUMNS)
        if not movies:
            # An empty frame would otherwise carry object-typed key columns.
            self.movies = self.movies.astype({"movie_id": "int64", "movie_year": "int64"})

    def clean_ratings(self):
        """Drop users whose ratings are uniformly 5 or uniformly 1.

        A user is removed only if *every* one of their ratings equals 5, or
        every one equals 1. Users who mix 1s and 5s are kept. The surviving
        rows keep their original index labels.
        """
        scores = self.ratings['rating']
        users = self.ratings['user_id']

        # Built-in grouped reductions avoid calling Python once per user.
        only_fives = scores.eq(5).groupby(users).all()
        only_ones = scores.eq(1).groupby(users).all()
        flagged = (only_fives | only_ones).astype(bool)
        uniform_user_ids = flagged[flagged].index

        self.ratings = self.ratings[~users.isin(uniform_user_ids)]

    def get_recent_ratings(self, months=6):
        """Return the ratings from the last *months* calendar months.

        The window ends at the newest date present in ``self.ratings`` and
        its lower bound is inclusive.

        Args:
            months: Window length in calendar months (default 6).
        """
        latest_date = self.ratings['date'].max()
        window_start = latest_date - pd.DateOffset(months=months)
        return self.ratings[self.ratings['date'] >= window_start]

    def get_top_movies(self, top_n=10):
        """Return the most-rated recent movies together with their names.

        Movies are ranked by number of recent ratings, ties broken by the
        higher average rating. Movies without an entry in ``self.movies``
        are kept with a missing ``movie_name`` (left join).

        Args:
            top_n: Number of movies to return (default 10).

        Returns:
            DataFrame with columns ``movie_id``, ``average_rating``,
            ``rating_count`` and ``movie_name``.
        """
        recent_ratings = self.get_recent_ratings()
        movie_ratings = recent_ratings.groupby('movie_id').agg(
            average_rating=('rating', 'mean'),
            rating_count=('rating', 'count')
        ).reset_index()

        ranked = movie_ratings.sort_values(
            by=['rating_count', 'average_rating'], ascending=[False, False]
        )
        return pd.merge(
            ranked.head(top_n),
            self.movies[['movie_id', 'movie_name']],
            on='movie_id',
            how='left',
        )

    def display_results(self):
        """Print the head of the cleaned ratings and the top recommended movies."""
        print("Cleaned dataset:")
        print(self.ratings.head())

        recommended_movies_with_names = self.get_top_movies()
        print(recommended_movies_with_names[['movie_name', 'average_rating', 'rating_count']])


if __name__ == "__main__":
    # The titles file and the four rating files share one local folder.
    movies_path = 'C:\\Users\\PC\\Desktop\\turktelekombotcamp\\movie_titles.csv'
    rating_paths = [
        "C:\\Users\\PC\\Desktop\\turktelekombotcamp\\rating_1.txt",
        "C:\\Users\\PC\\Desktop\\turktelekombotcamp\\rating_2.txt",
        "C:\\Users\\PC\\Desktop\\turktelekombotcamp\\rating_3.txt",
        "C:\\Users\\PC\\Desktop\\turktelekombotcamp\\rating_4.txt",
    ]

    processor = MovieRatingProcessor(rating_paths=rating_paths, movies_path=movies_path)

    # Run the pipeline stages strictly in this order: load both inputs,
    # clean the ratings, then print the results.
    for _stage in (
        processor.load_ratings,
        processor.load_movies,
        processor.clean_ratings,
        processor.display_results,
    ):
        _stage()

Cleaned dataset:
   movie_id  user_id       date  rating
0         1  1488844 2005-09-06       3
1         1   822109 2005-05-13       5
2         1   885013 2005-10-19       4
3         1    30878 2005-12-26       4
4         1   823519 2004-05-03       3
               movie_name  average_rating  rating_count
0       National Treasure        3.739509        102107
1     Million Dollar Baby        4.157270         99752
2                   Hitch        3.840542         89503
3       Miss Congeniality        3.319703         72486
4  The Day After Tomorrow        3.405971         68766
5              Armageddon        3.654144         65374
6                   Crash        4.009928         64363
7             The Patriot        3.772275         64174
8             Constantine        3.397934         62739
9              Sister Act        3.156710         62370
