## DBS Movie DB Anomaly Detection

In [2]:
import urllib.parse
import warnings
from os.path import exists

import matplotlib.pyplot as plt
import mysql.connector as mysql
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly import tools
from sqlalchemy import create_engine
# Suppress pandas PerformanceWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
#Initialize database connection
# Connection settings are read from the first row of a CSV file with the
# columns Host, User, Password and Database.
credentials = pd.read_csv('../../credentials/credentials.csv')

database_adress = credentials['Host'].iloc[0]
database_user = credentials['User'].iloc[0]
database_password = credentials['Password'].iloc[0]
database_name = credentials['Database'].iloc[0]

# Percent-encode user and password so characters such as '@', ':', '/' or '%'
# are not mistaken for URL delimiters when SQLAlchemy parses the string.
# str() guards against pandas having parsed a purely numeric value as a number.
# NOTE: the CSV is expected to hold the raw (not already URL-encoded) values.
escaped_user = urllib.parse.quote(str(database_user), safe='')
escaped_password = urllib.parse.quote(str(database_password), safe='')

db_connection_str = f"mysql+pymysql://{escaped_user}:{escaped_password}@{database_adress}/{database_name}"
db_connection = create_engine(db_connection_str)

In [5]:
#Check for local copy of data - if not present, download from database

# Ratings whose user has exactly one row in the ratings table, ordered by
# movieId. A local CSV copy is used when present; otherwise the data is
# fetched from the database and cached.
if exists('ratings_single_account.csv'):
    ratings_single_account = pd.read_csv('ratings_single_account.csv')
else:
    ratings_single_account = pd.read_sql('SELECT * FROM ratings as r WHERE (SELECT COUNT(*) FROM ratings as ra WHERE r.userId = ra.userId) = 1', con=db_connection)
    # Renumber the rows after sorting so a fresh download carries the same
    # 0..n-1 positional index as the cached copy read back from CSV (the CSV is
    # written without the index, so the old labels would be lost there anyway).
    ratings_single_account = ratings_single_account.sort_values(by=['movieId']).reset_index(drop=True)
    ratings_single_account.to_csv('ratings_single_account.csv', index=False)

# Per-movie number of ratings that come from single-rating accounts, together
# with the movie's title and release year, ordered by that number (descending).
# A local CSV copy is used when present; otherwise it is built and cached.
if exists('movies_single_account.csv'):
    movies_single_account = pd.read_csv('movies_single_account.csv')
else:
    # Count rows per movieId with groupby instead of walking the sorted rows by
    # hand: this needs no placeholder first entry, does not depend on the frame
    # having a positional index, and cannot lose the final movie.
    movies_ratings_from_user_with_only_one_rating = (
        ratings_single_account.groupby('movieId')
        .size()
        .sort_values(ascending=False, kind='stable')
        .rename_axis('Movie Id')
        .reset_index(name='Amount')
    )
    movies_ratings_from_user_with_only_one_rating['Movie Id'] = (
        movies_ratings_from_user_with_only_one_rating['Movie Id'].astype(int)
    )

    #Get relationship table between movieId and movie title
    movies_ids_names = pd.read_sql('SELECT movieId, title, release_year FROM movies', con=db_connection)

    # Attach title and release year with a single left join rather than one
    # DataFrame.query per row. Only the first entry per movieId is used, and a
    # movieId that is absent from the movies table yields missing values
    # instead of raising.
    movie_details = movies_ids_names.drop_duplicates(subset='movieId')
    movies_ratings_from_user_with_only_one_rating = (
        movies_ratings_from_user_with_only_one_rating
        .merge(movie_details, how='left', left_on='Movie Id', right_on='movieId')
        .drop(columns='movieId')
    )

    movies_ratings_from_user_with_only_one_rating.to_csv('movies_single_account.csv', index=False)
    # Read the file back so the frame matches what the cached branch produces.
    movies_single_account = pd.read_csv('movies_single_account.csv')

# Rows from the ratings table (the query caps the result at 10000 rows).
# The database is only queried when no local CSV copy exists; the result is
# then written to disk for later runs.
if not exists('all_ratings.csv'):
    all_ratings = pd.read_sql('SELECT * FROM ratings limit 10000', con=db_connection)
    all_ratings.to_csv('all_ratings.csv', index=False)
else:
    all_ratings = pd.read_csv('all_ratings.csv')