## DBS Movie DB Anomaly Detection

In [62]:
import numpy as np
import mysql.connector as mysql
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly import tools
import warnings
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from os.path import exists
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Initialize the database connection.
# The credentials CSV must provide the columns Host, User, Password and
# Database; only its first row is used.
from urllib.parse import quote_plus

credentials = pd.read_csv('../../credentials/credentials.csv')

database_adress = credentials['Host'].iloc[0]
database_user = credentials['User'].iloc[0]
database_password = credentials['Password'].iloc[0]
database_name = credentials['Database'].iloc[0]

# User name and password are percent-encoded so that characters with a special
# meaning inside a URL (e.g. '@', ':', '/', '#') cannot corrupt the connection
# string. str() guards against pandas having parsed a purely numeric value.
db_connection_str = (
    f"mysql+pymysql://{quote_plus(str(database_user))}:{quote_plus(str(database_password))}"
    f"@{database_adress}/{database_name}"
)
db_connection = create_engine(db_connection_str)

## Import

In [7]:
# Check for a local copy of the data - if not present, download from database.
# ratings_single_account holds the ratings of users that have exactly one row
# in the ratings table.

if not exists('ratings_single_account.csv'):
    single_rating_sql = 'SELECT * FROM ratings as r WHERE (SELECT COUNT(*) FROM ratings as ra WHERE r.userId = ra.userId) = 1'
    # Sort by movie before caching so the CSV is written in movieId order.
    ratings_single_account = pd.read_sql(single_rating_sql, con=db_connection).sort_values(by=['movieId'])
    ratings_single_account.to_csv('ratings_single_account.csv', index=False)
else:
    ratings_single_account = pd.read_csv('ratings_single_account.csv')

# Number of single-account ratings per movie, enriched with title and release
# year. A local CSV copy is used when present; otherwise the table is built
# from ratings_single_account and the movies table, then cached.
if exists('movies_single_account.csv'):
    movies_single_account = pd.read_csv('movies_single_account.csv')
else:
    # value_counts() tallies every movie (including the last one in the frame)
    # independent of row order or index labels, and returns the result already
    # sorted by count in descending order.
    movies_ratings_from_user_with_only_one_rating = (
        ratings_single_account['movieId']
        .value_counts()
        .rename_axis('Movie Id')
        .reset_index(name='Amount')
        .astype({'Movie Id': 'int64', 'Amount': 'int64'})
    )

    #Get relationship table between movieId and movie title
    movies_ids_names = pd.read_sql('SELECT movieId, title, release_year FROM movies', con=db_connection)

    # A single left join replaces the per-row lookups. Movies missing from the
    # movies table keep their count and get empty title/release_year instead of
    # aborting the whole run; drop_duplicates keeps the first match per movieId.
    movies_ratings_from_user_with_only_one_rating = (
        movies_ratings_from_user_with_only_one_rating
        .merge(
            movies_ids_names.drop_duplicates(subset='movieId'),
            how='left',
            left_on='Movie Id',
            right_on='movieId',
        )
        .drop(columns='movieId')
    )

    movies_ratings_from_user_with_only_one_rating.to_csv('movies_single_account.csv', index=False)
    # Re-read the cache so the frame looks the same as on a cached run.
    movies_single_account = pd.read_csv('movies_single_account.csv')

# Complete ratings table: downloaded once, afterwards served from the local CSV.
if not exists('all_ratings.csv'):
    all_ratings = pd.read_sql('SELECT * FROM ratings', con=db_connection)
    all_ratings.to_csv('all_ratings.csv', index=False)
else:
    all_ratings = pd.read_csv('all_ratings.csv')

### (1) Rating Scatter over time (Accounts with single ratings vs. normal ratings)

In [31]:
def ratings_scatter(n_top=1):
    """
    For each of the first n_top rows of movies_single_account, show two
    rating-over-time scatter plots: one built from ratings_single_account and
    one built from all_ratings.
    """
    date_window = ['1998-01-01', '2019-01-01']

    for _, movie in movies_single_account.head(n_top).iterrows():
        movie_id = movie['Movie Id']
        movie_title = movie['title']
        movie_filter = f'movieId == {movie_id}'

        # sort_values returns a new frame, so the source tables stay untouched.
        single_account_points = ratings_single_account.query(movie_filter).sort_values(by=['rating_date'])
        all_points = all_ratings.query(movie_filter).sort_values(by=['rating_date'])

        figures = [
            px.scatter(
                single_account_points,
                x='rating_date',
                y='rating',
                title=f'Rating of movie <b>{movie_title}</b> over time (Accounts with single ratings)',
                range_x=date_window,
                hover_data=['userId'],
            ),
            px.scatter(
                all_points,
                x='rating_date',
                y='rating',
                title=f'Rating of movie <b>{movie_title}</b> over time (Normal ratings)',
                range_x=date_window,
                hover_data=['userId'],
            ),
        ]

        # Single-account figure first, then the figure over all ratings.
        for figure in figures:
            figure.show()

ratings_scatter(n_top=1)

### (2) Analyze Single Ratings over time (year)

In [148]:
# rating_date must be a datetime column for the .dt accessors and date
# comparisons used further down, so parse it once here.
ratings_single_account['rating_date'] = ratings_single_account['rating_date'].pipe(pd.to_datetime)


def plot_hist_ratings(r_data):
    """
    Plot a histogram of the number of ratings per month.

    Call this once per year with the corresponding rating data. r_data needs a
    datetime column 'rating_date' and a column 'rating'; the year shown in the
    title is taken from the first row. The figure is displayed directly and
    nothing is returned. When r_data has no rows, nothing is plotted.
    """
    # A year without any ratings has no first row to read the title year from.
    if r_data.empty:
        return

    year = r_data.rating_date.dt.year.iloc[0]
    fig2 = px.histogram(
        r_data,
        x='rating_date',
        y='rating',
        histfunc='count',
        title=f'(Accounts with single ratings) Rating histogram for year <b>{year}</b>',
        range_y=[0, 200],
        text_auto=True,
    )
    # axis.title.font is the supported replacement for the deprecated
    # axis.titlefont attribute.
    fig2.update_layout(
        xaxis=dict(title=dict(text='time in months', font=dict(size=16))),
        yaxis=dict(title=dict(text='Amount of Ratings', font=dict(size=16))),
    )
    # One bin and one tick label per calendar month.
    fig2.update_traces(xbins_size="M1")
    fig2.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
    fig2.show()

#Set year range here (end year is exclusive)
year_range = np.arange(2014, 2019)

# One histogram per calendar year, filtered on the full first-to-last-second span.
for year in year_range:
    year_filter = f'rating_date >= "{year}-01-01 00:00:00" & rating_date <= "{year}-12-31 23:59:59"'
    ratings_single_account_year = ratings_single_account.query(year_filter)
    plot_hist_ratings(ratings_single_account_year)



### (3) Favorite hours for Single Ratings

In [151]:
# Split every rating timestamp into its hour, minute and second component and
# plot one count histogram per component.
ratings_single_account_onlytime = ratings_single_account.copy()
ratings_single_account_onlytime['hour'] = ratings_single_account_onlytime['rating_date'].dt.hour.astype(int)
ratings_single_account_onlytime['minute'] = ratings_single_account_onlytime['rating_date'].dt.minute.astype(int)
ratings_single_account_onlytime['second'] = ratings_single_account_onlytime['rating_date'].dt.second.astype(int)

for time_unit in ['hour', 'minute', 'second']:
    fig3 = px.histogram(
        ratings_single_account_onlytime,
        x=time_unit,
        y='rating',
        histfunc='count',
        title=f'Favorite Timestamp for Single Ratings: <b>{time_unit}</b>',
        text_auto=True,
    )
    # axis.title.font is the supported replacement for the deprecated
    # axis.titlefont attribute.
    fig3.update_layout(
        xaxis=dict(title=dict(text=f'time [{time_unit}]', font=dict(size=16))),
        yaxis=dict(title=dict(text='Amount of Ratings', font=dict(size=16))),
        margin_pad=0,
    )
    # The x axis holds plain integers (0-23 / 0-59), not dates, so the
    # month-based tick settings of the yearly histogram do not apply here;
    # use one tick per integer value instead.
    fig3.update_xaxes(showgrid=True, dtick=1)
    fig3.show()
